2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * PACKET - implements raw packet sockets.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
42 * Johann Baudy : Added TX RING.
43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
55 #include <linux/types.h>
57 #include <linux/capability.h>
58 #include <linux/fcntl.h>
59 #include <linux/socket.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_packet.h>
64 #include <linux/wireless.h>
65 #include <linux/kernel.h>
66 #include <linux/kmod.h>
67 #include <linux/slab.h>
68 #include <linux/vmalloc.h>
69 #include <net/net_namespace.h>
71 #include <net/protocol.h>
72 #include <linux/skbuff.h>
74 #include <linux/errno.h>
75 #include <linux/timer.h>
76 #include <linux/uaccess.h>
77 #include <asm/ioctls.h>
79 #include <asm/cacheflush.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/poll.h>
84 #include <linux/module.h>
85 #include <linux/init.h>
86 #include <linux/mutex.h>
87 #include <linux/if_vlan.h>
88 #include <linux/virtio_net.h>
89 #include <linux/errqueue.h>
90 #include <linux/net_tstamp.h>
91 #include <linux/percpu.h>
93 #include <net/inet_common.h>
95 #include <linux/bpf.h>
96 #include <net/compat.h>
102    - if device has no dev->hard_header routine, it adds and removes the ll header
103      inside itself. In this case the ll header is invisible outside of the device,
104      but higher levels still should reserve dev->hard_header_len.
105      Some devices are clever enough to reallocate the skb when the header
106      will not fit into the reserved space (tunnel); others are silly
108    - packet sockets receive packets with the ll header pulled,
109      so SOCK_RAW should push it back.
114 Incoming, dev->hard_header!=NULL
115 mac_header -> ll header
118 Outgoing, dev->hard_header!=NULL
119 mac_header -> ll header
122 Incoming, dev->hard_header==NULL
123    mac_header -> UNKNOWN position. It is very likely that it points to the ll
124                  header. PPP does this, which is wrong, because it introduces
125                  asymmetry between rx and tx paths.
128 Outgoing, dev->hard_header==NULL
129 mac_header -> data. ll header is still not built!
133 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
139 dev->hard_header != NULL
140 mac_header -> ll header
143 dev->hard_header == NULL (ll header is added by device, we cannot control it)
147    We should set nh.raw on output to the correct position,
148 packet classifier depends on it.
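/* Illustration of the above from the user-space side (a minimal sketch, not
 * part of this file; assumes an Ethernet device and CAP_NET_RAW): with
 * SOCK_RAW the received buffer starts with the ll (Ethernet) header, while
 * SOCK_DGRAM delivers the payload only and reports the ll info via
 * sockaddr_ll.
 *
 *   #include <sys/socket.h>
 *   #include <linux/if_packet.h>
 *   #include <linux/if_ether.h>
 *   #include <arpa/inet.h>
 *
 *   int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *   char buf[2048];
 *   struct sockaddr_ll from;
 *   socklen_t alen = sizeof(from);
 *   ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *                        (struct sockaddr *)&from, &alen);
 *   // buf[0..13] is the Ethernet header here; a SOCK_DGRAM socket would
 *   // deliver the same frame starting at the network header.
 */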
151 /* Private packet socket structures. */
153 /* identical to struct packet_mreq except it has
154 * a longer address field.
156 struct packet_mreq_max {
158 unsigned short mr_type;
159 unsigned short mr_alen;
160 unsigned char mr_address[MAX_ADDR_LEN];
164 struct tpacket_hdr *h1;
165 struct tpacket2_hdr *h2;
166 struct tpacket3_hdr *h3;
170 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
171 int closing, int tx_ring);
173 #define V3_ALIGNMENT (8)
175 #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
177 #define BLK_PLUS_PRIV(sz_of_priv) \
178 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
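/* Illustrative arithmetic: with tp_sizeof_priv = 13, ALIGN(13, V3_ALIGNMENT)
 * is 16, so the first packet of a block starts BLK_HDR_LEN + 16 bytes in;
 * prb_open_block() records exactly this value in BLOCK_O2FP().
 */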
180 #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181 #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182 #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183 #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184 #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186 #define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
189 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
190 struct packet_type *pt, struct net_device *orig_dev);
192 static void *packet_previous_frame(struct packet_sock *po,
193 struct packet_ring_buffer *rb,
195 static void packet_increment_head(struct packet_ring_buffer *buff);
196 static int prb_curr_blk_in_use(struct tpacket_block_desc *);
197 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
198 struct packet_sock *);
199 static void prb_retire_current_block(struct tpacket_kbdq_core *,
200 struct packet_sock *, unsigned int status);
201 static int prb_queue_frozen(struct tpacket_kbdq_core *);
202 static void prb_open_block(struct tpacket_kbdq_core *,
203 struct tpacket_block_desc *);
204 static void prb_retire_rx_blk_timer_expired(unsigned long);
205 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
206 static void prb_init_blk_timer(struct packet_sock *,
207 struct tpacket_kbdq_core *,
208 void (*func) (unsigned long));
209 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
210 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
211 struct tpacket3_hdr *);
212 static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
213 struct tpacket3_hdr *);
214 static void packet_flush_mclist(struct sock *sk);
215 static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb);
217 struct packet_skb_cb {
219 struct sockaddr_pkt pkt;
221 /* Trick: alias skb original length with
222 * ll.sll_family and ll.protocol in order
225 unsigned int origlen;
226 struct sockaddr_ll ll;
231 #define vio_le() virtio_legacy_is_little_endian()
233 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
235 #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
236 #define GET_PBLOCK_DESC(x, bid) \
237 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
238 #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
239 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
240 #define GET_NEXT_PRB_BLK_NUM(x) \
241 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
242 ((x)->kactive_blk_num+1) : 0)
244 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
245 static void __fanout_link(struct sock *sk, struct packet_sock *po);
247 static int packet_direct_xmit(struct sk_buff *skb)
249 struct net_device *dev = skb->dev;
250 struct sk_buff *orig_skb = skb;
251 struct netdev_queue *txq;
252 int ret = NETDEV_TX_BUSY;
254 if (unlikely(!netif_running(dev) ||
255 !netif_carrier_ok(dev)))
258 skb = validate_xmit_skb_list(skb, dev);
262 packet_pick_tx_queue(dev, skb);
263 txq = skb_get_tx_queue(dev, skb);
267 HARD_TX_LOCK(dev, txq, smp_processor_id());
268 if (!netif_xmit_frozen_or_drv_stopped(txq))
269 ret = netdev_start_xmit(skb, dev, txq, false);
270 HARD_TX_UNLOCK(dev, txq);
274 if (!dev_xmit_complete(ret))
279 atomic_long_inc(&dev->tx_dropped);
281 return NET_XMIT_DROP;
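/* packet_direct_xmit() is only used when the socket has opted out of the
 * qdisc layer; from user space that is the PACKET_QDISC_BYPASS option
 * (a sketch, assuming fd is a PF_PACKET socket):
 *
 *   int one = 1;
 *   setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 *
 * po->xmit then points at packet_direct_xmit(), which is what
 * packet_use_direct_xmit() below tests for.
 */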
284 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
286 struct net_device *dev;
289 dev = rcu_dereference(po->cached_dev);
297 static void packet_cached_dev_assign(struct packet_sock *po,
298 struct net_device *dev)
300 rcu_assign_pointer(po->cached_dev, dev);
303 static void packet_cached_dev_reset(struct packet_sock *po)
305 RCU_INIT_POINTER(po->cached_dev, NULL);
308 static bool packet_use_direct_xmit(const struct packet_sock *po)
310 return po->xmit == packet_direct_xmit;
313 static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
315 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
318 static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
320 const struct net_device_ops *ops = dev->netdev_ops;
323 if (ops->ndo_select_queue) {
324 queue_index = ops->ndo_select_queue(dev, skb, NULL,
325 __packet_pick_tx_queue);
326 queue_index = netdev_cap_txqueue(dev, queue_index);
328 queue_index = __packet_pick_tx_queue(dev, skb);
331 skb_set_queue_mapping(skb, queue_index);
334 /* __register_prot_hook must be invoked through register_prot_hook
335 * or from a context in which asynchronous accesses to the packet
336  * socket are not possible (packet_create()).
338 static void __register_prot_hook(struct sock *sk)
340 struct packet_sock *po = pkt_sk(sk);
344 __fanout_link(sk, po);
346 dev_add_pack(&po->prot_hook);
353 static void register_prot_hook(struct sock *sk)
355 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
356 __register_prot_hook(sk);
359 /* If the sync parameter is true, we will temporarily drop
360 * the po->bind_lock and do a synchronize_net to make sure no
361 * asynchronous packet processing paths still refer to the elements
362 * of po->prot_hook. If the sync parameter is false, it is the
363  * caller's responsibility to take care of this.
365 static void __unregister_prot_hook(struct sock *sk, bool sync)
367 struct packet_sock *po = pkt_sk(sk);
369 lockdep_assert_held_once(&po->bind_lock);
374 __fanout_unlink(sk, po);
376 __dev_remove_pack(&po->prot_hook);
381 spin_unlock(&po->bind_lock);
383 spin_lock(&po->bind_lock);
387 static void unregister_prot_hook(struct sock *sk, bool sync)
389 struct packet_sock *po = pkt_sk(sk);
392 __unregister_prot_hook(sk, sync);
395 static inline struct page * __pure pgv_to_page(void *addr)
397 if (is_vmalloc_addr(addr))
398 return vmalloc_to_page(addr);
399 return virt_to_page(addr);
402 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
404 union tpacket_uhdr h;
406 /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */
409 switch (po->tp_version) {
411 WRITE_ONCE(h.h1->tp_status, status);
412 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
415 WRITE_ONCE(h.h2->tp_status, status);
416 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
419 WRITE_ONCE(h.h3->tp_status, status);
420 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
423 WARN(1, "TPACKET version not supported.\n");
430 static int __packet_get_status(struct packet_sock *po, void *frame)
432 union tpacket_uhdr h;
436 /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */
439 switch (po->tp_version) {
441 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
442 return READ_ONCE(h.h1->tp_status);
444 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
445 return READ_ONCE(h.h2->tp_status);
447 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
448 return READ_ONCE(h.h3->tp_status);
450 WARN(1, "TPACKET version not supported.\n");
456 static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
459 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
462 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
463 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
464 return TP_STATUS_TS_RAW_HARDWARE;
466 if (ktime_to_timespec_cond(skb->tstamp, ts))
467 return TP_STATUS_TS_SOFTWARE;
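/* Which clock ends up in the ring is selected by po->tp_tstamp, which user
 * space sets with the PACKET_TIMESTAMP socket option; the TP_STATUS_TS_*
 * bit returned above tells the reader which clock was actually used.
 * A sketch (assuming fd is a PF_PACKET socket):
 *
 *   #include <linux/net_tstamp.h>
 *
 *   int req = SOF_TIMESTAMPING_RAW_HARDWARE;  // or SOF_TIMESTAMPING_SOFTWARE
 *   setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 */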
472 static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
475 union tpacket_uhdr h;
479 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
483 switch (po->tp_version) {
485 h.h1->tp_sec = ts.tv_sec;
486 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
489 h.h2->tp_sec = ts.tv_sec;
490 h.h2->tp_nsec = ts.tv_nsec;
493 h.h3->tp_sec = ts.tv_sec;
494 h.h3->tp_nsec = ts.tv_nsec;
497 WARN(1, "TPACKET version not supported.\n");
501 /* one flush is safe, as both fields always lie on the same cacheline */
502 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
508 static void *packet_lookup_frame(struct packet_sock *po,
509 struct packet_ring_buffer *rb,
510 unsigned int position,
513 unsigned int pg_vec_pos, frame_offset;
514 union tpacket_uhdr h;
516 pg_vec_pos = position / rb->frames_per_block;
517 frame_offset = position % rb->frames_per_block;
519 h.raw = rb->pg_vec[pg_vec_pos].buffer +
520 (frame_offset * rb->frame_size);
522 if (status != __packet_get_status(po, h.raw))
528 static void *packet_current_frame(struct packet_sock *po,
529 struct packet_ring_buffer *rb,
532 return packet_lookup_frame(po, rb, rb->head, status);
535 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
537 del_timer_sync(&pkc->retire_blk_timer);
540 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
541 struct sk_buff_head *rb_queue)
543 struct tpacket_kbdq_core *pkc;
545 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
547 spin_lock_bh(&rb_queue->lock);
548 pkc->delete_blk_timer = 1;
549 spin_unlock_bh(&rb_queue->lock);
551 prb_del_retire_blk_timer(pkc);
554 static void prb_init_blk_timer(struct packet_sock *po,
555 struct tpacket_kbdq_core *pkc,
556 void (*func) (unsigned long))
558 init_timer(&pkc->retire_blk_timer);
559 pkc->retire_blk_timer.data = (long)po;
560 pkc->retire_blk_timer.function = func;
561 pkc->retire_blk_timer.expires = jiffies;
564 static void prb_setup_retire_blk_timer(struct packet_sock *po)
566 struct tpacket_kbdq_core *pkc;
568 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
569 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
572 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
573 int blk_size_in_bytes)
575 struct net_device *dev;
576 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
577 struct ethtool_link_ksettings ecmd;
581 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
582 if (unlikely(!dev)) {
584 return DEFAULT_PRB_RETIRE_TOV;
586 err = __ethtool_get_link_ksettings(dev, &ecmd);
590 * If the link speed is so slow you don't really
591 	 * need to worry about perf anyway
593 if (ecmd.base.speed < SPEED_1000 ||
594 ecmd.base.speed == SPEED_UNKNOWN) {
595 return DEFAULT_PRB_RETIRE_TOV;
598 div = ecmd.base.speed / 1000;
601 return DEFAULT_PRB_RETIRE_TOV;
603 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
615 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
616 union tpacket_req_u *req_u)
618 p1->feature_req_word = req_u->req3.tp_feature_req_word;
621 static void init_prb_bdqc(struct packet_sock *po,
622 struct packet_ring_buffer *rb,
624 union tpacket_req_u *req_u)
626 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
627 struct tpacket_block_desc *pbd;
629 memset(p1, 0x0, sizeof(*p1));
631 p1->knxt_seq_num = 1;
633 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
634 p1->pkblk_start = pg_vec[0].buffer;
635 p1->kblk_size = req_u->req3.tp_block_size;
636 p1->knum_blocks = req_u->req3.tp_block_nr;
637 p1->hdrlen = po->tp_hdrlen;
638 p1->version = po->tp_version;
639 p1->last_kactive_blk_num = 0;
640 po->stats.stats3.tp_freeze_q_cnt = 0;
641 if (req_u->req3.tp_retire_blk_tov)
642 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
644 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
645 req_u->req3.tp_block_size);
646 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
647 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
649 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
650 prb_init_ft_ops(p1, req_u);
651 prb_setup_retire_blk_timer(po);
652 prb_open_block(p1, pbd);
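/* The req_u->req3 values consumed above come straight from user space's
 * PACKET_RX_RING setsockopt after selecting TPACKET_V3. A minimal sketch
 * (sizes are illustrative only):
 *
 *   struct tpacket_req3 req = {
 *       .tp_block_size       = 1 << 20,            // 1 MiB per block
 *       .tp_block_nr         = 64,
 *       .tp_frame_size       = 2048,
 *       .tp_frame_nr         = ((1 << 20) / 2048) * 64,
 *       .tp_retire_blk_tov   = 60,  // ms; 0 = let prb_calc_retire_blk_tmo() pick
 *       .tp_sizeof_priv      = 0,
 *       .tp_feature_req_word = 0,
 *   };
 *   int ver = TPACKET_V3;
 *   setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *   setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *   void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *                     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */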
655 /* Do NOT update the last_blk_num first.
656 * Assumes sk_buff_head lock is held.
658 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
660 mod_timer(&pkc->retire_blk_timer,
661 jiffies + pkc->tov_in_jiffies);
662 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
667 * 1) We refresh the timer only when we open a block.
668 * By doing this we don't waste cycles refreshing the timer
669  * on a packet-by-packet basis.
671 * With a 1MB block-size, on a 1Gbps line, it will take
672 * i) ~8 ms to fill a block + ii) memcpy etc.
673 * In this cut we are not accounting for the memcpy time.
675 * So, if the user sets the 'tmo' to 10ms then the timer
676 * will never fire while the block is still getting filled
677 * (which is what we want). However, the user could choose
678 * to close a block early and that's fine.
680 * But when the timer does fire, we check whether or not to refresh it.
681 * Since the tmo granularity is in msecs, it is not too expensive
682  * to refresh the timer, let's say every '8' msecs.
683 * Either the user can set the 'tmo' or we can derive it based on
684 * a) line-speed and b) block-size.
685 * prb_calc_retire_blk_tmo() calculates the tmo.
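 *
 * Back-of-the-envelope for the figure above (ignoring per-packet overhead):
 * a 1 MiB block is 2^20 * 8 ~= 8.4 Mbit of payload, and
 * 8.4 Mbit / 1000 Mbit/s ~= 8 ms to fill at 1 Gbps.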
688 static void prb_retire_rx_blk_timer_expired(unsigned long data)
690 struct packet_sock *po = (struct packet_sock *)data;
691 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
693 struct tpacket_block_desc *pbd;
695 spin_lock(&po->sk.sk_receive_queue.lock);
697 frozen = prb_queue_frozen(pkc);
698 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
700 if (unlikely(pkc->delete_blk_timer))
703 /* We only need to plug the race when the block is partially filled.
705 * lock(); increment BLOCK_NUM_PKTS; unlock()
706 * copy_bits() is in progress ...
707 * timer fires on other cpu:
708 * we can't retire the current block because copy_bits
712 if (BLOCK_NUM_PKTS(pbd)) {
713 while (atomic_read(&pkc->blk_fill_in_prog)) {
714 /* Waiting for skb_copy_bits to finish... */
719 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
721 if (!BLOCK_NUM_PKTS(pbd)) {
722 /* An empty block. Just refresh the timer. */
725 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
726 if (!prb_dispatch_next_block(pkc, po))
731 /* Case 1. Queue was frozen because user-space was
734 if (prb_curr_blk_in_use(pbd)) {
736 * Ok, user-space is still behind.
737 * So just refresh the timer.
741 		/* Case 2. Queue was frozen, user-space caught up,
742 		 * now the link went idle && the timer fired.
743 		 * We don't have a block to close. So we open this
744 		 * block and restart the timer.
745 		 * Opening a block thaws the queue and restarts the timer.
746 		 * Thawing/timer-refresh is a side effect.
748 prb_open_block(pkc, pbd);
755 _prb_refresh_rx_retire_blk_timer(pkc);
758 spin_unlock(&po->sk.sk_receive_queue.lock);
761 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
762 struct tpacket_block_desc *pbd1, __u32 status)
764 /* Flush everything minus the block header */
766 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
771 	/* Skip the block header (we know header WILL fit in 4K) */
774 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
775 for (; start < end; start += PAGE_SIZE)
776 flush_dcache_page(pgv_to_page(start));
781 /* Now update the block status. */
783 BLOCK_STATUS(pbd1) = status;
785 /* Flush the block header */
787 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
789 flush_dcache_page(pgv_to_page(start));
799 * 2) Increment active_blk_num
801  * Note: We DON'T refresh the timer on purpose.
802 * Because almost always the next block will be opened.
804 static void prb_close_block(struct tpacket_kbdq_core *pkc1,
805 struct tpacket_block_desc *pbd1,
806 struct packet_sock *po, unsigned int stat)
808 __u32 status = TP_STATUS_USER | stat;
810 struct tpacket3_hdr *last_pkt;
811 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
812 struct sock *sk = &po->sk;
814 if (po->stats.stats3.tp_drops)
815 status |= TP_STATUS_LOSING;
817 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
818 last_pkt->tp_next_offset = 0;
820 /* Get the ts of the last pkt */
821 if (BLOCK_NUM_PKTS(pbd1)) {
822 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
823 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
825 /* Ok, we tmo'd - so get the current time.
827 * It shouldn't really happen as we don't close empty
828 * blocks. See prb_retire_rx_blk_timer_expired().
832 h1->ts_last_pkt.ts_sec = ts.tv_sec;
833 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
838 /* Flush the block */
839 prb_flush_block(pkc1, pbd1, status);
841 sk->sk_data_ready(sk);
843 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
846 static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
848 pkc->reset_pending_on_curr_blk = 0;
852 * Side effect of opening a block:
854 * 1) prb_queue is thawed.
855 * 2) retire_blk_timer is refreshed.
858 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
859 struct tpacket_block_desc *pbd1)
862 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
866 	/* We could have just memset this but we would lose the
867 	 * flexibility of making the priv area sticky
870 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
871 BLOCK_NUM_PKTS(pbd1) = 0;
872 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
876 h1->ts_first_pkt.ts_sec = ts.tv_sec;
877 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
879 pkc1->pkblk_start = (char *)pbd1;
880 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
882 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
883 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
885 pbd1->version = pkc1->version;
886 pkc1->prev = pkc1->nxt_offset;
887 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
889 prb_thaw_queue(pkc1);
890 _prb_refresh_rx_retire_blk_timer(pkc1);
896 * Queue freeze logic:
897 * 1) Assume tp_block_nr = 8 blocks.
898 * 2) At time 't0', user opens Rx ring.
899 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
900 * 4) user-space is either sleeping or processing block '0'.
901 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
902  *    it will close block-7, loop around and try to fill block '0'.
904 * __packet_lookup_frame_in_block
905 * prb_retire_current_block()
906 * prb_dispatch_next_block()
907 * |->(BLOCK_STATUS == USER) evaluates to true
908 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
909 * 6) Now there are two cases:
910 * 6.1) Link goes idle right after the queue is frozen.
911 * But remember, the last open_block() refreshed the timer.
912  *      When this timer expires, it will refresh itself so that we can
913  *      re-open block-0 in the near future.
914 * 6.2) Link is busy and keeps on receiving packets. This is a simple
915 * case and __packet_lookup_frame_in_block will check if block-0
916 * is free and can now be re-used.
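/* Seen from user space, a frozen queue simply means the current block stays
 * in TP_STATUS_USER until it is handed back. A consuming-side sketch
 * (assumes the mmap()ed TPACKET_V3 ring from the earlier example; i is the
 * index of the block being processed):
 *
 *   #include <poll.h>
 *
 *   struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *   struct tpacket_block_desc *bd = (struct tpacket_block_desc *)
 *           ((char *)ring + (size_t)i * req.tp_block_size);
 *
 *   while (!(bd->hdr.bh1.block_status & TP_STATUS_USER))
 *           poll(&pfd, 1, -1);          // wait for the kernel to close the block
 *   // ... walk bd->hdr.bh1.num_pkts packets, the first one at
 *   // bd->hdr.bh1.offset_to_first_pkt, chained via tp_next_offset ...
 *   bd->hdr.bh1.block_status = TP_STATUS_KERNEL;  // hand it back; thaws the queue
 */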
918 static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
919 struct packet_sock *po)
921 pkc->reset_pending_on_curr_blk = 1;
922 po->stats.stats3.tp_freeze_q_cnt++;
925 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
928 * If the next block is free then we will dispatch it
929 * and return a good offset.
930 * Else, we will freeze the queue.
931 * So, caller must check the return value.
933 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
934 struct packet_sock *po)
936 struct tpacket_block_desc *pbd;
940 /* 1. Get current block num */
941 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
943 /* 2. If this block is currently in_use then freeze the queue */
944 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
945 prb_freeze_queue(pkc, po);
951 * open this block and return the offset where the first packet
952 * needs to get stored.
954 prb_open_block(pkc, pbd);
955 return (void *)pkc->nxt_offset;
958 static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
959 struct packet_sock *po, unsigned int status)
961 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
963 /* retire/close the current block */
964 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
966 * Plug the case where copy_bits() is in progress on
967 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
968 * have space to copy the pkt in the current block and
969 * called prb_retire_current_block()
971 * We don't need to worry about the TMO case because
972 * the timer-handler already handled this case.
974 if (!(status & TP_STATUS_BLK_TMO)) {
975 while (atomic_read(&pkc->blk_fill_in_prog)) {
976 /* Waiting for skb_copy_bits to finish... */
980 prb_close_block(pkc, pbd, po, status);
985 static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
987 return TP_STATUS_USER & BLOCK_STATUS(pbd);
990 static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
992 return pkc->reset_pending_on_curr_blk;
995 static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
996 __releases(&pkc->blk_fill_in_prog_lock)
998 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
999 atomic_dec(&pkc->blk_fill_in_prog);
1002 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
1003 struct tpacket3_hdr *ppd)
1005 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
1008 static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
1009 struct tpacket3_hdr *ppd)
1011 ppd->hv1.tp_rxhash = 0;
1014 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
1015 struct tpacket3_hdr *ppd)
1017 if (skb_vlan_tag_present(pkc->skb)) {
1018 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
1019 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1020 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
1022 ppd->hv1.tp_vlan_tci = 0;
1023 ppd->hv1.tp_vlan_tpid = 0;
1024 ppd->tp_status = TP_STATUS_AVAILABLE;
1028 static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
1029 struct tpacket3_hdr *ppd)
1031 ppd->hv1.tp_padding = 0;
1032 prb_fill_vlan_info(pkc, ppd);
1034 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1035 prb_fill_rxhash(pkc, ppd);
1037 prb_clear_rxhash(pkc, ppd);
1040 static void prb_fill_curr_block(char *curr,
1041 struct tpacket_kbdq_core *pkc,
1042 struct tpacket_block_desc *pbd,
1044 __acquires(&pkc->blk_fill_in_prog_lock)
1046 struct tpacket3_hdr *ppd;
1048 ppd = (struct tpacket3_hdr *)curr;
1049 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1051 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1052 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1053 BLOCK_NUM_PKTS(pbd) += 1;
1054 atomic_inc(&pkc->blk_fill_in_prog);
1055 prb_run_all_ft_ops(pkc, ppd);
1058 /* Assumes caller has the sk->rx_queue.lock */
1059 static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1060 struct sk_buff *skb,
1065 struct tpacket_kbdq_core *pkc;
1066 struct tpacket_block_desc *pbd;
1069 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1070 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1072 /* Queue is frozen when user space is lagging behind */
1073 if (prb_queue_frozen(pkc)) {
1075 * Check if that last block which caused the queue to freeze,
1076 * is still in_use by user-space.
1078 if (prb_curr_blk_in_use(pbd)) {
1079 /* Can't record this packet */
1083 * Ok, the block was released by user-space.
1084 * Now let's open that block.
1085 * opening a block also thaws the queue.
1086 * Thawing is a side effect.
1088 prb_open_block(pkc, pbd);
1093 curr = pkc->nxt_offset;
1095 end = (char *)pbd + pkc->kblk_size;
1097 /* first try the current block */
1098 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1099 prb_fill_curr_block(curr, pkc, pbd, len);
1100 return (void *)curr;
1103 /* Ok, close the current block */
1104 prb_retire_current_block(pkc, po, 0);
1106 /* Now, try to dispatch the next block */
1107 curr = (char *)prb_dispatch_next_block(pkc, po);
1109 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1110 prb_fill_curr_block(curr, pkc, pbd, len);
1111 return (void *)curr;
1115 	 * No free blocks are available. user_space hasn't caught up yet.
1116 * Queue was just frozen and now this packet will get dropped.
1121 static void *packet_current_rx_frame(struct packet_sock *po,
1122 struct sk_buff *skb,
1123 int status, unsigned int len)
1126 switch (po->tp_version) {
1129 curr = packet_lookup_frame(po, &po->rx_ring,
1130 po->rx_ring.head, status);
1133 return __packet_lookup_frame_in_block(po, skb, status, len);
1135 WARN(1, "TPACKET version not supported\n");
1141 static void *prb_lookup_block(struct packet_sock *po,
1142 struct packet_ring_buffer *rb,
1146 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1147 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1149 if (status != BLOCK_STATUS(pbd))
1154 static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1157 if (rb->prb_bdqc.kactive_blk_num)
1158 prev = rb->prb_bdqc.kactive_blk_num-1;
1160 prev = rb->prb_bdqc.knum_blocks-1;
1164 /* Assumes caller has held the rx_queue.lock */
1165 static void *__prb_previous_block(struct packet_sock *po,
1166 struct packet_ring_buffer *rb,
1169 unsigned int previous = prb_previous_blk_num(rb);
1170 return prb_lookup_block(po, rb, previous, status);
1173 static void *packet_previous_rx_frame(struct packet_sock *po,
1174 struct packet_ring_buffer *rb,
1177 if (po->tp_version <= TPACKET_V2)
1178 return packet_previous_frame(po, rb, status);
1180 return __prb_previous_block(po, rb, status);
1183 static void packet_increment_rx_head(struct packet_sock *po,
1184 struct packet_ring_buffer *rb)
1186 switch (po->tp_version) {
1189 return packet_increment_head(rb);
1192 WARN(1, "TPACKET version not supported.\n");
1198 static void *packet_previous_frame(struct packet_sock *po,
1199 struct packet_ring_buffer *rb,
1202 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1203 return packet_lookup_frame(po, rb, previous, status);
1206 static void packet_increment_head(struct packet_ring_buffer *buff)
1208 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1211 static void packet_inc_pending(struct packet_ring_buffer *rb)
1213 this_cpu_inc(*rb->pending_refcnt);
1216 static void packet_dec_pending(struct packet_ring_buffer *rb)
1218 this_cpu_dec(*rb->pending_refcnt);
1221 static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1223 unsigned int refcnt = 0;
1226 /* We don't use pending refcount in rx_ring. */
1227 if (rb->pending_refcnt == NULL)
1230 for_each_possible_cpu(cpu)
1231 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1236 static int packet_alloc_pending(struct packet_sock *po)
1238 po->rx_ring.pending_refcnt = NULL;
1240 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1241 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1247 static void packet_free_pending(struct packet_sock *po)
1249 free_percpu(po->tx_ring.pending_refcnt);
1252 #define ROOM_POW_OFF 2
1253 #define ROOM_NONE 0x0
1254 #define ROOM_LOW 0x1
1255 #define ROOM_NORMAL 0x2
1257 static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
1261 len = po->rx_ring.frame_max + 1;
1262 idx = po->rx_ring.head;
1264 idx += len >> pow_off;
1267 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1270 static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1274 len = po->rx_ring.prb_bdqc.knum_blocks;
1275 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1277 idx += len >> pow_off;
1280 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1283 static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1285 struct sock *sk = &po->sk;
1286 int ret = ROOM_NONE;
1288 if (po->prot_hook.func != tpacket_rcv) {
1289 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1290 - (skb ? skb->truesize : 0);
1291 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1299 if (po->tp_version == TPACKET_V3) {
1300 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1302 else if (__tpacket_v3_has_room(po, 0))
1305 if (__tpacket_has_room(po, ROOM_POW_OFF))
1307 else if (__tpacket_has_room(po, 0))
1314 static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1319 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1320 ret = __packet_rcv_has_room(po, skb);
1321 has_room = ret == ROOM_NORMAL;
1322 if (po->pressure == has_room)
1323 po->pressure = !has_room;
1324 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
1329 static void packet_sock_destruct(struct sock *sk)
1331 skb_queue_purge(&sk->sk_error_queue);
1333 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1334 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1336 if (!sock_flag(sk, SOCK_DEAD)) {
1337 pr_err("Attempt to release alive packet socket: %p\n", sk);
1341 sk_refcnt_debug_dec(sk);
1344 static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1346 u32 *history = po->rollover->history;
1350 rxhash = skb_get_hash(skb);
1351 for (i = 0; i < ROLLOVER_HLEN; i++)
1352 if (READ_ONCE(history[i]) == rxhash)
1355 victim = prandom_u32() % ROLLOVER_HLEN;
1357 /* Avoid dirtying the cache line if possible */
1358 if (READ_ONCE(history[victim]) != rxhash)
1359 WRITE_ONCE(history[victim], rxhash);
1361 return count > (ROLLOVER_HLEN >> 1);
1364 static unsigned int fanout_demux_hash(struct packet_fanout *f,
1365 struct sk_buff *skb,
1368 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1371 static unsigned int fanout_demux_lb(struct packet_fanout *f,
1372 struct sk_buff *skb,
1375 unsigned int val = atomic_inc_return(&f->rr_cur);
1380 static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1381 struct sk_buff *skb,
1384 return smp_processor_id() % num;
1387 static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1388 struct sk_buff *skb,
1391 return prandom_u32_max(num);
1394 static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1395 struct sk_buff *skb,
1396 unsigned int idx, bool try_self,
1399 struct packet_sock *po, *po_next, *po_skip = NULL;
1400 unsigned int i, j, room = ROOM_NONE;
1402 po = pkt_sk(f->arr[idx]);
1405 room = packet_rcv_has_room(po, skb);
1406 if (room == ROOM_NORMAL ||
1407 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1412 i = j = min_t(int, po->rollover->sock, num - 1);
1414 po_next = pkt_sk(f->arr[i]);
1415 if (po_next != po_skip && !po_next->pressure &&
1416 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1418 po->rollover->sock = i;
1419 atomic_long_inc(&po->rollover->num);
1420 if (room == ROOM_LOW)
1421 atomic_long_inc(&po->rollover->num_huge);
1429 atomic_long_inc(&po->rollover->num_failed);
1433 static unsigned int fanout_demux_qm(struct packet_fanout *f,
1434 struct sk_buff *skb,
1437 return skb_get_queue_mapping(skb) % num;
1440 static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1441 struct sk_buff *skb,
1444 struct bpf_prog *prog;
1445 unsigned int ret = 0;
1448 prog = rcu_dereference(f->bpf_prog);
1450 ret = bpf_prog_run_clear_cb(prog, skb) % num;
1456 static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1458 return f->flags & (flag >> 8);
1461 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1462 struct packet_type *pt, struct net_device *orig_dev)
1464 struct packet_fanout *f = pt->af_packet_priv;
1465 unsigned int num = READ_ONCE(f->num_members);
1466 struct net *net = read_pnet(&f->net);
1467 struct packet_sock *po;
1470 if (!net_eq(dev_net(dev), net) || !num) {
1475 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1476 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1481 case PACKET_FANOUT_HASH:
1483 idx = fanout_demux_hash(f, skb, num);
1485 case PACKET_FANOUT_LB:
1486 idx = fanout_demux_lb(f, skb, num);
1488 case PACKET_FANOUT_CPU:
1489 idx = fanout_demux_cpu(f, skb, num);
1491 case PACKET_FANOUT_RND:
1492 idx = fanout_demux_rnd(f, skb, num);
1494 case PACKET_FANOUT_QM:
1495 idx = fanout_demux_qm(f, skb, num);
1497 case PACKET_FANOUT_ROLLOVER:
1498 idx = fanout_demux_rollover(f, skb, 0, false, num);
1500 case PACKET_FANOUT_CBPF:
1501 case PACKET_FANOUT_EBPF:
1502 idx = fanout_demux_bpf(f, skb, num);
1506 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1507 idx = fanout_demux_rollover(f, skb, idx, true, num);
1509 po = pkt_sk(f->arr[idx]);
1510 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
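/* Joining a fanout group from user space is a single PACKET_FANOUT
 * setsockopt; the low 16 bits carry the group id and the high 16 bits the
 * mode and flags (see fanout_add() below). A sketch:
 *
 *   int arg = 42 | (PACKET_FANOUT_HASH << 16);  // group id 42, hash demux
 *   setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 *
 * Every socket bound into group 42 then gets a share of the traffic,
 * demuxed by packet_rcv_fanout() above.
 */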
1513 DEFINE_MUTEX(fanout_mutex);
1514 EXPORT_SYMBOL_GPL(fanout_mutex);
1515 static LIST_HEAD(fanout_list);
1516 static u16 fanout_next_id;
1518 static void __fanout_link(struct sock *sk, struct packet_sock *po)
1520 struct packet_fanout *f = po->fanout;
1522 spin_lock(&f->lock);
1523 f->arr[f->num_members] = sk;
1526 if (f->num_members == 1)
1527 dev_add_pack(&f->prot_hook);
1528 spin_unlock(&f->lock);
1531 static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1533 struct packet_fanout *f = po->fanout;
1536 spin_lock(&f->lock);
1537 for (i = 0; i < f->num_members; i++) {
1538 if (f->arr[i] == sk)
1541 BUG_ON(i >= f->num_members);
1542 f->arr[i] = f->arr[f->num_members - 1];
1544 if (f->num_members == 0)
1545 __dev_remove_pack(&f->prot_hook);
1546 spin_unlock(&f->lock);
1549 static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1551 if (sk->sk_family != PF_PACKET)
1554 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1557 static void fanout_init_data(struct packet_fanout *f)
1560 case PACKET_FANOUT_LB:
1561 atomic_set(&f->rr_cur, 0);
1563 case PACKET_FANOUT_CBPF:
1564 case PACKET_FANOUT_EBPF:
1565 RCU_INIT_POINTER(f->bpf_prog, NULL);
1570 static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1572 struct bpf_prog *old;
1574 spin_lock(&f->lock);
1575 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1576 rcu_assign_pointer(f->bpf_prog, new);
1577 spin_unlock(&f->lock);
1581 bpf_prog_destroy(old);
1585 static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1588 struct bpf_prog *new;
1589 struct sock_fprog fprog;
1592 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1594 if (len != sizeof(fprog))
1596 if (copy_from_user(&fprog, data, len))
1599 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1603 __fanout_set_data_bpf(po->fanout, new);
1607 static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1610 struct bpf_prog *new;
1613 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1615 if (len != sizeof(fd))
1617 if (copy_from_user(&fd, data, len))
1620 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
1622 return PTR_ERR(new);
1624 __fanout_set_data_bpf(po->fanout, new);
1628 static int fanout_set_data(struct packet_sock *po, char __user *data,
1631 switch (po->fanout->type) {
1632 case PACKET_FANOUT_CBPF:
1633 return fanout_set_data_cbpf(po, data, len);
1634 case PACKET_FANOUT_EBPF:
1635 return fanout_set_data_ebpf(po, data, len);
1641 static void fanout_release_data(struct packet_fanout *f)
1644 case PACKET_FANOUT_CBPF:
1645 case PACKET_FANOUT_EBPF:
1646 __fanout_set_data_bpf(f, NULL);
1650 static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1652 struct packet_fanout *f;
1654 list_for_each_entry(f, &fanout_list, list) {
1655 if (f->id == candidate_id &&
1656 read_pnet(&f->net) == sock_net(sk)) {
1663 static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1665 u16 id = fanout_next_id;
1668 if (__fanout_id_is_free(sk, id)) {
1670 fanout_next_id = id + 1;
1675 } while (id != fanout_next_id);
1680 static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1682 struct packet_rollover *rollover = NULL;
1683 struct packet_sock *po = pkt_sk(sk);
1684 struct packet_fanout *f, *match;
1685 u8 type = type_flags & 0xff;
1686 u8 flags = type_flags >> 8;
1690 case PACKET_FANOUT_ROLLOVER:
1691 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1693 case PACKET_FANOUT_HASH:
1694 case PACKET_FANOUT_LB:
1695 case PACKET_FANOUT_CPU:
1696 case PACKET_FANOUT_RND:
1697 case PACKET_FANOUT_QM:
1698 case PACKET_FANOUT_CBPF:
1699 case PACKET_FANOUT_EBPF:
1705 mutex_lock(&fanout_mutex);
1711 if (type == PACKET_FANOUT_ROLLOVER ||
1712 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1714 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1717 atomic_long_set(&rollover->num, 0);
1718 atomic_long_set(&rollover->num_huge, 0);
1719 atomic_long_set(&rollover->num_failed, 0);
1722 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1727 if (!fanout_find_new_id(sk, &id)) {
1731 /* ephemeral flag for the first socket in the group: drop it */
1732 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1736 list_for_each_entry(f, &fanout_list, list) {
1738 read_pnet(&f->net) == sock_net(sk)) {
1744 if (match && match->flags != flags)
1748 match = kzalloc(sizeof(*match), GFP_KERNEL);
1751 write_pnet(&match->net, sock_net(sk));
1754 match->flags = flags;
1755 INIT_LIST_HEAD(&match->list);
1756 spin_lock_init(&match->lock);
1757 refcount_set(&match->sk_ref, 0);
1758 fanout_init_data(match);
1759 match->prot_hook.type = po->prot_hook.type;
1760 match->prot_hook.dev = po->prot_hook.dev;
1761 match->prot_hook.func = packet_rcv_fanout;
1762 match->prot_hook.af_packet_priv = match;
1763 match->prot_hook.af_packet_net = read_pnet(&match->net);
1764 match->prot_hook.id_match = match_fanout_group;
1765 list_add(&match->list, &fanout_list);
1769 spin_lock(&po->bind_lock);
1771 match->type == type &&
1772 match->prot_hook.type == po->prot_hook.type &&
1773 match->prot_hook.dev == po->prot_hook.dev) {
1775 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1776 __dev_remove_pack(&po->prot_hook);
1778 /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
1779 WRITE_ONCE(po->fanout, match);
1781 po->rollover = rollover;
1783 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
1784 __fanout_link(sk, po);
1788 spin_unlock(&po->bind_lock);
1790 if (err && !refcount_read(&match->sk_ref)) {
1791 list_del(&match->list);
1797 mutex_unlock(&fanout_mutex);
1801 /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1802 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1803 * It is the responsibility of the caller to call fanout_release_data() and
1804 * free the returned packet_fanout (after synchronize_net())
1806 static struct packet_fanout *fanout_release(struct sock *sk)
1808 struct packet_sock *po = pkt_sk(sk);
1809 struct packet_fanout *f;
1811 mutex_lock(&fanout_mutex);
1816 if (refcount_dec_and_test(&f->sk_ref))
1821 mutex_unlock(&fanout_mutex);
1826 static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1827 struct sk_buff *skb)
1829 /* Earlier code assumed this would be a VLAN pkt, double-check
1830 * this now that we have the actual packet in hand. We can only
1831 * do this check on Ethernet devices.
1833 if (unlikely(dev->type != ARPHRD_ETHER))
1836 skb_reset_mac_header(skb);
1837 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1840 static const struct proto_ops packet_ops;
1842 static const struct proto_ops packet_ops_spkt;
1844 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1845 struct packet_type *pt, struct net_device *orig_dev)
1848 struct sockaddr_pkt *spkt;
1851 * When we registered the protocol we saved the socket in the data
1852 * field for just this event.
1855 sk = pt->af_packet_priv;
1858 * Yank back the headers [hope the device set this
1859 * right or kerboom...]
1861 * Incoming packets have ll header pulled,
1864 * For outgoing ones skb->data == skb_mac_header(skb)
1865 	 *	so that this procedure is a no-op.
1868 if (skb->pkt_type == PACKET_LOOPBACK)
1871 if (!net_eq(dev_net(dev), sock_net(sk)))
1874 skb = skb_share_check(skb, GFP_ATOMIC);
1878 /* drop any routing info */
1881 /* drop conntrack reference */
1884 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1886 skb_push(skb, skb->data - skb_mac_header(skb));
1889 * The SOCK_PACKET socket receives _all_ frames.
1892 spkt->spkt_family = dev->type;
1893 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1894 spkt->spkt_protocol = skb->protocol;
1897 * Charge the memory to the socket. This is done specifically
1898 	 *	to prevent sockets from using up all the memory.
1901 if (sock_queue_rcv_skb(sk, skb) == 0)
1912 * Output a raw packet to a device layer. This bypasses all the other
1913 * protocol layers and you must therefore supply it with a complete frame
1916 static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1919 struct sock *sk = sock->sk;
1920 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1921 struct sk_buff *skb = NULL;
1922 struct net_device *dev;
1923 struct sockcm_cookie sockc;
1929 * Get and verify the address.
1933 if (msg->msg_namelen < sizeof(struct sockaddr))
1935 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1936 proto = saddr->spkt_protocol;
1938 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1941 * Find the device first to size check it
1944 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1947 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1953 if (!(dev->flags & IFF_UP))
1957 * You may not queue a frame bigger than the mtu. This is the lowest level
1958 * raw protocol and you must do your own fragmentation at this level.
1961 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1962 if (!netif_supports_nofcs(dev)) {
1963 err = -EPROTONOSUPPORT;
1966 extra_len = 4; /* We're doing our own CRC */
1970 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1974 size_t reserved = LL_RESERVED_SPACE(dev);
1975 int tlen = dev->needed_tailroom;
1976 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1979 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1982 /* FIXME: Save some space for broken drivers that write a hard
1983 * header at transmission time by themselves. PPP is the notable
1984 * one here. This should really be fixed at the driver level.
1986 skb_reserve(skb, reserved);
1987 skb_reset_network_header(skb);
1989 /* Try to align data part correctly */
1994 skb_reset_network_header(skb);
1996 err = memcpy_from_msg(skb_put(skb, len), msg, len);
2002 if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
2006 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
2007 !packet_extra_vlan_len_allowed(dev, skb)) {
2012 sockc.tsflags = sk->sk_tsflags;
2013 if (msg->msg_controllen) {
2014 err = sock_cmsg_send(sk, msg, &sockc);
2019 skb->protocol = proto;
2021 skb->priority = sk->sk_priority;
2022 skb->mark = sk->sk_mark;
2024 skb_setup_tx_timestamp(skb, sockc.tsflags);
2026 if (unlikely(extra_len == 4))
2029 skb_probe_transport_header(skb, 0);
2031 dev_queue_xmit(skb);
2042 static unsigned int run_filter(struct sk_buff *skb,
2043 const struct sock *sk,
2046 struct sk_filter *filter;
2049 filter = rcu_dereference(sk->sk_filter);
2051 res = bpf_prog_run_clear_cb(filter->prog, skb);
2057 static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2060 struct virtio_net_hdr vnet_hdr;
2062 if (*len < sizeof(vnet_hdr))
2064 *len -= sizeof(vnet_hdr);
2066 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
2069 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
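/* packet_rcv_vnet() serves sockets that enabled PACKET_VNET_HDR: every
 * receive is then prefixed with a struct virtio_net_hdr describing the
 * frame's offload state. A sketch (assuming fd is a PF_PACKET socket):
 *
 *   #include <linux/virtio_net.h>
 *
 *   int on = 1;
 *   setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &on, sizeof(on));
 *
 *   char buf[sizeof(struct virtio_net_hdr) + 2048];
 *   ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *   struct virtio_net_hdr *vh = (struct virtio_net_hdr *)buf;
 *   // frame data follows at buf + sizeof(*vh)
 */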
2073  * This function does lazy skb cloning in the hope that most packets
2074  * are discarded by BPF.
2076 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2077 * and skb->cb are mangled. It works because (and until) packets
2078 * falling here are owned by current CPU. Output packets are cloned
2079 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2080  * sequentially, so that if we return skb to original state on exit,
2081 * we will not harm anyone.
2084 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2085 struct packet_type *pt, struct net_device *orig_dev)
2088 struct sockaddr_ll *sll;
2089 struct packet_sock *po;
2090 u8 *skb_head = skb->data;
2091 int skb_len = skb->len;
2092 unsigned int snaplen, res;
2093 bool is_drop_n_account = false;
2095 if (skb->pkt_type == PACKET_LOOPBACK)
2098 sk = pt->af_packet_priv;
2101 if (!net_eq(dev_net(dev), sock_net(sk)))
2106 if (dev->header_ops) {
2107 /* The device has an explicit notion of ll header,
2108 * exported to higher levels.
2110 * Otherwise, the device hides details of its frame
2111 * structure, so that corresponding packet head is
2112 * never delivered to user.
2114 if (sk->sk_type != SOCK_DGRAM)
2115 skb_push(skb, skb->data - skb_mac_header(skb));
2116 else if (skb->pkt_type == PACKET_OUTGOING) {
2117 /* Special case: outgoing packets have ll header at head */
2118 skb_pull(skb, skb_network_offset(skb));
2124 res = run_filter(skb, sk, snaplen);
2126 goto drop_n_restore;
2130 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2133 if (skb_shared(skb)) {
2134 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2138 if (skb_head != skb->data) {
2139 skb->data = skb_head;
2146 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2148 sll = &PACKET_SKB_CB(skb)->sa.ll;
2149 sll->sll_hatype = dev->type;
2150 sll->sll_pkttype = skb->pkt_type;
2151 if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
2152 sll->sll_ifindex = orig_dev->ifindex;
2154 sll->sll_ifindex = dev->ifindex;
2156 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2158 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2159 * Use their space for storing the original skb length.
2161 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2163 if (pskb_trim(skb, snaplen))
2166 skb_set_owner_r(skb, sk);
2170 /* drop conntrack reference */
2173 spin_lock(&sk->sk_receive_queue.lock);
2174 po->stats.stats1.tp_packets++;
2175 sock_skb_set_dropcount(sk, skb);
2176 __skb_queue_tail(&sk->sk_receive_queue, skb);
2177 spin_unlock(&sk->sk_receive_queue.lock);
2178 sk->sk_data_ready(sk);
2182 is_drop_n_account = true;
2183 spin_lock(&sk->sk_receive_queue.lock);
2184 po->stats.stats1.tp_drops++;
2185 atomic_inc(&sk->sk_drops);
2186 spin_unlock(&sk->sk_receive_queue.lock);
2189 if (skb_head != skb->data && skb_shared(skb)) {
2190 skb->data = skb_head;
2194 if (!is_drop_n_account)
2201 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2202 struct packet_type *pt, struct net_device *orig_dev)
2205 struct packet_sock *po;
2206 struct sockaddr_ll *sll;
2207 union tpacket_uhdr h;
2208 u8 *skb_head = skb->data;
2209 int skb_len = skb->len;
2210 unsigned int snaplen, res;
2211 unsigned long status = TP_STATUS_USER;
2212 unsigned short macoff, hdrlen;
2213 unsigned int netoff;
2214 struct sk_buff *copy_skb = NULL;
2217 bool is_drop_n_account = false;
2218 unsigned int slot_id = 0;
2219 bool do_vnet = false;
2221 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2222 * We may add members to them until current aligned size without forcing
2223 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2225 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2226 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2228 if (skb->pkt_type == PACKET_LOOPBACK)
2231 sk = pt->af_packet_priv;
2234 if (!net_eq(dev_net(dev), sock_net(sk)))
2237 if (dev->header_ops) {
2238 if (sk->sk_type != SOCK_DGRAM)
2239 skb_push(skb, skb->data - skb_mac_header(skb));
2240 else if (skb->pkt_type == PACKET_OUTGOING) {
2241 /* Special case: outgoing packets have ll header at head */
2242 skb_pull(skb, skb_network_offset(skb));
2248 res = run_filter(skb, sk, snaplen);
2250 goto drop_n_restore;
2252 if (skb->ip_summed == CHECKSUM_PARTIAL)
2253 status |= TP_STATUS_CSUMNOTREADY;
2254 else if (skb->pkt_type != PACKET_OUTGOING &&
2255 skb_csum_unnecessary(skb))
2256 status |= TP_STATUS_CSUM_VALID;
2261 if (sk->sk_type == SOCK_DGRAM) {
2262 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2265 unsigned int maclen = skb_network_offset(skb);
2266 netoff = TPACKET_ALIGN(po->tp_hdrlen +
2267 (maclen < 16 ? 16 : maclen)) +
2269 if (po->has_vnet_hdr) {
2270 netoff += sizeof(struct virtio_net_hdr);
2273 macoff = netoff - maclen;
2275 if (netoff > USHRT_MAX) {
2276 spin_lock(&sk->sk_receive_queue.lock);
2277 po->stats.stats1.tp_drops++;
2278 spin_unlock(&sk->sk_receive_queue.lock);
2279 goto drop_n_restore;
2281 if (po->tp_version <= TPACKET_V2) {
2282 if (macoff + snaplen > po->rx_ring.frame_size) {
2283 if (po->copy_thresh &&
2284 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2285 if (skb_shared(skb)) {
2286 copy_skb = skb_clone(skb, GFP_ATOMIC);
2288 copy_skb = skb_get(skb);
2289 skb_head = skb->data;
2292 memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
2293 sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
2294 skb_set_owner_r(copy_skb, sk);
2297 snaplen = po->rx_ring.frame_size - macoff;
2298 if ((int)snaplen < 0) {
2303 } else if (unlikely(macoff + snaplen >
2304 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2307 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2308 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2309 snaplen, nval, macoff);
2311 if (unlikely((int)snaplen < 0)) {
2313 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2317 spin_lock(&sk->sk_receive_queue.lock);
2318 h.raw = packet_current_rx_frame(po, skb,
2319 TP_STATUS_KERNEL, (macoff+snaplen));
2321 goto drop_n_account;
2323 if (po->tp_version <= TPACKET_V2) {
2324 slot_id = po->rx_ring.head;
2325 if (test_bit(slot_id, po->rx_ring.rx_owner_map))
2326 goto drop_n_account;
2327 __set_bit(slot_id, po->rx_ring.rx_owner_map);
2331 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2332 sizeof(struct virtio_net_hdr),
2333 vio_le(), true, 0)) {
2334 if (po->tp_version == TPACKET_V3)
2335 prb_clear_blk_fill_status(&po->rx_ring);
2336 goto drop_n_account;
2339 if (po->tp_version <= TPACKET_V2) {
2340 packet_increment_rx_head(po, &po->rx_ring);
2342 * LOSING will be reported till you read the stats,
2343 * because it's COR - Clear On Read.
2344 		 * Anyway, moving it for V1/V2 only as V3 doesn't need this
2347 if (po->stats.stats1.tp_drops)
2348 status |= TP_STATUS_LOSING;
2351 po->stats.stats1.tp_packets++;
2353 status |= TP_STATUS_COPY;
2354 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2356 spin_unlock(&sk->sk_receive_queue.lock);
2358 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2360 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
2361 getnstimeofday(&ts);
2363 status |= ts_status;
2365 switch (po->tp_version) {
2367 h.h1->tp_len = skb->len;
2368 h.h1->tp_snaplen = snaplen;
2369 h.h1->tp_mac = macoff;
2370 h.h1->tp_net = netoff;
2371 h.h1->tp_sec = ts.tv_sec;
2372 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2373 hdrlen = sizeof(*h.h1);
2376 h.h2->tp_len = skb->len;
2377 h.h2->tp_snaplen = snaplen;
2378 h.h2->tp_mac = macoff;
2379 h.h2->tp_net = netoff;
2380 h.h2->tp_sec = ts.tv_sec;
2381 h.h2->tp_nsec = ts.tv_nsec;
2382 if (skb_vlan_tag_present(skb)) {
2383 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2384 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2385 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2387 h.h2->tp_vlan_tci = 0;
2388 h.h2->tp_vlan_tpid = 0;
2390 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2391 hdrlen = sizeof(*h.h2);
2394 		/* tp_nxt_offset, vlan are already populated above.
2395 		 * So DON'T clear those fields here
2397 h.h3->tp_status |= status;
2398 h.h3->tp_len = skb->len;
2399 h.h3->tp_snaplen = snaplen;
2400 h.h3->tp_mac = macoff;
2401 h.h3->tp_net = netoff;
2402 h.h3->tp_sec = ts.tv_sec;
2403 h.h3->tp_nsec = ts.tv_nsec;
2404 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2405 hdrlen = sizeof(*h.h3);
2411 sll = h.raw + TPACKET_ALIGN(hdrlen);
2412 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2413 sll->sll_family = AF_PACKET;
2414 sll->sll_hatype = dev->type;
2415 sll->sll_protocol = skb->protocol;
2416 sll->sll_pkttype = skb->pkt_type;
2417 if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
2418 sll->sll_ifindex = orig_dev->ifindex;
2420 sll->sll_ifindex = dev->ifindex;
2424 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2425 if (po->tp_version <= TPACKET_V2) {
2428 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2431 for (start = h.raw; start < end; start += PAGE_SIZE)
2432 flush_dcache_page(pgv_to_page(start));
2437 if (po->tp_version <= TPACKET_V2) {
2438 spin_lock(&sk->sk_receive_queue.lock);
2439 __packet_set_status(po, h.raw, status);
2440 __clear_bit(slot_id, po->rx_ring.rx_owner_map);
2441 spin_unlock(&sk->sk_receive_queue.lock);
2442 sk->sk_data_ready(sk);
2443 } else if (po->tp_version == TPACKET_V3) {
2444 prb_clear_blk_fill_status(&po->rx_ring);
2448 if (skb_head != skb->data && skb_shared(skb)) {
2449 skb->data = skb_head;
2453 if (!is_drop_n_account)
2460 is_drop_n_account = true;
2461 po->stats.stats1.tp_drops++;
2462 spin_unlock(&sk->sk_receive_queue.lock);
2464 sk->sk_data_ready(sk);
2465 kfree_skb(copy_skb);
2466 goto drop_n_restore;
2469 static void tpacket_destruct_skb(struct sk_buff *skb)
2471 struct packet_sock *po = pkt_sk(skb->sk);
2473 if (likely(po->tx_ring.pg_vec)) {
2477 ph = skb_zcopy_get_nouarg(skb);
2478 packet_dec_pending(&po->tx_ring);
2480 ts = __packet_set_timestamp(po, ph, skb);
2481 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2483 if (!packet_read_pending(&po->tx_ring))
2484 complete(&po->skb_completion);
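/* The status handshake above is the kernel half of the TX ring protocol:
 * user space marks a slot TP_STATUS_SEND_REQUEST and kicks the socket, and
 * tpacket_destruct_skb() flips it back to TP_STATUS_AVAILABLE once the frame
 * has really left. A rough user-space sketch (assumes a TPACKET_V2 ring
 * already set up with PACKET_TX_RING and mmap()ed at txring; the data offset
 * follows Documentation/networking/packet_mmap.txt):
 *
 *   struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)
 *           ((char *)txring + (size_t)slot * req.tp_frame_size);
 *   char *data = (char *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *   memcpy(data, frame, frame_len);
 *   hdr->tp_len = frame_len;
 *   hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *   send(fd, NULL, 0, 0);                    // flush all queued slots
 */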
2490 static void tpacket_set_protocol(const struct net_device *dev,
2491 struct sk_buff *skb)
2493 if (dev->type == ARPHRD_ETHER) {
2494 skb_reset_mac_header(skb);
2495 skb->protocol = eth_hdr(skb)->h_proto;
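/*
 * Sanity-check a virtio_net_hdr supplied by userspace: if checksum
 * offload is requested, hdr_len must cover csum_start + csum_offset + 2
 * (it is bumped up when it does not), and hdr_len may never exceed the
 * amount of packet data actually provided.
 */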
2499 static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2501 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2502 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2503 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2504 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2505 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2506 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2507 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2509 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2515 static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2516 struct virtio_net_hdr *vnet_hdr)
2518 if (*len < sizeof(*vnet_hdr))
2520 *len -= sizeof(*vnet_hdr);
2522 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
2525 return __packet_snd_vnet_parse(vnet_hdr, *len);
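/*
 * Build an skb out of a single TX ring frame: the link-layer header is
 * copied into the linear area (or constructed via dev_hard_header() for
 * SOCK_DGRAM), the rest of the frame is attached as page fragments
 * straight out of the ring, and the added bytes are charged to the
 * socket's write allocation.
 */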
2528 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2529 void *frame, struct net_device *dev, void *data, int tp_len,
2530 __be16 proto, unsigned char *addr, int hlen, int copylen,
2531 const struct sockcm_cookie *sockc)
2533 union tpacket_uhdr ph;
2534 int to_write, offset, len, nr_frags, len_max;
2535 struct socket *sock = po->sk.sk_socket;
2541 skb->protocol = proto;
2543 skb->priority = po->sk.sk_priority;
2544 skb->mark = po->sk.sk_mark;
2545 skb_setup_tx_timestamp(skb, sockc->tsflags);
2546 skb_zcopy_set_nouarg(skb, ph.raw);
2548 skb_reserve(skb, hlen);
2549 skb_reset_network_header(skb);
2553 if (sock->type == SOCK_DGRAM) {
2554 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2556 if (unlikely(err < 0))
2558 } else if (copylen) {
2559 int hdrlen = min_t(int, copylen, tp_len);
2561 skb_push(skb, dev->hard_header_len);
2562 skb_put(skb, copylen - dev->hard_header_len);
2563 err = skb_store_bits(skb, 0, data, hdrlen);
2566 if (!dev_validate_header(dev, skb->data, hdrlen))
2569 tpacket_set_protocol(dev, skb);
2575 offset = offset_in_page(data);
2576 len_max = PAGE_SIZE - offset;
2577 len = ((to_write > len_max) ? len_max : to_write);
2579 skb->data_len = to_write;
2580 skb->len += to_write;
2581 skb->truesize += to_write;
2582 refcount_add(to_write, &po->sk.sk_wmem_alloc);
2584 while (likely(to_write)) {
2585 nr_frags = skb_shinfo(skb)->nr_frags;
2587 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2588 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2593 page = pgv_to_page(data);
2595 flush_dcache_page(page);
2597 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2600 len_max = PAGE_SIZE;
2601 len = ((to_write > len_max) ? len_max : to_write);
2604 skb_probe_transport_header(skb, 0);
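/*
 * Pull the frame length and the offset of the packet data out of a TX
 * ring frame header for the socket's TPACKET version. With
 * PACKET_TX_HAS_OFF the offset comes from tp_net/tp_mac and must lie
 * within the frame; otherwise the data simply follows the header.
 */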
2609 static int tpacket_parse_header(struct packet_sock *po, void *frame,
2610 int size_max, void **data)
2612 union tpacket_uhdr ph;
2617 switch (po->tp_version) {
2619 if (ph.h3->tp_next_offset != 0) {
2620 pr_warn_once("variable sized slot not supported");
2623 tp_len = ph.h3->tp_len;
2626 tp_len = ph.h2->tp_len;
2629 tp_len = ph.h1->tp_len;
2632 if (unlikely(tp_len > size_max)) {
2633 pr_err("packet size is too large (%d > %d)\n", tp_len, size_max);
2637 if (unlikely(po->tp_tx_has_off)) {
2638 int off_min, off_max;
2640 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2641 off_max = po->tx_ring.frame_size - tp_len;
2642 if (po->sk.sk_type == SOCK_DGRAM) {
2643 switch (po->tp_version) {
2645 off = ph.h3->tp_net;
2648 off = ph.h2->tp_net;
2651 off = ph.h1->tp_net;
2655 switch (po->tp_version) {
2657 off = ph.h3->tp_mac;
2660 off = ph.h2->tp_mac;
2663 off = ph.h1->tp_mac;
2667 if (unlikely((off < off_min) || (off_max < off)))
2670 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2673 *data = frame + off;
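/*
 * Transmit path for PACKET_TX_RING: walk the ring for frames marked
 * TP_STATUS_SEND_REQUEST, turn each into an skb and pass it to
 * po->xmit(). Unless MSG_DONTWAIT was given, wait on skb_completion for
 * in-flight frames to be released by tpacket_destruct_skb() before
 * giving up on an empty ring.
 */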
2677 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2679 struct sk_buff *skb = NULL;
2680 struct net_device *dev;
2681 struct virtio_net_hdr *vnet_hdr = NULL;
2682 struct sockcm_cookie sockc;
2684 int err, reserve = 0;
2686 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2687 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2688 unsigned char *addr = NULL;
2689 int tp_len, size_max;
2692 int status = TP_STATUS_AVAILABLE;
2693 int hlen, tlen, copylen = 0;
2696 mutex_lock(&po->pg_vec_lock);
2698 /* The packet_sendmsg() check on tx_ring.pg_vec was lockless,
2699 * so we need to confirm it under the protection of pg_vec_lock.
2701 if (unlikely(!po->tx_ring.pg_vec)) {
2705 if (likely(saddr == NULL)) {
2706 dev = packet_cached_dev_get(po);
2707 proto = READ_ONCE(po->num);
2710 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2712 if (msg->msg_namelen < (saddr->sll_halen
2713 + offsetof(struct sockaddr_ll,
2716 proto = saddr->sll_protocol;
2717 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2718 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2719 if (dev && msg->msg_namelen < dev->addr_len +
2720 offsetof(struct sockaddr_ll, sll_addr))
2722 addr = saddr->sll_addr;
2727 if (unlikely(dev == NULL))
2730 if (unlikely(!(dev->flags & IFF_UP)))
2733 sockc.tsflags = po->sk.sk_tsflags;
2734 if (msg->msg_controllen) {
2735 err = sock_cmsg_send(&po->sk, msg, &sockc);
2740 if (po->sk.sk_socket->type == SOCK_RAW)
2741 reserve = dev->hard_header_len;
2742 size_max = po->tx_ring.frame_size
2743 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2745 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
2746 size_max = dev->mtu + reserve + VLAN_HLEN;
2748 reinit_completion(&po->skb_completion);
2751 ph = packet_current_frame(po, &po->tx_ring,
2752 TP_STATUS_SEND_REQUEST);
2753 if (unlikely(ph == NULL)) {
2754 if (need_wait && skb) {
2755 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2756 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2758 err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2762 /* check for additional frames */
2767 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2771 status = TP_STATUS_SEND_REQUEST;
2772 hlen = LL_RESERVED_SPACE(dev);
2773 tlen = dev->needed_tailroom;
2774 if (po->has_vnet_hdr) {
2776 data += sizeof(*vnet_hdr);
2777 tp_len -= sizeof(*vnet_hdr);
2779 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2783 copylen = __virtio16_to_cpu(vio_le(),
2786 copylen = max_t(int, copylen, dev->hard_header_len);
2787 skb = sock_alloc_send_skb(&po->sk,
2788 hlen + tlen + sizeof(struct sockaddr_ll) +
2789 (copylen - dev->hard_header_len),
2792 if (unlikely(skb == NULL)) {
2793 /* we assume the socket was initially writeable ... */
2794 if (likely(len_sum > 0))
2798 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2799 addr, hlen, copylen, &sockc);
2800 if (likely(tp_len >= 0) &&
2801 tp_len > dev->mtu + reserve &&
2802 !po->has_vnet_hdr &&
2803 !packet_extra_vlan_len_allowed(dev, skb))
2806 if (unlikely(tp_len < 0)) {
2809 __packet_set_status(po, ph,
2810 TP_STATUS_AVAILABLE);
2811 packet_increment_head(&po->tx_ring);
2815 status = TP_STATUS_WRONG_FORMAT;
2821 if (po->has_vnet_hdr) {
2822 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2826 virtio_net_hdr_set_proto(skb, vnet_hdr);
2829 skb->destructor = tpacket_destruct_skb;
2830 __packet_set_status(po, ph, TP_STATUS_SENDING);
2831 packet_inc_pending(&po->tx_ring);
2833 status = TP_STATUS_SEND_REQUEST;
2834 err = po->xmit(skb);
2835 if (unlikely(err != 0)) {
2837 err = net_xmit_errno(err);
2838 if (err && __packet_get_status(po, ph) ==
2839 TP_STATUS_AVAILABLE) {
2840 /* skb was destructed already */
2845 * skb was dropped but not destructed yet;
2846 * let's treat it like congestion or err < 0
2850 packet_increment_head(&po->tx_ring);
2852 } while (likely((ph != NULL) ||
2853 /* Note: packet_read_pending() might be slow if we have
2854 * to call it, as it's a per-cpu variable, but in the fast path
2855 * we already short-circuit the loop with the first
2856 * condition, and luckily don't have to go down that path
2859 (need_wait && packet_read_pending(&po->tx_ring))));
2865 __packet_set_status(po, ph, status);
2870 mutex_unlock(&po->pg_vec_lock);
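/*
 * Allocate an skb for the non-ring send path: 'linear' bytes of headers
 * go into the linear area and the remainder of 'len' is attached as
 * paged data; sub-page packets are simply allocated fully linear.
 */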
2874 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2875 size_t reserve, size_t len,
2876 size_t linear, int noblock,
2879 struct sk_buff *skb;
2881 /* Under a page? Don't bother with paged skb. */
2882 if (prepad + len < PAGE_SIZE || !linear)
2885 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2890 skb_reserve(skb, reserve);
2891 skb_put(skb, linear);
2892 skb->data_len = len - linear;
2893 skb->len += len - linear;
2898 static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2900 struct sock *sk = sock->sk;
2901 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2902 struct sk_buff *skb;
2903 struct net_device *dev;
2905 unsigned char *addr = NULL;
2906 int err, reserve = 0;
2907 struct sockcm_cookie sockc;
2908 struct virtio_net_hdr vnet_hdr = { 0 };
2910 struct packet_sock *po = pkt_sk(sk);
2911 bool has_vnet_hdr = false;
2912 int hlen, tlen, linear;
2916 * Get and verify the address.
2919 if (likely(saddr == NULL)) {
2920 dev = packet_cached_dev_get(po);
2921 proto = READ_ONCE(po->num);
2924 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2926 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2928 proto = saddr->sll_protocol;
2929 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2930 if (sock->type == SOCK_DGRAM) {
2931 if (dev && msg->msg_namelen < dev->addr_len +
2932 offsetof(struct sockaddr_ll, sll_addr))
2934 addr = saddr->sll_addr;
2939 if (unlikely(dev == NULL))
2942 if (unlikely(!(dev->flags & IFF_UP)))
2945 sockc.tsflags = sk->sk_tsflags;
2946 sockc.mark = sk->sk_mark;
2947 if (msg->msg_controllen) {
2948 err = sock_cmsg_send(sk, msg, &sockc);
2953 if (sock->type == SOCK_RAW)
2954 reserve = dev->hard_header_len;
2955 if (po->has_vnet_hdr) {
2956 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2959 has_vnet_hdr = true;
2962 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2963 if (!netif_supports_nofcs(dev)) {
2964 err = -EPROTONOSUPPORT;
2967 extra_len = 4; /* We're doing our own CRC */
2971 if (!vnet_hdr.gso_type &&
2972 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2976 hlen = LL_RESERVED_SPACE(dev);
2977 tlen = dev->needed_tailroom;
2978 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2979 linear = max(linear, min_t(int, len, dev->hard_header_len));
2980 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
2981 msg->msg_flags & MSG_DONTWAIT, &err);
2985 skb_reset_network_header(skb);
2988 if (sock->type == SOCK_DGRAM) {
2989 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
2990 if (unlikely(offset < 0))
2992 } else if (reserve) {
2993 skb_reserve(skb, -reserve);
2995 skb_reset_network_header(skb);
2998 /* Returns -EFAULT on error */
2999 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
3003 if (sock->type == SOCK_RAW &&
3004 !dev_validate_header(dev, skb->data, len)) {
3009 skb_setup_tx_timestamp(skb, sockc.tsflags);
3011 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3012 !packet_extra_vlan_len_allowed(dev, skb)) {
3017 skb->protocol = proto;
3019 skb->priority = sk->sk_priority;
3020 skb->mark = sockc.mark;
3023 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
3026 len += sizeof(vnet_hdr);
3027 virtio_net_hdr_set_proto(skb, &vnet_hdr);
3030 skb_probe_transport_header(skb, reserve);
3032 if (unlikely(extra_len == 4))
3035 err = po->xmit(skb);
3036 if (unlikely(err != 0)) {
3038 err = net_xmit_errno(err);
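/*
 * sendmsg() entry point: use the TX ring path when PACKET_TX_RING has
 * been configured, otherwise fall back to the ordinary copying
 * packet_snd() path. The lockless pg_vec check is re-validated under
 * pg_vec_lock inside tpacket_snd().
 */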
3056 static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
3058 struct sock *sk = sock->sk;
3059 struct packet_sock *po = pkt_sk(sk);
3061 if (po->tx_ring.pg_vec)
3062 return tpacket_snd(po, msg);
3064 return packet_snd(sock, msg, len);
3068 * Close a PACKET socket. This is fairly simple. We immediately go
3069 * to 'closed' state and remove our protocol entry from the device list.
3072 static int packet_release(struct socket *sock)
3074 struct sock *sk = sock->sk;
3075 struct packet_sock *po;
3076 struct packet_fanout *f;
3078 union tpacket_req_u req_u;
3086 mutex_lock(&net->packet.sklist_lock);
3087 sk_del_node_init_rcu(sk);
3088 mutex_unlock(&net->packet.sklist_lock);
3091 sock_prot_inuse_add(net, sk->sk_prot, -1);
3094 spin_lock(&po->bind_lock);
3095 unregister_prot_hook(sk, false);
3096 packet_cached_dev_reset(po);
3098 if (po->prot_hook.dev) {
3099 dev_put(po->prot_hook.dev);
3100 po->prot_hook.dev = NULL;
3102 spin_unlock(&po->bind_lock);
3104 packet_flush_mclist(sk);
3107 if (po->rx_ring.pg_vec) {
3108 memset(&req_u, 0, sizeof(req_u));
3109 packet_set_ring(sk, &req_u, 1, 0);
3112 if (po->tx_ring.pg_vec) {
3113 memset(&req_u, 0, sizeof(req_u));
3114 packet_set_ring(sk, &req_u, 1, 1);
3118 f = fanout_release(sk);
3123 kfree(po->rollover);
3124 fanout_release_data(f);
3128 * Now the socket is dead. No more input will appear.
3135 skb_queue_purge(&sk->sk_receive_queue);
3136 packet_free_pending(po);
3137 sk_refcnt_debug_release(sk);
3144 * Attach a packet hook.
3147 static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3150 struct packet_sock *po = pkt_sk(sk);
3151 struct net_device *dev_curr;
3154 struct net_device *dev = NULL;
3156 bool unlisted = false;
3159 spin_lock(&po->bind_lock);
3171 dev = dev_get_by_name_rcu(sock_net(sk), name);
3176 } else if (ifindex) {
3177 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3187 proto_curr = po->prot_hook.type;
3188 dev_curr = po->prot_hook.dev;
3190 need_rehook = proto_curr != proto || dev_curr != dev;
3195 /* prevents packet_notifier() from calling
3196 * register_prot_hook()
3198 WRITE_ONCE(po->num, 0);
3199 __unregister_prot_hook(sk, true);
3201 dev_curr = po->prot_hook.dev;
3203 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3207 BUG_ON(po->running);
3208 WRITE_ONCE(po->num, proto);
3209 po->prot_hook.type = proto;
3211 if (unlikely(unlisted)) {
3213 po->prot_hook.dev = NULL;
3214 WRITE_ONCE(po->ifindex, -1);
3215 packet_cached_dev_reset(po);
3217 po->prot_hook.dev = dev;
3218 WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
3219 packet_cached_dev_assign(po, dev);
3225 if (proto == 0 || !need_rehook)
3228 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3229 register_prot_hook(sk);
3231 sk->sk_err = ENETDOWN;
3232 if (!sock_flag(sk, SOCK_DEAD))
3233 sk->sk_error_report(sk);
3238 spin_unlock(&po->bind_lock);
3244 * Bind a packet socket to a device
3247 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3250 struct sock *sk = sock->sk;
3251 char name[sizeof(uaddr->sa_data) + 1];
3257 if (addr_len != sizeof(struct sockaddr))
3259 /* uaddr->sa_data comes from userspace; it's not guaranteed to be null-terminated. */
3262 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3263 name[sizeof(uaddr->sa_data)] = 0;
3265 return packet_do_bind(sk, name, 0, 0);
3268 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3270 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3271 struct sock *sk = sock->sk;
3277 if (addr_len < sizeof(struct sockaddr_ll))
3279 if (sll->sll_family != AF_PACKET)
3282 return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol);
3285 static struct proto packet_proto = {
3287 .owner = THIS_MODULE,
3288 .obj_size = sizeof(struct packet_sock),
3292 * Create a packet socket.
3295 static int packet_create(struct net *net, struct socket *sock, int protocol,
3299 struct packet_sock *po;
3300 __be16 proto = (__force __be16)protocol; /* weird, but documented */
3303 if (!ns_capable(net->user_ns, CAP_NET_RAW))
3305 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3306 sock->type != SOCK_PACKET)
3307 return -ESOCKTNOSUPPORT;
3309 sock->state = SS_UNCONNECTED;
3312 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3316 sock->ops = &packet_ops;
3317 if (sock->type == SOCK_PACKET)
3318 sock->ops = &packet_ops_spkt;
3320 sock_init_data(sock, sk);
3323 init_completion(&po->skb_completion);
3324 sk->sk_family = PF_PACKET;
3326 po->xmit = dev_queue_xmit;
3328 err = packet_alloc_pending(po);
3332 packet_cached_dev_reset(po);
3334 sk->sk_destruct = packet_sock_destruct;
3335 sk_refcnt_debug_inc(sk);
3338 * Attach a protocol block
3341 spin_lock_init(&po->bind_lock);
3342 mutex_init(&po->pg_vec_lock);
3343 po->rollover = NULL;
3344 po->prot_hook.func = packet_rcv;
3346 if (sock->type == SOCK_PACKET)
3347 po->prot_hook.func = packet_rcv_spkt;
3349 po->prot_hook.af_packet_priv = sk;
3350 po->prot_hook.af_packet_net = sock_net(sk);
3353 po->prot_hook.type = proto;
3354 __register_prot_hook(sk);
3357 mutex_lock(&net->packet.sklist_lock);
3358 sk_add_node_tail_rcu(sk, &net->packet.sklist);
3359 mutex_unlock(&net->packet.sklist_lock);
3362 sock_prot_inuse_add(net, &packet_proto, 1);
3373 * Pull a packet from our receive queue and hand it to the user.
3374 * If necessary we block.
3377 static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3380 struct sock *sk = sock->sk;
3381 struct sk_buff *skb;
3383 int vnet_hdr_len = 0;
3384 unsigned int origlen = 0;
3387 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3391 /* What error should we return now? EUNATTACH? */
3392 if (pkt_sk(sk)->ifindex < 0)
3396 if (flags & MSG_ERRQUEUE) {
3397 err = sock_recv_errqueue(sk, msg, len,
3398 SOL_PACKET, PACKET_TX_TIMESTAMP);
3403 * Call the generic datagram receiver. This handles all sorts
3404 * of horrible races and re-entrancy so we can forget about it
3405 * in the protocol layers.
3407 * Now it will return ENETDOWN if the device has just gone down,
3408 * but then it will block.
3411 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
3414 * An error occurred, so return it. Because skb_recv_datagram()
3415 * handles the blocking, we don't have to see or worry about blocking retries.
3422 if (pkt_sk(sk)->pressure)
3423 packet_rcv_has_room(pkt_sk(sk), NULL);
3425 if (pkt_sk(sk)->has_vnet_hdr) {
3426 err = packet_rcv_vnet(msg, skb, &len);
3429 vnet_hdr_len = sizeof(struct virtio_net_hdr);
3432 /* You lose any data beyond the buffer you gave. If this worries
3433 * a user program, it can ask the device for its MTU
3439 msg->msg_flags |= MSG_TRUNC;
3442 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3446 if (sock->type != SOCK_PACKET) {
3447 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3449 /* Original length was stored in sockaddr_ll fields */
3450 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3451 sll->sll_family = AF_PACKET;
3452 sll->sll_protocol = skb->protocol;
3455 sock_recv_ts_and_drops(msg, sk, skb);
3457 if (msg->msg_name) {
3458 const size_t max_len = min(sizeof(skb->cb),
3459 sizeof(struct sockaddr_storage));
3462 /* If the address length field is there to be filled
3463 * in, we fill it in now.
3465 if (sock->type == SOCK_PACKET) {
3466 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
3467 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3468 copy_len = msg->msg_namelen;
3470 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3472 msg->msg_namelen = sll->sll_halen +
3473 offsetof(struct sockaddr_ll, sll_addr);
3474 copy_len = msg->msg_namelen;
3475 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3476 memset(msg->msg_name +
3477 offsetof(struct sockaddr_ll, sll_addr),
3478 0, sizeof(sll->sll_addr));
3479 msg->msg_namelen = sizeof(struct sockaddr_ll);
3482 if (WARN_ON_ONCE(copy_len > max_len)) {
3484 msg->msg_namelen = copy_len;
3486 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
3489 if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
3490 struct tpacket_auxdata aux;
3492 aux.tp_status = TP_STATUS_USER;
3493 if (skb->ip_summed == CHECKSUM_PARTIAL)
3494 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3495 else if (skb->pkt_type != PACKET_OUTGOING &&
3496 skb_csum_unnecessary(skb))
3497 aux.tp_status |= TP_STATUS_CSUM_VALID;
3499 aux.tp_len = origlen;
3500 aux.tp_snaplen = skb->len;
3502 aux.tp_net = skb_network_offset(skb);
3503 if (skb_vlan_tag_present(skb)) {
3504 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3505 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3506 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3508 aux.tp_vlan_tci = 0;
3509 aux.tp_vlan_tpid = 0;
3511 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3515 * Free or return the buffer as appropriate. Again this
3516 * hides all the races and re-entrancy issues from us.
3518 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3521 skb_free_datagram(sk, skb);
3526 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3527 int *uaddr_len, int peer)
3529 struct net_device *dev;
3530 struct sock *sk = sock->sk;
3535 uaddr->sa_family = AF_PACKET;
3536 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3538 dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
3540 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3542 *uaddr_len = sizeof(*uaddr);
3547 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3548 int *uaddr_len, int peer)
3550 struct net_device *dev;
3551 struct sock *sk = sock->sk;
3552 struct packet_sock *po = pkt_sk(sk);
3553 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3559 ifindex = READ_ONCE(po->ifindex);
3560 sll->sll_family = AF_PACKET;
3561 sll->sll_ifindex = ifindex;
3562 sll->sll_protocol = READ_ONCE(po->num);
3563 sll->sll_pkttype = 0;
3565 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3567 sll->sll_hatype = dev->type;
3568 sll->sll_halen = dev->addr_len;
3569 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3571 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3575 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3580 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3584 case PACKET_MR_MULTICAST:
3585 if (i->alen != dev->addr_len)
3588 return dev_mc_add(dev, i->addr);
3590 return dev_mc_del(dev, i->addr);
3592 case PACKET_MR_PROMISC:
3593 return dev_set_promiscuity(dev, what);
3594 case PACKET_MR_ALLMULTI:
3595 return dev_set_allmulti(dev, what);
3596 case PACKET_MR_UNICAST:
3597 if (i->alen != dev->addr_len)
3600 return dev_uc_add(dev, i->addr);
3602 return dev_uc_del(dev, i->addr);
3610 static void packet_dev_mclist_delete(struct net_device *dev,
3611 struct packet_mclist **mlp)
3613 struct packet_mclist *ml;
3615 while ((ml = *mlp) != NULL) {
3616 if (ml->ifindex == dev->ifindex) {
3617 packet_dev_mc(dev, ml, -1);
3625 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3627 struct packet_sock *po = pkt_sk(sk);
3628 struct packet_mclist *ml, *i;
3629 struct net_device *dev;
3635 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3640 if (mreq->mr_alen > dev->addr_len)
3644 i = kmalloc(sizeof(*i), GFP_KERNEL);
3649 for (ml = po->mclist; ml; ml = ml->next) {
3650 if (ml->ifindex == mreq->mr_ifindex &&
3651 ml->type == mreq->mr_type &&
3652 ml->alen == mreq->mr_alen &&
3653 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3655 /* Free the new element ... */
3661 i->type = mreq->mr_type;
3662 i->ifindex = mreq->mr_ifindex;
3663 i->alen = mreq->mr_alen;
3664 memcpy(i->addr, mreq->mr_address, i->alen);
3665 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3667 i->next = po->mclist;
3669 err = packet_dev_mc(dev, i, 1);
3671 po->mclist = i->next;
3680 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3682 struct packet_mclist *ml, **mlp;
3686 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3687 if (ml->ifindex == mreq->mr_ifindex &&
3688 ml->type == mreq->mr_type &&
3689 ml->alen == mreq->mr_alen &&
3690 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3691 if (--ml->count == 0) {
3692 struct net_device *dev;
3694 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3696 packet_dev_mc(dev, ml, -1);
3706 static void packet_flush_mclist(struct sock *sk)
3708 struct packet_sock *po = pkt_sk(sk);
3709 struct packet_mclist *ml;
3715 while ((ml = po->mclist) != NULL) {
3716 struct net_device *dev;
3718 po->mclist = ml->next;
3719 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3721 packet_dev_mc(dev, ml, -1);
3728 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3730 struct sock *sk = sock->sk;
3731 struct packet_sock *po = pkt_sk(sk);
3734 if (level != SOL_PACKET)
3735 return -ENOPROTOOPT;
3738 case PACKET_ADD_MEMBERSHIP:
3739 case PACKET_DROP_MEMBERSHIP:
3741 struct packet_mreq_max mreq;
3743 memset(&mreq, 0, sizeof(mreq));
3744 if (len < sizeof(struct packet_mreq))
3746 if (len > sizeof(mreq))
3748 if (copy_from_user(&mreq, optval, len))
3750 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3752 if (optname == PACKET_ADD_MEMBERSHIP)
3753 ret = packet_mc_add(sk, &mreq);
3755 ret = packet_mc_drop(sk, &mreq);
3759 case PACKET_RX_RING:
3760 case PACKET_TX_RING:
3762 union tpacket_req_u req_u;
3766 switch (po->tp_version) {
3769 len = sizeof(req_u.req);
3773 len = sizeof(req_u.req3);
3779 if (copy_from_user(&req_u.req, optval, len))
3782 ret = packet_set_ring(sk, &req_u, 0,
3783 optname == PACKET_TX_RING);
3788 case PACKET_COPY_THRESH:
3792 if (optlen != sizeof(val))
3794 if (copy_from_user(&val, optval, sizeof(val)))
3797 pkt_sk(sk)->copy_thresh = val;
3800 case PACKET_VERSION:
3804 if (optlen != sizeof(val))
3806 if (copy_from_user(&val, optval, sizeof(val)))
3817 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3820 po->tp_version = val;
3826 case PACKET_RESERVE:
3830 if (optlen != sizeof(val))
3832 if (copy_from_user(&val, optval, sizeof(val)))
3837 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3840 po->tp_reserve = val;
3850 if (optlen != sizeof(val))
3852 if (copy_from_user(&val, optval, sizeof(val)))
3856 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3859 po->tp_loss = !!val;
3865 case PACKET_AUXDATA:
3869 if (optlen < sizeof(val))
3871 if (copy_from_user(&val, optval, sizeof(val)))
3874 packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
3877 case PACKET_ORIGDEV:
3881 if (optlen < sizeof(val))
3883 if (copy_from_user(&val, optval, sizeof(val)))
3886 packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
3889 case PACKET_VNET_HDR:
3893 if (sock->type != SOCK_RAW)
3895 if (optlen < sizeof(val))
3897 if (copy_from_user(&val, optval, sizeof(val)))
3901 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3904 po->has_vnet_hdr = !!val;
3910 case PACKET_TIMESTAMP:
3914 if (optlen != sizeof(val))
3916 if (copy_from_user(&val, optval, sizeof(val)))
3919 po->tp_tstamp = val;
3926 if (optlen != sizeof(val))
3928 if (copy_from_user(&val, optval, sizeof(val)))
3931 return fanout_add(sk, val & 0xffff, val >> 16);
3933 case PACKET_FANOUT_DATA:
3935 /* Paired with the WRITE_ONCE() in fanout_add() */
3936 if (!READ_ONCE(po->fanout))
3939 return fanout_set_data(po, optval, optlen);
3941 case PACKET_TX_HAS_OFF:
3945 if (optlen != sizeof(val))
3947 if (copy_from_user(&val, optval, sizeof(val)))
3951 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3954 po->tp_tx_has_off = !!val;
3960 case PACKET_QDISC_BYPASS:
3964 if (optlen != sizeof(val))
3966 if (copy_from_user(&val, optval, sizeof(val)))
3969 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3973 return -ENOPROTOOPT;
3977 static int packet_getsockopt(struct socket *sock, int level, int optname,
3978 char __user *optval, int __user *optlen)
3981 int val, lv = sizeof(val);
3982 struct sock *sk = sock->sk;
3983 struct packet_sock *po = pkt_sk(sk);
3985 union tpacket_stats_u st;
3986 struct tpacket_rollover_stats rstats;
3988 if (level != SOL_PACKET)
3989 return -ENOPROTOOPT;
3991 if (get_user(len, optlen))
3998 case PACKET_STATISTICS:
3999 spin_lock_bh(&sk->sk_receive_queue.lock);
4000 memcpy(&st, &po->stats, sizeof(st));
4001 memset(&po->stats, 0, sizeof(po->stats));
4002 spin_unlock_bh(&sk->sk_receive_queue.lock);
4004 if (po->tp_version == TPACKET_V3) {
4005 lv = sizeof(struct tpacket_stats_v3);
4006 st.stats3.tp_packets += st.stats3.tp_drops;
4009 lv = sizeof(struct tpacket_stats);
4010 st.stats1.tp_packets += st.stats1.tp_drops;
4015 case PACKET_AUXDATA:
4016 val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
4018 case PACKET_ORIGDEV:
4019 val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
4021 case PACKET_VNET_HDR:
4022 val = po->has_vnet_hdr;
4024 case PACKET_VERSION:
4025 val = po->tp_version;
4028 if (len > sizeof(int))
4030 if (len < sizeof(int))
4032 if (copy_from_user(&val, optval, len))
4036 val = sizeof(struct tpacket_hdr);
4039 val = sizeof(struct tpacket2_hdr);
4042 val = sizeof(struct tpacket3_hdr);
4048 case PACKET_RESERVE:
4049 val = po->tp_reserve;
4054 case PACKET_TIMESTAMP:
4055 val = po->tp_tstamp;
4059 ((u32)po->fanout->id |
4060 ((u32)po->fanout->type << 16) |
4061 ((u32)po->fanout->flags << 24)) :
4064 case PACKET_ROLLOVER_STATS:
4067 rstats.tp_all = atomic_long_read(&po->rollover->num);
4068 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4069 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4071 lv = sizeof(rstats);
4073 case PACKET_TX_HAS_OFF:
4074 val = po->tp_tx_has_off;
4076 case PACKET_QDISC_BYPASS:
4077 val = packet_use_direct_xmit(po);
4080 return -ENOPROTOOPT;
4085 if (put_user(len, optlen))
4087 if (copy_to_user(optval, data, len))
4093 #ifdef CONFIG_COMPAT
4094 static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
4095 char __user *optval, unsigned int optlen)
4097 struct packet_sock *po = pkt_sk(sock->sk);
4099 if (level != SOL_PACKET)
4100 return -ENOPROTOOPT;
4102 if (optname == PACKET_FANOUT_DATA &&
4103 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4104 optval = (char __user *)get_compat_bpf_fprog(optval);
4107 optlen = sizeof(struct sock_fprog);
4110 return packet_setsockopt(sock, level, optname, optval, optlen);
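/*
 * Netdevice notifier: on NETDEV_UNREGISTER drop any multicast/promisc
 * registrations and forget the device entirely; on unregister/down
 * unhook bound sockets and report ENETDOWN; when the device comes back
 * up, re-register the protocol hook for sockets bound to it.
 */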
4114 static int packet_notifier(struct notifier_block *this,
4115 unsigned long msg, void *ptr)
4118 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4119 struct net *net = dev_net(dev);
4122 sk_for_each_rcu(sk, &net->packet.sklist) {
4123 struct packet_sock *po = pkt_sk(sk);
4126 case NETDEV_UNREGISTER:
4128 packet_dev_mclist_delete(dev, &po->mclist);
4132 if (dev->ifindex == po->ifindex) {
4133 spin_lock(&po->bind_lock);
4135 __unregister_prot_hook(sk, false);
4136 sk->sk_err = ENETDOWN;
4137 if (!sock_flag(sk, SOCK_DEAD))
4138 sk->sk_error_report(sk);
4140 if (msg == NETDEV_UNREGISTER) {
4141 packet_cached_dev_reset(po);
4142 WRITE_ONCE(po->ifindex, -1);
4143 if (po->prot_hook.dev)
4144 dev_put(po->prot_hook.dev);
4145 po->prot_hook.dev = NULL;
4147 spin_unlock(&po->bind_lock);
4151 if (dev->ifindex == po->ifindex) {
4152 spin_lock(&po->bind_lock);
4154 register_prot_hook(sk);
4155 spin_unlock(&po->bind_lock);
4165 static int packet_ioctl(struct socket *sock, unsigned int cmd,
4168 struct sock *sk = sock->sk;
4173 int amount = sk_wmem_alloc_get(sk);
4175 return put_user(amount, (int __user *)arg);
4179 struct sk_buff *skb;
4182 spin_lock_bh(&sk->sk_receive_queue.lock);
4183 skb = skb_peek(&sk->sk_receive_queue);
4186 spin_unlock_bh(&sk->sk_receive_queue.lock);
4187 return put_user(amount, (int __user *)arg);
4190 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4192 return sock_get_timestampns(sk, (struct timespec __user *)arg);
4202 case SIOCGIFBRDADDR:
4203 case SIOCSIFBRDADDR:
4204 case SIOCGIFNETMASK:
4205 case SIOCSIFNETMASK:
4206 case SIOCGIFDSTADDR:
4207 case SIOCSIFDSTADDR:
4209 return inet_dgram_ops.ioctl(sock, cmd, arg);
4213 return -ENOIOCTLCMD;
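/*
 * poll(): on top of datagram_poll(), report readability when the RX
 * ring has a frame ready for userspace to consume and writability when
 * a TX ring frame is TP_STATUS_AVAILABLE for the application to fill.
 */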
4218 static unsigned int packet_poll(struct file *file, struct socket *sock,
4221 struct sock *sk = sock->sk;
4222 struct packet_sock *po = pkt_sk(sk);
4223 unsigned int mask = datagram_poll(file, sock, wait);
4225 spin_lock_bh(&sk->sk_receive_queue.lock);
4226 if (po->rx_ring.pg_vec) {
4227 if (!packet_previous_rx_frame(po, &po->rx_ring,
4229 mask |= POLLIN | POLLRDNORM;
4231 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
4233 spin_unlock_bh(&sk->sk_receive_queue.lock);
4234 spin_lock_bh(&sk->sk_write_queue.lock);
4235 if (po->tx_ring.pg_vec) {
4236 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4237 mask |= POLLOUT | POLLWRNORM;
4239 spin_unlock_bh(&sk->sk_write_queue.lock);
4244 /* Dirty? Well, I still have not learned a better way to account
4248 static void packet_mm_open(struct vm_area_struct *vma)
4250 struct file *file = vma->vm_file;
4251 struct socket *sock = file->private_data;
4252 struct sock *sk = sock->sk;
4255 atomic_inc(&pkt_sk(sk)->mapped);
4258 static void packet_mm_close(struct vm_area_struct *vma)
4260 struct file *file = vma->vm_file;
4261 struct socket *sock = file->private_data;
4262 struct sock *sk = sock->sk;
4265 atomic_dec(&pkt_sk(sk)->mapped);
4268 static const struct vm_operations_struct packet_mmap_ops = {
4269 .open = packet_mm_open,
4270 .close = packet_mm_close,
4273 static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4278 for (i = 0; i < len; i++) {
4279 if (likely(pg_vec[i].buffer)) {
4280 if (is_vmalloc_addr(pg_vec[i].buffer))
4281 vfree(pg_vec[i].buffer);
4283 free_pages((unsigned long)pg_vec[i].buffer,
4285 pg_vec[i].buffer = NULL;
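/*
 * Allocate one ring block: try physically contiguous pages first, fall
 * back to vmalloc, and as a last resort retry the page allocator
 * without __GFP_NORETRY so it may reclaim more aggressively.
 */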
4291 static char *alloc_one_pg_vec_page(unsigned long order)
4294 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4295 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4297 buffer = (char *) __get_free_pages(gfp_flags, order);
4301 /* __get_free_pages failed, fall back to vmalloc */
4302 buffer = vzalloc((1 << order) * PAGE_SIZE);
4306 /* vmalloc failed, let's dig into swap here */
4307 gfp_flags &= ~__GFP_NORETRY;
4308 buffer = (char *) __get_free_pages(gfp_flags, order);
4312 /* complete and utter failure */
4316 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4318 unsigned int block_nr = req->tp_block_nr;
4322 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4323 if (unlikely(!pg_vec))
4326 for (i = 0; i < block_nr; i++) {
4327 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
4328 if (unlikely(!pg_vec[i].buffer))
4329 goto out_free_pgvec;
4336 free_pg_vec(pg_vec, order, block_nr);
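/*
 * Create or tear down a memory-mapped RX/TX ring. The requested
 * geometry is sanity-checked (page-aligned blocks, aligned frames,
 * frames dividing blocks exactly), the page vector is allocated, the
 * socket is briefly unhooked from its protocol handler, and the new
 * ring is swapped in under pg_vec_lock. For orientation only, a minimal
 * userspace sketch of RX ring setup (sizing, error handling and the V3
 * retire-block timer are omitted):
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = { ... };
 *	void *ring;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */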
4341 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4342 int closing, int tx_ring)
4344 struct pgv *pg_vec = NULL;
4345 struct packet_sock *po = pkt_sk(sk);
4346 unsigned long *rx_owner_map = NULL;
4347 int was_running, order = 0;
4348 struct packet_ring_buffer *rb;
4349 struct sk_buff_head *rb_queue;
4352 /* Added to keep code churn minimal */
4353 struct tpacket_req *req = &req_u->req;
4355 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4356 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
4360 if (atomic_read(&po->mapped))
4362 if (packet_read_pending(rb))
4366 if (req->tp_block_nr) {
4367 unsigned int min_frame_size;
4369 /* Sanity tests and some calculations */
4371 if (unlikely(rb->pg_vec))
4374 switch (po->tp_version) {
4376 po->tp_hdrlen = TPACKET_HDRLEN;
4379 po->tp_hdrlen = TPACKET2_HDRLEN;
4382 po->tp_hdrlen = TPACKET3_HDRLEN;
4387 if (unlikely((int)req->tp_block_size <= 0))
4389 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4391 min_frame_size = po->tp_hdrlen + po->tp_reserve;
4392 if (po->tp_version >= TPACKET_V3 &&
4393 req->tp_block_size <
4394 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
4396 if (unlikely(req->tp_frame_size < min_frame_size))
4398 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
4401 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4402 if (unlikely(rb->frames_per_block == 0))
4404 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
4406 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4411 order = get_order(req->tp_block_size);
4412 pg_vec = alloc_pg_vec(req, order);
4413 if (unlikely(!pg_vec))
4415 switch (po->tp_version) {
4417 /* Block transmit is not supported yet */
4419 init_prb_bdqc(po, rb, pg_vec, req_u);
4421 struct tpacket_req3 *req3 = &req_u->req3;
4423 if (req3->tp_retire_blk_tov ||
4424 req3->tp_sizeof_priv ||
4425 req3->tp_feature_req_word) {
4427 goto out_free_pg_vec;
4433 rx_owner_map = bitmap_alloc(req->tp_frame_nr,
4434 GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
4436 goto out_free_pg_vec;
4444 if (unlikely(req->tp_frame_nr))
4449 /* Detach socket from network */
4450 spin_lock(&po->bind_lock);
4451 was_running = po->running;
4454 WRITE_ONCE(po->num, 0);
4455 __unregister_prot_hook(sk, false);
4457 spin_unlock(&po->bind_lock);
4462 mutex_lock(&po->pg_vec_lock);
4463 if (closing || atomic_read(&po->mapped) == 0) {
4465 spin_lock_bh(&rb_queue->lock);
4466 swap(rb->pg_vec, pg_vec);
4467 if (po->tp_version <= TPACKET_V2)
4468 swap(rb->rx_owner_map, rx_owner_map);
4469 rb->frame_max = (req->tp_frame_nr - 1);
4471 rb->frame_size = req->tp_frame_size;
4472 spin_unlock_bh(&rb_queue->lock);
4474 swap(rb->pg_vec_order, order);
4475 swap(rb->pg_vec_len, req->tp_block_nr);
4477 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4478 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4479 tpacket_rcv : packet_rcv;
4480 skb_queue_purge(rb_queue);
4481 if (atomic_read(&po->mapped))
4482 pr_err("packet_mmap: vma is busy: %d\n",
4483 atomic_read(&po->mapped));
4485 mutex_unlock(&po->pg_vec_lock);
4487 spin_lock(&po->bind_lock);
4489 WRITE_ONCE(po->num, num);
4490 register_prot_hook(sk);
4492 spin_unlock(&po->bind_lock);
4493 if (pg_vec && (po->tp_version > TPACKET_V2)) {
4494 /* Because we don't support block-based V3 on tx-ring */
4496 prb_shutdown_retire_blk_timer(po, rb_queue);
4501 bitmap_free(rx_owner_map);
4502 free_pg_vec(pg_vec, order, req->tp_block_nr);
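/*
 * mmap() the rings into the caller's address space. The VMA must cover
 * exactly the combined size of the RX and TX rings; every page of every
 * block is inserted with vm_insert_page(), and po->mapped is raised so
 * the rings cannot be resized or freed while they are mapped.
 */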
4508 static int packet_mmap(struct file *file, struct socket *sock,
4509 struct vm_area_struct *vma)
4511 struct sock *sk = sock->sk;
4512 struct packet_sock *po = pkt_sk(sk);
4513 unsigned long size, expected_size;
4514 struct packet_ring_buffer *rb;
4515 unsigned long start;
4522 mutex_lock(&po->pg_vec_lock);
4525 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4527 expected_size += rb->pg_vec_len
4533 if (expected_size == 0)
4536 size = vma->vm_end - vma->vm_start;
4537 if (size != expected_size)
4540 start = vma->vm_start;
4541 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4542 if (rb->pg_vec == NULL)
4545 for (i = 0; i < rb->pg_vec_len; i++) {
4547 void *kaddr = rb->pg_vec[i].buffer;
4550 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4551 page = pgv_to_page(kaddr);
4552 err = vm_insert_page(vma, start, page);
4561 atomic_inc(&po->mapped);
4562 vma->vm_ops = &packet_mmap_ops;
4566 mutex_unlock(&po->pg_vec_lock);
4570 static const struct proto_ops packet_ops_spkt = {
4571 .family = PF_PACKET,
4572 .owner = THIS_MODULE,
4573 .release = packet_release,
4574 .bind = packet_bind_spkt,
4575 .connect = sock_no_connect,
4576 .socketpair = sock_no_socketpair,
4577 .accept = sock_no_accept,
4578 .getname = packet_getname_spkt,
4579 .poll = datagram_poll,
4580 .ioctl = packet_ioctl,
4581 .listen = sock_no_listen,
4582 .shutdown = sock_no_shutdown,
4583 .setsockopt = sock_no_setsockopt,
4584 .getsockopt = sock_no_getsockopt,
4585 .sendmsg = packet_sendmsg_spkt,
4586 .recvmsg = packet_recvmsg,
4587 .mmap = sock_no_mmap,
4588 .sendpage = sock_no_sendpage,
4591 static const struct proto_ops packet_ops = {
4592 .family = PF_PACKET,
4593 .owner = THIS_MODULE,
4594 .release = packet_release,
4595 .bind = packet_bind,
4596 .connect = sock_no_connect,
4597 .socketpair = sock_no_socketpair,
4598 .accept = sock_no_accept,
4599 .getname = packet_getname,
4600 .poll = packet_poll,
4601 .ioctl = packet_ioctl,
4602 .listen = sock_no_listen,
4603 .shutdown = sock_no_shutdown,
4604 .setsockopt = packet_setsockopt,
4605 .getsockopt = packet_getsockopt,
4606 #ifdef CONFIG_COMPAT
4607 .compat_setsockopt = compat_packet_setsockopt,
4609 .sendmsg = packet_sendmsg,
4610 .recvmsg = packet_recvmsg,
4611 .mmap = packet_mmap,
4612 .sendpage = sock_no_sendpage,
4615 static const struct net_proto_family packet_family_ops = {
4616 .family = PF_PACKET,
4617 .create = packet_create,
4618 .owner = THIS_MODULE,
4621 static struct notifier_block packet_netdev_notifier = {
4622 .notifier_call = packet_notifier,
4625 #ifdef CONFIG_PROC_FS
4627 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
4630 struct net *net = seq_file_net(seq);
4633 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4636 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4638 struct net *net = seq_file_net(seq);
4639 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4642 static void packet_seq_stop(struct seq_file *seq, void *v)
4648 static int packet_seq_show(struct seq_file *seq, void *v)
4650 if (v == SEQ_START_TOKEN)
4651 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4653 struct sock *s = sk_entry(v);
4654 const struct packet_sock *po = pkt_sk(s);
4657 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
4659 refcount_read(&s->sk_refcnt),
4661 ntohs(READ_ONCE(po->num)),
4662 READ_ONCE(po->ifindex),
4664 atomic_read(&s->sk_rmem_alloc),
4665 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
4672 static const struct seq_operations packet_seq_ops = {
4673 .start = packet_seq_start,
4674 .next = packet_seq_next,
4675 .stop = packet_seq_stop,
4676 .show = packet_seq_show,
4679 static int packet_seq_open(struct inode *inode, struct file *file)
4681 return seq_open_net(inode, file, &packet_seq_ops,
4682 sizeof(struct seq_net_private));
4685 static const struct file_operations packet_seq_fops = {
4686 .owner = THIS_MODULE,
4687 .open = packet_seq_open,
4689 .llseek = seq_lseek,
4690 .release = seq_release_net,
4695 static int __net_init packet_net_init(struct net *net)
4697 mutex_init(&net->packet.sklist_lock);
4698 INIT_HLIST_HEAD(&net->packet.sklist);
4700 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
4706 static void __net_exit packet_net_exit(struct net *net)
4708 remove_proc_entry("packet", net->proc_net);
4711 static struct pernet_operations packet_net_ops = {
4712 .init = packet_net_init,
4713 .exit = packet_net_exit,
4717 static void __exit packet_exit(void)
4719 unregister_netdevice_notifier(&packet_netdev_notifier);
4720 unregister_pernet_subsys(&packet_net_ops);
4721 sock_unregister(PF_PACKET);
4722 proto_unregister(&packet_proto);
4725 static int __init packet_init(void)
4729 rc = proto_register(&packet_proto, 0);
4732 rc = sock_register(&packet_family_ops);
4735 rc = register_pernet_subsys(&packet_net_ops);
4738 rc = register_netdevice_notifier(&packet_netdev_notifier);
4745 unregister_pernet_subsys(&packet_net_ops);
4747 sock_unregister(PF_PACKET);
4749 proto_unregister(&packet_proto);
4754 module_init(packet_init);
4755 module_exit(packet_exit);
4756 MODULE_LICENSE("GPL");
4757 MODULE_ALIAS_NETPROTO(PF_PACKET);