1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * PACKET - implements raw packet sockets.
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
14 * Alan Cox : verify_area() now used correctly
15 * Alan Cox : new skbuff lists, look ma no backlogs!
16 * Alan Cox : tidied skbuff lists.
17 * Alan Cox : Now uses generic datagram routines I
18 * added. Also fixed the peek/read crash
19 * from all old Linux datagram code.
20 * Alan Cox : Uses the improved datagram code.
21 * Alan Cox : Added NULL's for socket options.
22 * Alan Cox : Re-commented the code.
23 * Alan Cox : Use new kernel side addressing
24 * Rob Janssen : Correct MTU usage.
25 * Dave Platt : Counter leaks caused by incorrect
26 * interrupt locking and some slightly
27 * dubious gcc output. Can you read
28 * compiler: it said _VOLATILE_
29 * Richard Kooijman : Timestamp fixes.
30 * Alan Cox : New buffers. Use sk->mac.raw.
31 * Alan Cox : sendmsg/recvmsg support.
32 * Alan Cox : Protocol setting support
33 * Alexey Kuznetsov : Untied from IPv4 stack.
34 * Cyrus Durgin : Fixed kerneld for kmod.
35 * Michal Ostrowski : Module initialization cleanup.
36 * Ulises Alonso : Frame number limit removal and
37 * packet_set_ring memory leak.
38 * Eric Biederman : Allow for > 8 byte hardware addresses.
39 * The convention is that longer addresses
40 * will simply extend the hardware address
41 * byte arrays at the end of sockaddr_ll
43 * Johann Baudy : Added TX RING.
44 * Chetan Loke : Implemented TPACKET_V3 block abstraction
46 * Copyright (C) 2011, <lokec@ccs.neu.edu>
49 #include <linux/types.h>
51 #include <linux/capability.h>
52 #include <linux/fcntl.h>
53 #include <linux/socket.h>
55 #include <linux/inet.h>
56 #include <linux/netdevice.h>
57 #include <linux/if_packet.h>
58 #include <linux/wireless.h>
59 #include <linux/kernel.h>
60 #include <linux/kmod.h>
61 #include <linux/slab.h>
62 #include <linux/vmalloc.h>
63 #include <net/net_namespace.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <linux/uaccess.h>
71 #include <asm/ioctls.h>
73 #include <asm/cacheflush.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77 #include <linux/poll.h>
78 #include <linux/module.h>
79 #include <linux/init.h>
80 #include <linux/mutex.h>
81 #include <linux/if_vlan.h>
82 #include <linux/virtio_net.h>
83 #include <linux/errqueue.h>
84 #include <linux/net_tstamp.h>
85 #include <linux/percpu.h>
87 #include <net/inet_common.h>
89 #include <linux/bpf.h>
90 #include <net/compat.h>
96 - if a device has no dev->hard_header routine, it adds and removes the ll header
97 inside itself. In this case the ll header is invisible outside of the device,
98 but higher levels should still reserve dev->hard_header_len.
99 Some devices are clever enough to reallocate the skb when the header
100 will not fit in the reserved space (tunnels); others are not so clever
102 - a packet socket receives packets with the ll header already pulled,
103 so SOCK_RAW should push it back.
108 Incoming, dev->hard_header!=NULL
109 mac_header -> ll header
112 Outgoing, dev->hard_header!=NULL
113 mac_header -> ll header
116 Incoming, dev->hard_header==NULL
117 mac_header -> UNKNOWN position. It very likely points to the ll
118 header. PPP does this, which is wrong, because it introduces
119 asymmetry between the rx and tx paths.
122 Outgoing, dev->hard_header==NULL
123 mac_header -> data. ll header is still not built!
127 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
133 dev->hard_header != NULL
134 mac_header -> ll header
137 dev->hard_header == NULL (ll header is added by device, we cannot control it)
141 We should set nh.raw on output to the correct position;
142 the packet classifier depends on it.
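Illustrative user-space sketch (assumes the usual packet(7) API; ETH_P_ALL is
just an example protocol) of the two flavours these header rules matter for:

	int fd_raw  = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));  // frames carry the ll header
	int fd_dgrm = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));  // ll header stripped, addr in sockaddr_ll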
145 /* Private packet socket structures. */
147 /* identical to struct packet_mreq except it has
148 * a longer address field.
150 struct packet_mreq_max {
152 unsigned short mr_type;
153 unsigned short mr_alen;
154 unsigned char mr_address[MAX_ADDR_LEN];
158 struct tpacket_hdr *h1;
159 struct tpacket2_hdr *h2;
160 struct tpacket3_hdr *h3;
164 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
165 int closing, int tx_ring);
167 #define V3_ALIGNMENT (8)
169 #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
171 #define BLK_PLUS_PRIV(sz_of_priv) \
172 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
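/* Resulting TPACKET_V3 block layout (each part V3_ALIGNMENT-aligned):
 *   [struct tpacket_block_desc]  BLK_HDR_LEN bytes, at offset 0
 *   [per-block private area]     tp_sizeof_priv bytes, at BLOCK_O2PRIV
 *   [frames]                     first frame at BLK_PLUS_PRIV(tp_sizeof_priv), i.e. BLOCK_O2FP
 */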
174 #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
175 #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
176 #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
177 #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
178 #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
179 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
180 #define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
183 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
184 struct packet_type *pt, struct net_device *orig_dev);
186 static void *packet_previous_frame(struct packet_sock *po,
187 struct packet_ring_buffer *rb,
189 static void packet_increment_head(struct packet_ring_buffer *buff);
190 static int prb_curr_blk_in_use(struct tpacket_block_desc *);
191 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
192 struct packet_sock *);
193 static void prb_retire_current_block(struct tpacket_kbdq_core *,
194 struct packet_sock *, unsigned int status);
195 static int prb_queue_frozen(struct tpacket_kbdq_core *);
196 static void prb_open_block(struct tpacket_kbdq_core *,
197 struct tpacket_block_desc *);
198 static void prb_retire_rx_blk_timer_expired(struct timer_list *);
199 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
200 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
201 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
202 struct tpacket3_hdr *);
203 static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
204 struct tpacket3_hdr *);
205 static void packet_flush_mclist(struct sock *sk);
206 static u16 packet_pick_tx_queue(struct sk_buff *skb);
208 struct packet_skb_cb {
210 struct sockaddr_pkt pkt;
212 /* Trick: alias skb original length with
213 * ll.sll_family and ll.sll_protocol in order to save room.
216 unsigned int origlen;
217 struct sockaddr_ll ll;
222 #define vio_le() virtio_legacy_is_little_endian()
224 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
226 #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
227 #define GET_PBLOCK_DESC(x, bid) \
228 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
229 #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
230 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
231 #define GET_NEXT_PRB_BLK_NUM(x) \
232 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
233 ((x)->kactive_blk_num+1) : 0)
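/* GET_NEXT_PRB_BLK_NUM wraps back to block 0 once the last block is reached. */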
235 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
236 static void __fanout_link(struct sock *sk, struct packet_sock *po);
238 static int packet_direct_xmit(struct sk_buff *skb)
240 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
243 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
245 struct net_device *dev;
248 dev = rcu_dereference(po->cached_dev);
256 static void packet_cached_dev_assign(struct packet_sock *po,
257 struct net_device *dev)
259 rcu_assign_pointer(po->cached_dev, dev);
262 static void packet_cached_dev_reset(struct packet_sock *po)
264 RCU_INIT_POINTER(po->cached_dev, NULL);
267 static bool packet_use_direct_xmit(const struct packet_sock *po)
269 /* Paired with WRITE_ONCE() in packet_setsockopt() */
270 return READ_ONCE(po->xmit) == packet_direct_xmit;
273 static u16 packet_pick_tx_queue(struct sk_buff *skb)
275 struct net_device *dev = skb->dev;
276 const struct net_device_ops *ops = dev->netdev_ops;
277 int cpu = raw_smp_processor_id();
281 skb->sender_cpu = cpu + 1;
283 skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
284 if (ops->ndo_select_queue) {
285 queue_index = ops->ndo_select_queue(dev, skb, NULL);
286 queue_index = netdev_cap_txqueue(dev, queue_index);
288 queue_index = netdev_pick_tx(dev, skb, NULL);
294 /* __register_prot_hook must be invoked through register_prot_hook
295 * or from a context in which asynchronous accesses to the packet
296 * socket are not possible (packet_create()).
298 static void __register_prot_hook(struct sock *sk)
300 struct packet_sock *po = pkt_sk(sk);
304 __fanout_link(sk, po);
306 dev_add_pack(&po->prot_hook);
313 static void register_prot_hook(struct sock *sk)
315 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
316 __register_prot_hook(sk);
319 /* If the sync parameter is true, we will temporarily drop
320 * the po->bind_lock and do a synchronize_net to make sure no
321 * asynchronous packet processing paths still refer to the elements
322 * of po->prot_hook. If the sync parameter is false, it is the
323 * caller's responsibility to take care of this.
325 static void __unregister_prot_hook(struct sock *sk, bool sync)
327 struct packet_sock *po = pkt_sk(sk);
329 lockdep_assert_held_once(&po->bind_lock);
334 __fanout_unlink(sk, po);
336 __dev_remove_pack(&po->prot_hook);
341 spin_unlock(&po->bind_lock);
343 spin_lock(&po->bind_lock);
347 static void unregister_prot_hook(struct sock *sk, bool sync)
349 struct packet_sock *po = pkt_sk(sk);
352 __unregister_prot_hook(sk, sync);
355 static inline struct page * __pure pgv_to_page(void *addr)
357 if (is_vmalloc_addr(addr))
358 return vmalloc_to_page(addr);
359 return virt_to_page(addr);
362 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
364 union tpacket_uhdr h;
366 /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */
369 switch (po->tp_version) {
371 WRITE_ONCE(h.h1->tp_status, status);
372 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
375 WRITE_ONCE(h.h2->tp_status, status);
376 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
379 WRITE_ONCE(h.h3->tp_status, status);
380 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
383 WARN(1, "TPACKET version not supported.\n");
390 static int __packet_get_status(const struct packet_sock *po, void *frame)
392 union tpacket_uhdr h;
396 /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */
399 switch (po->tp_version) {
401 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
402 return READ_ONCE(h.h1->tp_status);
404 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
405 return READ_ONCE(h.h2->tp_status);
407 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
408 return READ_ONCE(h.h3->tp_status);
410 WARN(1, "TPACKET version not supported.\n");
416 static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
419 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
422 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
423 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
424 return TP_STATUS_TS_RAW_HARDWARE;
426 if (ktime_to_timespec_cond(skb->tstamp, ts))
427 return TP_STATUS_TS_SOFTWARE;
432 static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
435 union tpacket_uhdr h;
439 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
443 switch (po->tp_version) {
445 h.h1->tp_sec = ts.tv_sec;
446 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
449 h.h2->tp_sec = ts.tv_sec;
450 h.h2->tp_nsec = ts.tv_nsec;
453 h.h3->tp_sec = ts.tv_sec;
454 h.h3->tp_nsec = ts.tv_nsec;
457 WARN(1, "TPACKET version not supported.\n");
461 /* one flush is safe, as both fields always lie on the same cacheline */
462 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
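/* Frame lookup: ring frames are laid out back to back inside each pg_vec
 * block, so frame 'position' lives in block (position / frames_per_block)
 * at byte offset (position % frames_per_block) * frame_size.
 */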
468 static void *packet_lookup_frame(const struct packet_sock *po,
469 const struct packet_ring_buffer *rb,
470 unsigned int position,
473 unsigned int pg_vec_pos, frame_offset;
474 union tpacket_uhdr h;
476 pg_vec_pos = position / rb->frames_per_block;
477 frame_offset = position % rb->frames_per_block;
479 h.raw = rb->pg_vec[pg_vec_pos].buffer +
480 (frame_offset * rb->frame_size);
482 if (status != __packet_get_status(po, h.raw))
488 static void *packet_current_frame(struct packet_sock *po,
489 struct packet_ring_buffer *rb,
492 return packet_lookup_frame(po, rb, rb->head, status);
495 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
497 del_timer_sync(&pkc->retire_blk_timer);
500 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
501 struct sk_buff_head *rb_queue)
503 struct tpacket_kbdq_core *pkc;
505 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
507 spin_lock_bh(&rb_queue->lock);
508 pkc->delete_blk_timer = 1;
509 spin_unlock_bh(&rb_queue->lock);
511 prb_del_retire_blk_timer(pkc);
514 static void prb_setup_retire_blk_timer(struct packet_sock *po)
516 struct tpacket_kbdq_core *pkc;
518 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
519 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
521 pkc->retire_blk_timer.expires = jiffies;
524 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
525 int blk_size_in_bytes)
527 struct net_device *dev;
528 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
529 struct ethtool_link_ksettings ecmd;
533 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
534 if (unlikely(!dev)) {
536 return DEFAULT_PRB_RETIRE_TOV;
538 err = __ethtool_get_link_ksettings(dev, &ecmd);
542 * If the link speed is so slow, you don't really
543 * need to worry about perf anyway.
545 if (ecmd.base.speed < SPEED_1000 ||
546 ecmd.base.speed == SPEED_UNKNOWN) {
547 return DEFAULT_PRB_RETIRE_TOV;
550 div = ecmd.base.speed / 1000;
553 return DEFAULT_PRB_RETIRE_TOV;
555 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
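/* Rough fill-time arithmetic: a 1 MiB block is 8 Mbit, so on a 1 Gbit/s
 * link (div == 1) it fills in about 8 ms; the retire timeout is derived
 * from this estimate.
 */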
567 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
568 union tpacket_req_u *req_u)
570 p1->feature_req_word = req_u->req3.tp_feature_req_word;
573 static void init_prb_bdqc(struct packet_sock *po,
574 struct packet_ring_buffer *rb,
576 union tpacket_req_u *req_u)
578 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
579 struct tpacket_block_desc *pbd;
581 memset(p1, 0x0, sizeof(*p1));
583 p1->knxt_seq_num = 1;
585 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
586 p1->pkblk_start = pg_vec[0].buffer;
587 p1->kblk_size = req_u->req3.tp_block_size;
588 p1->knum_blocks = req_u->req3.tp_block_nr;
589 p1->hdrlen = po->tp_hdrlen;
590 p1->version = po->tp_version;
591 p1->last_kactive_blk_num = 0;
592 po->stats.stats3.tp_freeze_q_cnt = 0;
593 if (req_u->req3.tp_retire_blk_tov)
594 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
596 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
597 req_u->req3.tp_block_size);
598 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
599 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
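/* max_frame_len: the largest amount of packet data (tpacket3_hdr included)
 * that fits in one block after the block descriptor and the private area;
 * tpacket_rcv() clamps oversized packets to this.
 */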
601 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
602 prb_init_ft_ops(p1, req_u);
603 prb_setup_retire_blk_timer(po);
604 prb_open_block(p1, pbd);
607 /* Do NOT update the last_blk_num first.
608 * Assumes sk_buff_head lock is held.
610 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
612 mod_timer(&pkc->retire_blk_timer,
613 jiffies + pkc->tov_in_jiffies);
614 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
619 * 1) We refresh the timer only when we open a block.
620 * By doing this we don't waste cycles refreshing the timer
621 * on a packet-by-packet basis.
623 * With a 1MB block-size, on a 1Gbps line, it will take
624 * i) ~8 ms to fill a block + ii) memcpy etc.
625 * In this cut we are not accounting for the memcpy time.
627 * So, if the user sets the 'tmo' to 10ms then the timer
628 * will never fire while the block is still getting filled
629 * (which is what we want). However, the user could choose
630 * to close a block early and that's fine.
632 * But when the timer does fire, we check whether or not to refresh it.
633 * Since the tmo granularity is in msecs, it is not too expensive
634 * to refresh the timer, let's say every '8' msecs.
635 * Either the user can set the 'tmo' or we can derive it based on
636 * a) line-speed and b) block-size.
637 * prb_calc_retire_blk_tmo() calculates the tmo.
640 static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
642 struct packet_sock *po =
643 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
644 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
646 struct tpacket_block_desc *pbd;
648 spin_lock(&po->sk.sk_receive_queue.lock);
650 frozen = prb_queue_frozen(pkc);
651 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
653 if (unlikely(pkc->delete_blk_timer))
656 /* We only need to plug the race when the block is partially filled.
658 * lock(); increment BLOCK_NUM_PKTS; unlock()
659 * copy_bits() is in progress ...
660 * timer fires on other cpu:
661 * we can't retire the current block because copy_bits is still in progress.
665 if (BLOCK_NUM_PKTS(pbd)) {
666 while (atomic_read(&pkc->blk_fill_in_prog)) {
667 /* Waiting for skb_copy_bits to finish... */
672 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
674 if (!BLOCK_NUM_PKTS(pbd)) {
675 /* An empty block. Just refresh the timer. */
678 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
679 if (!prb_dispatch_next_block(pkc, po))
684 /* Case 1. Queue was frozen because user-space was lagging behind.
687 if (prb_curr_blk_in_use(pbd)) {
689 * Ok, user-space is still behind.
690 * So just refresh the timer.
694 /* Case 2. The queue was frozen, user-space caught up,
695 * now the link went idle && the timer fired.
696 * We don't have a block to close, so we open this
697 * block and restart the timer.
698 * Opening a block thaws the queue and restarts the timer;
699 * thawing/timer-refresh is a side effect.
701 prb_open_block(pkc, pbd);
708 _prb_refresh_rx_retire_blk_timer(pkc);
711 spin_unlock(&po->sk.sk_receive_queue.lock);
714 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
715 struct tpacket_block_desc *pbd1, __u32 status)
717 /* Flush everything minus the block header */
719 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
724 /* Skip the block header (we know the header WILL fit in 4K) */
727 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
728 for (; start < end; start += PAGE_SIZE)
729 flush_dcache_page(pgv_to_page(start));
734 /* Now update the block status. */
736 BLOCK_STATUS(pbd1) = status;
738 /* Flush the block header */
740 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
742 flush_dcache_page(pgv_to_page(start));
752 * 2) Increment active_blk_num
754 * Note: We DON'T refresh the timer on purpose,
755 * because almost always the next block will be opened.
757 static void prb_close_block(struct tpacket_kbdq_core *pkc1,
758 struct tpacket_block_desc *pbd1,
759 struct packet_sock *po, unsigned int stat)
761 __u32 status = TP_STATUS_USER | stat;
763 struct tpacket3_hdr *last_pkt;
764 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
765 struct sock *sk = &po->sk;
767 if (atomic_read(&po->tp_drops))
768 status |= TP_STATUS_LOSING;
770 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
771 last_pkt->tp_next_offset = 0;
773 /* Get the ts of the last pkt */
774 if (BLOCK_NUM_PKTS(pbd1)) {
775 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
776 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
778 /* Ok, we tmo'd - so get the current time.
780 * It shouldn't really happen as we don't close empty
781 * blocks. See prb_retire_rx_blk_timer_expired().
785 h1->ts_last_pkt.ts_sec = ts.tv_sec;
786 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
791 /* Flush the block */
792 prb_flush_block(pkc1, pbd1, status);
794 sk->sk_data_ready(sk);
796 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
799 static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
801 pkc->reset_pending_on_curr_blk = 0;
805 * Side effect of opening a block:
807 * 1) prb_queue is thawed.
808 * 2) retire_blk_timer is refreshed.
811 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
812 struct tpacket_block_desc *pbd1)
815 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
819 /* We could have just memset this but we will lose the
820 * flexibility of making the priv area sticky
823 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
824 BLOCK_NUM_PKTS(pbd1) = 0;
825 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
829 h1->ts_first_pkt.ts_sec = ts.tv_sec;
830 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
832 pkc1->pkblk_start = (char *)pbd1;
833 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
835 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
836 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
838 pbd1->version = pkc1->version;
839 pkc1->prev = pkc1->nxt_offset;
840 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
842 prb_thaw_queue(pkc1);
843 _prb_refresh_rx_retire_blk_timer(pkc1);
849 * Queue freeze logic:
850 * 1) Assume tp_block_nr = 8 blocks.
851 * 2) At time 't0', user opens Rx ring.
852 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
853 * 4) user-space is either sleeping or processing block '0'.
854 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
855 * it will close block-7, loop around and try to fill block '0'.
857 * __packet_lookup_frame_in_block
858 * prb_retire_current_block()
859 * prb_dispatch_next_block()
860 * |->(BLOCK_STATUS == USER) evaluates to true
861 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
862 * 6) Now there are two cases:
863 * 6.1) Link goes idle right after the queue is frozen.
864 * But remember, the last open_block() refreshed the timer.
865 * When this timer expires, it will refresh itself so that we can
866 * re-open block-0 in near future.
867 * 6.2) Link is busy and keeps on receiving packets. This is a simple
868 * case and __packet_lookup_frame_in_block will check if block-0
869 * is free and can now be re-used.
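*
* In both cases the queue is thawed by prb_open_block(), which clears
* reset_pending_on_curr_blk as a side effect (see prb_thaw_queue()).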
871 static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
872 struct packet_sock *po)
874 pkc->reset_pending_on_curr_blk = 1;
875 po->stats.stats3.tp_freeze_q_cnt++;
878 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
881 * If the next block is free then we will dispatch it
882 * and return a good offset.
883 * Else, we will freeze the queue.
884 * So, caller must check the return value.
886 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
887 struct packet_sock *po)
889 struct tpacket_block_desc *pbd;
893 /* 1. Get current block num */
894 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
896 /* 2. If this block is currently in_use then freeze the queue */
897 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
898 prb_freeze_queue(pkc, po);
904 * open this block and return the offset where the first packet
905 * needs to get stored.
907 prb_open_block(pkc, pbd);
908 return (void *)pkc->nxt_offset;
911 static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
912 struct packet_sock *po, unsigned int status)
914 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
916 /* retire/close the current block */
917 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
919 * Plug the case where copy_bits() is in progress on
920 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
921 * have space to copy the pkt in the current block and
922 * called prb_retire_current_block()
924 * We don't need to worry about the TMO case because
925 * the timer-handler already handled this case.
927 if (!(status & TP_STATUS_BLK_TMO)) {
928 while (atomic_read(&pkc->blk_fill_in_prog)) {
929 /* Waiting for skb_copy_bits to finish... */
933 prb_close_block(pkc, pbd, po, status);
938 static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
940 return TP_STATUS_USER & BLOCK_STATUS(pbd);
943 static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
945 return pkc->reset_pending_on_curr_blk;
948 static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
949 __releases(&pkc->blk_fill_in_prog_lock)
951 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
952 atomic_dec(&pkc->blk_fill_in_prog);
955 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
956 struct tpacket3_hdr *ppd)
958 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
961 static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
962 struct tpacket3_hdr *ppd)
964 ppd->hv1.tp_rxhash = 0;
967 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
968 struct tpacket3_hdr *ppd)
970 if (skb_vlan_tag_present(pkc->skb)) {
971 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
972 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
973 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
975 ppd->hv1.tp_vlan_tci = 0;
976 ppd->hv1.tp_vlan_tpid = 0;
977 ppd->tp_status = TP_STATUS_AVAILABLE;
981 static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
982 struct tpacket3_hdr *ppd)
984 ppd->hv1.tp_padding = 0;
985 prb_fill_vlan_info(pkc, ppd);
987 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
988 prb_fill_rxhash(pkc, ppd);
990 prb_clear_rxhash(pkc, ppd);
993 static void prb_fill_curr_block(char *curr,
994 struct tpacket_kbdq_core *pkc,
995 struct tpacket_block_desc *pbd,
997 __acquires(&pkc->blk_fill_in_prog_lock)
999 struct tpacket3_hdr *ppd;
1001 ppd = (struct tpacket3_hdr *)curr;
1002 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1004 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1005 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1006 BLOCK_NUM_PKTS(pbd) += 1;
1007 atomic_inc(&pkc->blk_fill_in_prog);
1008 prb_run_all_ft_ops(pkc, ppd);
1011 /* Assumes caller has the sk->rx_queue.lock */
1012 static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1013 struct sk_buff *skb,
1017 struct tpacket_kbdq_core *pkc;
1018 struct tpacket_block_desc *pbd;
1021 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1022 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1024 /* Queue is frozen when user space is lagging behind */
1025 if (prb_queue_frozen(pkc)) {
1027 * Check if the last block, which caused the queue to freeze,
1028 * is still in_use by user-space.
1030 if (prb_curr_blk_in_use(pbd)) {
1031 /* Can't record this packet */
1035 * Ok, the block was released by user-space.
1036 * Now let's open that block.
1037 * opening a block also thaws the queue.
1038 * Thawing is a side effect.
1040 prb_open_block(pkc, pbd);
1045 curr = pkc->nxt_offset;
1047 end = (char *)pbd + pkc->kblk_size;
1049 /* first try the current block */
1050 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1051 prb_fill_curr_block(curr, pkc, pbd, len);
1052 return (void *)curr;
1055 /* Ok, close the current block */
1056 prb_retire_current_block(pkc, po, 0);
1058 /* Now, try to dispatch the next block */
1059 curr = (char *)prb_dispatch_next_block(pkc, po);
1061 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1062 prb_fill_curr_block(curr, pkc, pbd, len);
1063 return (void *)curr;
1067 * No free blocks are available. User-space hasn't caught up yet.
1068 * Queue was just frozen and now this packet will get dropped.
1073 static void *packet_current_rx_frame(struct packet_sock *po,
1074 struct sk_buff *skb,
1075 int status, unsigned int len)
1078 switch (po->tp_version) {
1081 curr = packet_lookup_frame(po, &po->rx_ring,
1082 po->rx_ring.head, status);
1085 return __packet_lookup_frame_in_block(po, skb, len);
1087 WARN(1, "TPACKET version not supported\n");
1093 static void *prb_lookup_block(const struct packet_sock *po,
1094 const struct packet_ring_buffer *rb,
1098 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1099 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1101 if (status != BLOCK_STATUS(pbd))
1106 static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1109 if (rb->prb_bdqc.kactive_blk_num)
1110 prev = rb->prb_bdqc.kactive_blk_num-1;
1112 prev = rb->prb_bdqc.knum_blocks-1;
1116 /* Assumes caller has held the rx_queue.lock */
1117 static void *__prb_previous_block(struct packet_sock *po,
1118 struct packet_ring_buffer *rb,
1121 unsigned int previous = prb_previous_blk_num(rb);
1122 return prb_lookup_block(po, rb, previous, status);
1125 static void *packet_previous_rx_frame(struct packet_sock *po,
1126 struct packet_ring_buffer *rb,
1129 if (po->tp_version <= TPACKET_V2)
1130 return packet_previous_frame(po, rb, status);
1132 return __prb_previous_block(po, rb, status);
1135 static void packet_increment_rx_head(struct packet_sock *po,
1136 struct packet_ring_buffer *rb)
1138 switch (po->tp_version) {
1141 return packet_increment_head(rb);
1144 WARN(1, "TPACKET version not supported.\n");
1150 static void *packet_previous_frame(struct packet_sock *po,
1151 struct packet_ring_buffer *rb,
1154 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1155 return packet_lookup_frame(po, rb, previous, status);
1158 static void packet_increment_head(struct packet_ring_buffer *buff)
1160 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1163 static void packet_inc_pending(struct packet_ring_buffer *rb)
1165 this_cpu_inc(*rb->pending_refcnt);
1168 static void packet_dec_pending(struct packet_ring_buffer *rb)
1170 this_cpu_dec(*rb->pending_refcnt);
1173 static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1175 unsigned int refcnt = 0;
1178 /* We don't use pending refcount in rx_ring. */
1179 if (rb->pending_refcnt == NULL)
1182 for_each_possible_cpu(cpu)
1183 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1188 static int packet_alloc_pending(struct packet_sock *po)
1190 po->rx_ring.pending_refcnt = NULL;
1192 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1193 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1199 static void packet_free_pending(struct packet_sock *po)
1201 free_percpu(po->tx_ring.pending_refcnt);
1204 #define ROOM_POW_OFF 2
1205 #define ROOM_NONE 0x0
1206 #define ROOM_LOW 0x1
1207 #define ROOM_NORMAL 0x2
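/* ROOM_NONE: no frame/block is free; ROOM_LOW: only the very next slot is
 * free; ROOM_NORMAL: roughly 1/(1 << ROOM_POW_OFF) (a quarter) of the ring
 * is still free.
 */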
1209 static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
1213 len = READ_ONCE(po->rx_ring.frame_max) + 1;
1214 idx = READ_ONCE(po->rx_ring.head);
1216 idx += len >> pow_off;
1219 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1222 static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
1226 len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1227 idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
1229 idx += len >> pow_off;
1232 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1235 static int __packet_rcv_has_room(const struct packet_sock *po,
1236 const struct sk_buff *skb)
1238 const struct sock *sk = &po->sk;
1239 int ret = ROOM_NONE;
1241 if (po->prot_hook.func != tpacket_rcv) {
1242 int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1243 int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1244 - (skb ? skb->truesize : 0);
1246 if (avail > (rcvbuf >> ROOM_POW_OFF))
1254 if (po->tp_version == TPACKET_V3) {
1255 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1257 else if (__tpacket_v3_has_room(po, 0))
1260 if (__tpacket_has_room(po, ROOM_POW_OFF))
1262 else if (__tpacket_has_room(po, 0))
1269 static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1273 ret = __packet_rcv_has_room(po, skb);
1274 pressure = ret != ROOM_NORMAL;
1276 if (READ_ONCE(po->pressure) != pressure)
1277 WRITE_ONCE(po->pressure, pressure);
1282 static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1284 if (READ_ONCE(po->pressure) &&
1285 __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1286 WRITE_ONCE(po->pressure, 0);
1289 static void packet_sock_destruct(struct sock *sk)
1291 skb_queue_purge(&sk->sk_error_queue);
1293 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1294 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1296 if (!sock_flag(sk, SOCK_DEAD)) {
1297 pr_err("Attempt to release alive packet socket: %p\n", sk);
1301 sk_refcnt_debug_dec(sk);
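/* Rollover heuristic: remember the rxhash of the last ROLLOVER_HLEN packets;
 * a flow whose hash already fills more than half of that history is "huge".
 * Under ROOM_LOW pressure only such huge flows are rolled over to another
 * socket (see fanout_demux_rollover()).
 */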
1304 static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1306 u32 *history = po->rollover->history;
1310 rxhash = skb_get_hash(skb);
1311 for (i = 0; i < ROLLOVER_HLEN; i++)
1312 if (READ_ONCE(history[i]) == rxhash)
1315 victim = prandom_u32() % ROLLOVER_HLEN;
1317 /* Avoid dirtying the cache line if possible */
1318 if (READ_ONCE(history[victim]) != rxhash)
1319 WRITE_ONCE(history[victim], rxhash);
1321 return count > (ROLLOVER_HLEN >> 1);
1324 static unsigned int fanout_demux_hash(struct packet_fanout *f,
1325 struct sk_buff *skb,
1328 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1331 static unsigned int fanout_demux_lb(struct packet_fanout *f,
1332 struct sk_buff *skb,
1335 unsigned int val = atomic_inc_return(&f->rr_cur);
1340 static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1341 struct sk_buff *skb,
1344 return smp_processor_id() % num;
1347 static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1348 struct sk_buff *skb,
1351 return prandom_u32_max(num);
1354 static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1355 struct sk_buff *skb,
1356 unsigned int idx, bool try_self,
1359 struct packet_sock *po, *po_next, *po_skip = NULL;
1360 unsigned int i, j, room = ROOM_NONE;
1362 po = pkt_sk(f->arr[idx]);
1365 room = packet_rcv_has_room(po, skb);
1366 if (room == ROOM_NORMAL ||
1367 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1372 i = j = min_t(int, po->rollover->sock, num - 1);
1374 po_next = pkt_sk(f->arr[i]);
1375 if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
1376 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1378 po->rollover->sock = i;
1379 atomic_long_inc(&po->rollover->num);
1380 if (room == ROOM_LOW)
1381 atomic_long_inc(&po->rollover->num_huge);
1389 atomic_long_inc(&po->rollover->num_failed);
1393 static unsigned int fanout_demux_qm(struct packet_fanout *f,
1394 struct sk_buff *skb,
1397 return skb_get_queue_mapping(skb) % num;
1400 static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1401 struct sk_buff *skb,
1404 struct bpf_prog *prog;
1405 unsigned int ret = 0;
1408 prog = rcu_dereference(f->bpf_prog);
1410 ret = bpf_prog_run_clear_cb(prog, skb) % num;
1416 static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1418 return f->flags & (flag >> 8);
1421 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1422 struct packet_type *pt, struct net_device *orig_dev)
1424 struct packet_fanout *f = pt->af_packet_priv;
1425 unsigned int num = READ_ONCE(f->num_members);
1426 struct net *net = read_pnet(&f->net);
1427 struct packet_sock *po;
1430 if (!net_eq(dev_net(dev), net) || !num) {
1435 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1436 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1441 case PACKET_FANOUT_HASH:
1443 idx = fanout_demux_hash(f, skb, num);
1445 case PACKET_FANOUT_LB:
1446 idx = fanout_demux_lb(f, skb, num);
1448 case PACKET_FANOUT_CPU:
1449 idx = fanout_demux_cpu(f, skb, num);
1451 case PACKET_FANOUT_RND:
1452 idx = fanout_demux_rnd(f, skb, num);
1454 case PACKET_FANOUT_QM:
1455 idx = fanout_demux_qm(f, skb, num);
1457 case PACKET_FANOUT_ROLLOVER:
1458 idx = fanout_demux_rollover(f, skb, 0, false, num);
1460 case PACKET_FANOUT_CBPF:
1461 case PACKET_FANOUT_EBPF:
1462 idx = fanout_demux_bpf(f, skb, num);
1466 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1467 idx = fanout_demux_rollover(f, skb, idx, true, num);
1469 po = pkt_sk(f->arr[idx]);
1470 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1473 DEFINE_MUTEX(fanout_mutex);
1474 EXPORT_SYMBOL_GPL(fanout_mutex);
1475 static LIST_HEAD(fanout_list);
1476 static u16 fanout_next_id;
1478 static void __fanout_link(struct sock *sk, struct packet_sock *po)
1480 struct packet_fanout *f = po->fanout;
1482 spin_lock(&f->lock);
1483 f->arr[f->num_members] = sk;
1486 if (f->num_members == 1)
1487 dev_add_pack(&f->prot_hook);
1488 spin_unlock(&f->lock);
1491 static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1493 struct packet_fanout *f = po->fanout;
1496 spin_lock(&f->lock);
1497 for (i = 0; i < f->num_members; i++) {
1498 if (f->arr[i] == sk)
1501 BUG_ON(i >= f->num_members);
1502 f->arr[i] = f->arr[f->num_members - 1];
1504 if (f->num_members == 0)
1505 __dev_remove_pack(&f->prot_hook);
1506 spin_unlock(&f->lock);
1509 static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1511 if (sk->sk_family != PF_PACKET)
1514 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1517 static void fanout_init_data(struct packet_fanout *f)
1520 case PACKET_FANOUT_LB:
1521 atomic_set(&f->rr_cur, 0);
1523 case PACKET_FANOUT_CBPF:
1524 case PACKET_FANOUT_EBPF:
1525 RCU_INIT_POINTER(f->bpf_prog, NULL);
1530 static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1532 struct bpf_prog *old;
1534 spin_lock(&f->lock);
1535 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1536 rcu_assign_pointer(f->bpf_prog, new);
1537 spin_unlock(&f->lock);
1541 bpf_prog_destroy(old);
1545 static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1548 struct bpf_prog *new;
1549 struct sock_fprog fprog;
1552 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1554 if (len != sizeof(fprog))
1556 if (copy_from_user(&fprog, data, len))
1559 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1563 __fanout_set_data_bpf(po->fanout, new);
1567 static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1570 struct bpf_prog *new;
1573 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1575 if (len != sizeof(fd))
1577 if (copy_from_user(&fd, data, len))
1580 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
1582 return PTR_ERR(new);
1584 __fanout_set_data_bpf(po->fanout, new);
1588 static int fanout_set_data(struct packet_sock *po, char __user *data,
1591 switch (po->fanout->type) {
1592 case PACKET_FANOUT_CBPF:
1593 return fanout_set_data_cbpf(po, data, len);
1594 case PACKET_FANOUT_EBPF:
1595 return fanout_set_data_ebpf(po, data, len);
1601 static void fanout_release_data(struct packet_fanout *f)
1604 case PACKET_FANOUT_CBPF:
1605 case PACKET_FANOUT_EBPF:
1606 __fanout_set_data_bpf(f, NULL);
1610 static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1612 struct packet_fanout *f;
1614 list_for_each_entry(f, &fanout_list, list) {
1615 if (f->id == candidate_id &&
1616 read_pnet(&f->net) == sock_net(sk)) {
1623 static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1625 u16 id = fanout_next_id;
1628 if (__fanout_id_is_free(sk, id)) {
1630 fanout_next_id = id + 1;
1635 } while (id != fanout_next_id);
1640 static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1642 struct packet_rollover *rollover = NULL;
1643 struct packet_sock *po = pkt_sk(sk);
1644 struct packet_fanout *f, *match;
1645 u8 type = type_flags & 0xff;
1646 u8 flags = type_flags >> 8;
1650 case PACKET_FANOUT_ROLLOVER:
1651 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1653 case PACKET_FANOUT_HASH:
1654 case PACKET_FANOUT_LB:
1655 case PACKET_FANOUT_CPU:
1656 case PACKET_FANOUT_RND:
1657 case PACKET_FANOUT_QM:
1658 case PACKET_FANOUT_CBPF:
1659 case PACKET_FANOUT_EBPF:
1665 mutex_lock(&fanout_mutex);
1671 if (type == PACKET_FANOUT_ROLLOVER ||
1672 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1674 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1677 atomic_long_set(&rollover->num, 0);
1678 atomic_long_set(&rollover->num_huge, 0);
1679 atomic_long_set(&rollover->num_failed, 0);
1682 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1687 if (!fanout_find_new_id(sk, &id)) {
1691 /* ephemeral flag for the first socket in the group: drop it */
1692 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1696 list_for_each_entry(f, &fanout_list, list) {
1698 read_pnet(&f->net) == sock_net(sk)) {
1704 if (match && match->flags != flags)
1708 match = kzalloc(sizeof(*match), GFP_KERNEL);
1711 write_pnet(&match->net, sock_net(sk));
1714 match->flags = flags;
1715 INIT_LIST_HEAD(&match->list);
1716 spin_lock_init(&match->lock);
1717 refcount_set(&match->sk_ref, 0);
1718 fanout_init_data(match);
1719 match->prot_hook.type = po->prot_hook.type;
1720 match->prot_hook.dev = po->prot_hook.dev;
1721 match->prot_hook.func = packet_rcv_fanout;
1722 match->prot_hook.af_packet_priv = match;
1723 match->prot_hook.af_packet_net = read_pnet(&match->net);
1724 match->prot_hook.id_match = match_fanout_group;
1725 list_add(&match->list, &fanout_list);
1729 spin_lock(&po->bind_lock);
1731 match->type == type &&
1732 match->prot_hook.type == po->prot_hook.type &&
1733 match->prot_hook.dev == po->prot_hook.dev) {
1735 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1736 __dev_remove_pack(&po->prot_hook);
1738 /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
1739 WRITE_ONCE(po->fanout, match);
1741 po->rollover = rollover;
1743 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
1744 __fanout_link(sk, po);
1748 spin_unlock(&po->bind_lock);
1750 if (err && !refcount_read(&match->sk_ref)) {
1751 list_del(&match->list);
1757 mutex_unlock(&fanout_mutex);
1761 /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1762 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1763 * It is the responsibility of the caller to call fanout_release_data() and
1764 * free the returned packet_fanout (after synchronize_net())
1766 static struct packet_fanout *fanout_release(struct sock *sk)
1768 struct packet_sock *po = pkt_sk(sk);
1769 struct packet_fanout *f;
1771 mutex_lock(&fanout_mutex);
1776 if (refcount_dec_and_test(&f->sk_ref))
1781 mutex_unlock(&fanout_mutex);
1786 static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1787 struct sk_buff *skb)
1789 /* Earlier code assumed this would be a VLAN pkt, double-check
1790 * this now that we have the actual packet in hand. We can only
1791 * do this check on Ethernet devices.
1793 if (unlikely(dev->type != ARPHRD_ETHER))
1796 skb_reset_mac_header(skb);
1797 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1800 static const struct proto_ops packet_ops;
1802 static const struct proto_ops packet_ops_spkt;
1804 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1805 struct packet_type *pt, struct net_device *orig_dev)
1808 struct sockaddr_pkt *spkt;
1811 * When we registered the protocol we saved the socket in the data
1812 * field for just this event.
1815 sk = pt->af_packet_priv;
1818 * Yank back the headers [hope the device set this
1819 * right or kerboom...]
1821 * Incoming packets have the ll header pulled, so push it back.
1824 * For outgoing ones skb->data == skb_mac_header(skb),
1825 * so this procedure is a no-op.
1828 if (skb->pkt_type == PACKET_LOOPBACK)
1831 if (!net_eq(dev_net(dev), sock_net(sk)))
1834 skb = skb_share_check(skb, GFP_ATOMIC);
1838 /* drop any routing info */
1841 /* drop conntrack reference */
1844 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1846 skb_push(skb, skb->data - skb_mac_header(skb));
1849 * The SOCK_PACKET socket receives _all_ frames.
1852 spkt->spkt_family = dev->type;
1853 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1854 spkt->spkt_protocol = skb->protocol;
1857 * Charge the memory to the socket. This is done specifically
1858 * to prevent sockets from using up all the memory.
1861 if (sock_queue_rcv_skb(sk, skb) == 0)
1870 static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1874 if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1875 sock->type == SOCK_RAW) {
1876 skb_reset_mac_header(skb);
1877 skb->protocol = dev_parse_header_protocol(skb);
1880 /* Move network header to the right position for VLAN tagged packets */
1881 if (likely(skb->dev->type == ARPHRD_ETHER) &&
1882 eth_type_vlan(skb->protocol) &&
1883 vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
1884 skb_set_network_header(skb, depth);
1886 skb_probe_transport_header(skb);
1890 * Output a raw packet to a device layer. This bypasses all the other
1891 * protocol layers and you must therefore supply it with a complete frame
1894 static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1897 struct sock *sk = sock->sk;
1898 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1899 struct sk_buff *skb = NULL;
1900 struct net_device *dev;
1901 struct sockcm_cookie sockc;
1907 * Get and verify the address.
1911 if (msg->msg_namelen < sizeof(struct sockaddr))
1913 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1914 proto = saddr->spkt_protocol;
1916 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1919 * Find the device first to size check it
1922 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1925 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1931 if (!(dev->flags & IFF_UP))
1935 * You may not queue a frame bigger than the mtu. This is the lowest level
1936 * raw protocol and you must do your own fragmentation at this level.
1939 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1940 if (!netif_supports_nofcs(dev)) {
1941 err = -EPROTONOSUPPORT;
1944 extra_len = 4; /* We're doing our own CRC */
1948 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1952 size_t reserved = LL_RESERVED_SPACE(dev);
1953 int tlen = dev->needed_tailroom;
1954 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1957 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1960 /* FIXME: Save some space for broken drivers that write a hard
1961 * header at transmission time by themselves. PPP is the notable
1962 * one here. This should really be fixed at the driver level.
1964 skb_reserve(skb, reserved);
1965 skb_reset_network_header(skb);
1967 /* Try to align data part correctly */
1972 skb_reset_network_header(skb);
1974 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1980 if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
1984 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1985 !packet_extra_vlan_len_allowed(dev, skb)) {
1990 sockcm_init(&sockc, sk);
1991 if (msg->msg_controllen) {
1992 err = sock_cmsg_send(sk, msg, &sockc);
1997 skb->protocol = proto;
1999 skb->priority = sk->sk_priority;
2000 skb->mark = sk->sk_mark;
2001 skb->tstamp = sockc.transmit_time;
2003 skb_setup_tx_timestamp(skb, sockc.tsflags);
2005 if (unlikely(extra_len == 4))
2008 packet_parse_headers(skb, sock);
2010 dev_queue_xmit(skb);
2021 static unsigned int run_filter(struct sk_buff *skb,
2022 const struct sock *sk,
2025 struct sk_filter *filter;
2028 filter = rcu_dereference(sk->sk_filter);
2030 res = bpf_prog_run_clear_cb(filter->prog, skb);
2036 static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2039 struct virtio_net_hdr vnet_hdr;
2041 if (*len < sizeof(vnet_hdr))
2043 *len -= sizeof(vnet_hdr);
2045 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
2048 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2052 * This function does lazy skb cloning in the hope that most packets
2053 * are discarded by BPF.
2055 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
2056 * and skb->cb are mangled. It works because (and until) packets
2057 * falling here are owned by the current CPU. Output packets are cloned
2058 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2059 * sequentially, so if we return the skb to its original state on exit,
2060 * we will not harm anyone.
2063 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2064 struct packet_type *pt, struct net_device *orig_dev)
2067 struct sockaddr_ll *sll;
2068 struct packet_sock *po;
2069 u8 *skb_head = skb->data;
2070 int skb_len = skb->len;
2071 unsigned int snaplen, res;
2072 bool is_drop_n_account = false;
2074 if (skb->pkt_type == PACKET_LOOPBACK)
2077 sk = pt->af_packet_priv;
2080 if (!net_eq(dev_net(dev), sock_net(sk)))
2085 if (dev->header_ops) {
2086 /* The device has an explicit notion of ll header,
2087 * exported to higher levels.
2089 * Otherwise, the device hides details of its frame
2090 * structure, so that the corresponding packet head is
2091 * never delivered to the user.
2093 if (sk->sk_type != SOCK_DGRAM)
2094 skb_push(skb, skb->data - skb_mac_header(skb));
2095 else if (skb->pkt_type == PACKET_OUTGOING) {
2096 /* Special case: outgoing packets have ll header at head */
2097 skb_pull(skb, skb_network_offset(skb));
2103 res = run_filter(skb, sk, snaplen);
2105 goto drop_n_restore;
2109 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2112 if (skb_shared(skb)) {
2113 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2117 if (skb_head != skb->data) {
2118 skb->data = skb_head;
2125 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2127 sll = &PACKET_SKB_CB(skb)->sa.ll;
2128 sll->sll_hatype = dev->type;
2129 sll->sll_pkttype = skb->pkt_type;
2130 if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
2131 sll->sll_ifindex = orig_dev->ifindex;
2133 sll->sll_ifindex = dev->ifindex;
2135 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2137 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2138 * Use their space for storing the original skb length.
2140 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2142 if (pskb_trim(skb, snaplen))
2145 skb_set_owner_r(skb, sk);
2149 /* drop conntrack reference */
2152 spin_lock(&sk->sk_receive_queue.lock);
2153 po->stats.stats1.tp_packets++;
2154 sock_skb_set_dropcount(sk, skb);
2155 __skb_queue_tail(&sk->sk_receive_queue, skb);
2156 spin_unlock(&sk->sk_receive_queue.lock);
2157 sk->sk_data_ready(sk);
2161 is_drop_n_account = true;
2162 atomic_inc(&po->tp_drops);
2163 atomic_inc(&sk->sk_drops);
2166 if (skb_head != skb->data && skb_shared(skb)) {
2167 skb->data = skb_head;
2171 if (!is_drop_n_account)
2178 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2179 struct packet_type *pt, struct net_device *orig_dev)
2182 struct packet_sock *po;
2183 struct sockaddr_ll *sll;
2184 union tpacket_uhdr h;
2185 u8 *skb_head = skb->data;
2186 int skb_len = skb->len;
2187 unsigned int snaplen, res;
2188 unsigned long status = TP_STATUS_USER;
2189 unsigned short macoff, hdrlen;
2190 unsigned int netoff;
2191 struct sk_buff *copy_skb = NULL;
2194 bool is_drop_n_account = false;
2195 unsigned int slot_id = 0;
2196 bool do_vnet = false;
2198 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2199 * We may add members to them up to the current aligned size without forcing
2200 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2202 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2203 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2205 if (skb->pkt_type == PACKET_LOOPBACK)
2208 sk = pt->af_packet_priv;
2211 if (!net_eq(dev_net(dev), sock_net(sk)))
2214 if (dev->header_ops) {
2215 if (sk->sk_type != SOCK_DGRAM)
2216 skb_push(skb, skb->data - skb_mac_header(skb));
2217 else if (skb->pkt_type == PACKET_OUTGOING) {
2218 /* Special case: outgoing packets have ll header at head */
2219 skb_pull(skb, skb_network_offset(skb));
2225 res = run_filter(skb, sk, snaplen);
2227 goto drop_n_restore;
2229 /* If we are flooded, just give up */
2230 if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2231 atomic_inc(&po->tp_drops);
2232 goto drop_n_restore;
2235 if (skb->ip_summed == CHECKSUM_PARTIAL)
2236 status |= TP_STATUS_CSUMNOTREADY;
2237 else if (skb->pkt_type != PACKET_OUTGOING &&
2238 skb_csum_unnecessary(skb))
2239 status |= TP_STATUS_CSUM_VALID;
2244 if (sk->sk_type == SOCK_DGRAM) {
2245 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2248 unsigned int maclen = skb_network_offset(skb);
2249 netoff = TPACKET_ALIGN(po->tp_hdrlen +
2250 (maclen < 16 ? 16 : maclen)) +
2252 if (po->has_vnet_hdr) {
2253 netoff += sizeof(struct virtio_net_hdr);
2256 macoff = netoff - maclen;
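/* netoff/macoff: offsets within the ring frame where the copied packet's
 * network header and mac header will start, leaving room for the tpacket
 * header (and optional virtio_net_hdr) in front.
 */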
2258 if (netoff > USHRT_MAX) {
2259 atomic_inc(&po->tp_drops);
2260 goto drop_n_restore;
2262 if (po->tp_version <= TPACKET_V2) {
2263 if (macoff + snaplen > po->rx_ring.frame_size) {
2264 if (po->copy_thresh &&
2265 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2266 if (skb_shared(skb)) {
2267 copy_skb = skb_clone(skb, GFP_ATOMIC);
2269 copy_skb = skb_get(skb);
2270 skb_head = skb->data;
2273 memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
2274 sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
2275 skb_set_owner_r(copy_skb, sk);
2278 snaplen = po->rx_ring.frame_size - macoff;
2279 if ((int)snaplen < 0) {
2284 } else if (unlikely(macoff + snaplen >
2285 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2288 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2289 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2290 snaplen, nval, macoff);
2292 if (unlikely((int)snaplen < 0)) {
2294 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2298 spin_lock(&sk->sk_receive_queue.lock);
2299 h.raw = packet_current_rx_frame(po, skb,
2300 TP_STATUS_KERNEL, (macoff+snaplen));
2302 goto drop_n_account;
2304 if (po->tp_version <= TPACKET_V2) {
2305 slot_id = po->rx_ring.head;
2306 if (test_bit(slot_id, po->rx_ring.rx_owner_map))
2307 goto drop_n_account;
2308 __set_bit(slot_id, po->rx_ring.rx_owner_map);
2312 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2313 sizeof(struct virtio_net_hdr),
2314 vio_le(), true, 0)) {
2315 if (po->tp_version == TPACKET_V3)
2316 prb_clear_blk_fill_status(&po->rx_ring);
2317 goto drop_n_account;
2320 if (po->tp_version <= TPACKET_V2) {
2321 packet_increment_rx_head(po, &po->rx_ring);
2323 * LOSING will be reported till you read the stats,
2324 * because it's COR - Clear On Read.
2325 * Anyway, this is done for V1/V2 only, as V3 doesn't need it.
2328 if (atomic_read(&po->tp_drops))
2329 status |= TP_STATUS_LOSING;
2332 po->stats.stats1.tp_packets++;
2334 status |= TP_STATUS_COPY;
2335 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2337 spin_unlock(&sk->sk_receive_queue.lock);
2339 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2341 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
2342 getnstimeofday(&ts);
2344 status |= ts_status;
2346 switch (po->tp_version) {
2348 h.h1->tp_len = skb->len;
2349 h.h1->tp_snaplen = snaplen;
2350 h.h1->tp_mac = macoff;
2351 h.h1->tp_net = netoff;
2352 h.h1->tp_sec = ts.tv_sec;
2353 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2354 hdrlen = sizeof(*h.h1);
2357 h.h2->tp_len = skb->len;
2358 h.h2->tp_snaplen = snaplen;
2359 h.h2->tp_mac = macoff;
2360 h.h2->tp_net = netoff;
2361 h.h2->tp_sec = ts.tv_sec;
2362 h.h2->tp_nsec = ts.tv_nsec;
2363 if (skb_vlan_tag_present(skb)) {
2364 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2365 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2366 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2368 h.h2->tp_vlan_tci = 0;
2369 h.h2->tp_vlan_tpid = 0;
2371 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2372 hdrlen = sizeof(*h.h2);
2375 /* tp_next_offset and vlan are already populated above,
2376 * so DON'T clear those fields here.
2378 h.h3->tp_status |= status;
2379 h.h3->tp_len = skb->len;
2380 h.h3->tp_snaplen = snaplen;
2381 h.h3->tp_mac = macoff;
2382 h.h3->tp_net = netoff;
2383 h.h3->tp_sec = ts.tv_sec;
2384 h.h3->tp_nsec = ts.tv_nsec;
2385 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2386 hdrlen = sizeof(*h.h3);
2392 sll = h.raw + TPACKET_ALIGN(hdrlen);
2393 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2394 sll->sll_family = AF_PACKET;
2395 sll->sll_hatype = dev->type;
2396 sll->sll_protocol = skb->protocol;
2397 sll->sll_pkttype = skb->pkt_type;
2398 if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
2399 sll->sll_ifindex = orig_dev->ifindex;
2401 sll->sll_ifindex = dev->ifindex;
2405 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2406 if (po->tp_version <= TPACKET_V2) {
2409 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2412 for (start = h.raw; start < end; start += PAGE_SIZE)
2413 flush_dcache_page(pgv_to_page(start));
2418 if (po->tp_version <= TPACKET_V2) {
2419 spin_lock(&sk->sk_receive_queue.lock);
2420 __packet_set_status(po, h.raw, status);
2421 __clear_bit(slot_id, po->rx_ring.rx_owner_map);
2422 spin_unlock(&sk->sk_receive_queue.lock);
2423 sk->sk_data_ready(sk);
2424 } else if (po->tp_version == TPACKET_V3) {
2425 prb_clear_blk_fill_status(&po->rx_ring);
2429 if (skb_head != skb->data && skb_shared(skb)) {
2430 skb->data = skb_head;
2434 if (!is_drop_n_account)
2441 spin_unlock(&sk->sk_receive_queue.lock);
2442 atomic_inc(&po->tp_drops);
2443 is_drop_n_account = true;
2445 sk->sk_data_ready(sk);
2446 kfree_skb(copy_skb);
2447 goto drop_n_restore;
2450 static void tpacket_destruct_skb(struct sk_buff *skb)
2452 struct packet_sock *po = pkt_sk(skb->sk);
2454 if (likely(po->tx_ring.pg_vec)) {
2458 ph = skb_zcopy_get_nouarg(skb);
2459 packet_dec_pending(&po->tx_ring);
2461 ts = __packet_set_timestamp(po, ph, skb);
2462 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2464 if (!packet_read_pending(&po->tx_ring))
2465 complete(&po->skb_completion);
2471 static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
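/* If the device is asked to fill in the checksum, hdr_len must cover at
 * least the 16-bit checksum field at csum_start + csum_offset; grow
 * hdr_len if the header under-reports it.
 */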
2473 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2474 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2475 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2476 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2477 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2478 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2479 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2481 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2487 static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2488 struct virtio_net_hdr *vnet_hdr)
2490 if (*len < sizeof(*vnet_hdr))
2492 *len -= sizeof(*vnet_hdr);
2494 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
2497 return __packet_snd_vnet_parse(vnet_hdr, *len);
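/* Illustrative userspace sketch (assumes PACKET_VNET_HDR was enabled with
 * setsockopt() on a SOCK_RAW socket): every packet written to the socket must
 * then start with a little-endian struct virtio_net_hdr that passes the
 * checks above, e.g. for a plain, non-GSO frame:
 *
 *	struct virtio_net_hdr vh = { .gso_type = VIRTIO_NET_HDR_GSO_NONE };
 *	struct iovec iov[2] = {
 *		{ &vh, sizeof(vh) },
 *		{ frame, frame_len },
 *	};
 *	writev(fd, iov, 2);
 */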
2500 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2501 void *frame, struct net_device *dev, void *data, int tp_len,
2502 __be16 proto, unsigned char *addr, int hlen, int copylen,
2503 const struct sockcm_cookie *sockc)
2505 union tpacket_uhdr ph;
2506 int to_write, offset, len, nr_frags, len_max;
2507 struct socket *sock = po->sk.sk_socket;
2513 skb->protocol = proto;
2515 skb->priority = po->sk.sk_priority;
2516 skb->mark = po->sk.sk_mark;
2517 skb->tstamp = sockc->transmit_time;
2518 skb_setup_tx_timestamp(skb, sockc->tsflags);
2519 skb_zcopy_set_nouarg(skb, ph.raw);
2521 skb_reserve(skb, hlen);
2522 skb_reset_network_header(skb);
2526 if (sock->type == SOCK_DGRAM) {
2527 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2529 if (unlikely(err < 0))
2531 } else if (copylen) {
2532 int hdrlen = min_t(int, copylen, tp_len);
2534 skb_push(skb, dev->hard_header_len);
2535 skb_put(skb, copylen - dev->hard_header_len);
2536 err = skb_store_bits(skb, 0, data, hdrlen);
2539 if (!dev_validate_header(dev, skb->data, hdrlen))
2546 offset = offset_in_page(data);
2547 len_max = PAGE_SIZE - offset;
2548 len = ((to_write > len_max) ? len_max : to_write);
2550 skb->data_len = to_write;
2551 skb->len += to_write;
2552 skb->truesize += to_write;
2553 refcount_add(to_write, &po->sk.sk_wmem_alloc);
2555 while (likely(to_write)) {
2556 nr_frags = skb_shinfo(skb)->nr_frags;
2558 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2559 pr_err("Packet exceed the number of skb frags(%lu)\n",
2564 page = pgv_to_page(data);
2566 flush_dcache_page(page);
2568 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2571 len_max = PAGE_SIZE;
2572 len = ((to_write > len_max) ? len_max : to_write);
2575 packet_parse_headers(skb, sock);
2580 static int tpacket_parse_header(struct packet_sock *po, void *frame,
2581 int size_max, void **data)
2583 union tpacket_uhdr ph;
2588 switch (po->tp_version) {
2590 if (ph.h3->tp_next_offset != 0) {
2591 pr_warn_once("variable-sized slot not supported");
2594 tp_len = ph.h3->tp_len;
2597 tp_len = ph.h2->tp_len;
2600 tp_len = ph.h1->tp_len;
2603 if (unlikely(tp_len > size_max)) {
2604 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2608 if (unlikely(po->tp_tx_has_off)) {
2609 int off_min, off_max;
2611 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2612 off_max = po->tx_ring.frame_size - tp_len;
2613 if (po->sk.sk_type == SOCK_DGRAM) {
2614 switch (po->tp_version) {
2616 off = ph.h3->tp_net;
2619 off = ph.h2->tp_net;
2622 off = ph.h1->tp_net;
2626 switch (po->tp_version) {
2628 off = ph.h3->tp_mac;
2631 off = ph.h2->tp_mac;
2634 off = ph.h1->tp_mac;
2638 if (unlikely((off < off_min) || (off_max < off)))
2641 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2644 *data = frame + off;
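/* Illustrative layout note (hedged, TPACKET_V2 assumed): unless
 * PACKET_TX_HAS_OFF is set, the packet data in a TX ring slot is expected at
 * a fixed offset right behind the slot header, roughly:
 *
 *	void *data = frame + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *
 * With PACKET_TX_HAS_OFF, userspace supplies the offset in tp_net
 * (SOCK_DGRAM) or tp_mac (SOCK_RAW), bounded by the off_min/off_max checks
 * above.
 */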
2648 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2650 struct sk_buff *skb = NULL;
2651 struct net_device *dev;
2652 struct virtio_net_hdr *vnet_hdr = NULL;
2653 struct sockcm_cookie sockc;
2655 int err, reserve = 0;
2657 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2658 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2659 unsigned char *addr = NULL;
2660 int tp_len, size_max;
2663 int status = TP_STATUS_AVAILABLE;
2664 int hlen, tlen, copylen = 0;
2667 mutex_lock(&po->pg_vec_lock);
2669 /* The packet_sendmsg() check on tx_ring.pg_vec was lockless,
2670 * so we need to confirm it under the protection of pg_vec_lock.
2672 if (unlikely(!po->tx_ring.pg_vec)) {
2676 if (likely(saddr == NULL)) {
2677 dev = packet_cached_dev_get(po);
2678 proto = READ_ONCE(po->num);
2681 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2683 if (msg->msg_namelen < (saddr->sll_halen
2684 + offsetof(struct sockaddr_ll,
2687 proto = saddr->sll_protocol;
2688 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2689 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2690 if (dev && msg->msg_namelen < dev->addr_len +
2691 offsetof(struct sockaddr_ll, sll_addr))
2693 addr = saddr->sll_addr;
2698 if (unlikely(dev == NULL))
2701 if (unlikely(!(dev->flags & IFF_UP)))
2704 sockcm_init(&sockc, &po->sk);
2705 if (msg->msg_controllen) {
2706 err = sock_cmsg_send(&po->sk, msg, &sockc);
2711 if (po->sk.sk_socket->type == SOCK_RAW)
2712 reserve = dev->hard_header_len;
2713 size_max = po->tx_ring.frame_size
2714 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2716 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
2717 size_max = dev->mtu + reserve + VLAN_HLEN;
2719 reinit_completion(&po->skb_completion);
2722 ph = packet_current_frame(po, &po->tx_ring,
2723 TP_STATUS_SEND_REQUEST);
2724 if (unlikely(ph == NULL)) {
2725 if (need_wait && skb) {
2726 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2727 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2729 err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2733 /* check for additional frames */
2738 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2742 status = TP_STATUS_SEND_REQUEST;
2743 hlen = LL_RESERVED_SPACE(dev);
2744 tlen = dev->needed_tailroom;
2745 if (po->has_vnet_hdr) {
2747 data += sizeof(*vnet_hdr);
2748 tp_len -= sizeof(*vnet_hdr);
2750 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2754 copylen = __virtio16_to_cpu(vio_le(),
2757 copylen = max_t(int, copylen, dev->hard_header_len);
2758 skb = sock_alloc_send_skb(&po->sk,
2759 hlen + tlen + sizeof(struct sockaddr_ll) +
2760 (copylen - dev->hard_header_len),
2763 if (unlikely(skb == NULL)) {
2764 /* we assume the socket was initially writeable ... */
2765 if (likely(len_sum > 0))
2769 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2770 addr, hlen, copylen, &sockc);
2771 if (likely(tp_len >= 0) &&
2772 tp_len > dev->mtu + reserve &&
2773 !po->has_vnet_hdr &&
2774 !packet_extra_vlan_len_allowed(dev, skb))
2777 if (unlikely(tp_len < 0)) {
2780 __packet_set_status(po, ph,
2781 TP_STATUS_AVAILABLE);
2782 packet_increment_head(&po->tx_ring);
2786 status = TP_STATUS_WRONG_FORMAT;
2792 if (po->has_vnet_hdr) {
2793 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2797 virtio_net_hdr_set_proto(skb, vnet_hdr);
2800 skb->destructor = tpacket_destruct_skb;
2801 __packet_set_status(po, ph, TP_STATUS_SENDING);
2802 packet_inc_pending(&po->tx_ring);
2804 status = TP_STATUS_SEND_REQUEST;
2805 /* Paired with WRITE_ONCE() in packet_setsockopt() */
2806 err = READ_ONCE(po->xmit)(skb);
2807 if (unlikely(err != 0)) {
2809 err = net_xmit_errno(err);
2810 if (err && __packet_get_status(po, ph) ==
2811 TP_STATUS_AVAILABLE) {
2812 /* skb was destructed already */
2817 * skb was dropped but not destructed yet;
2818 * let's treat it like congestion or err < 0
2822 packet_increment_head(&po->tx_ring);
2824 } while (likely((ph != NULL) ||
2825 /* Note: packet_read_pending() might be slow if we have
2826 * to call it, as it is a per-CPU variable, but in the fast path
2827 * we already short-circuit the loop with the first
2828 * condition and luckily don't have to take that path
2831 (need_wait && packet_read_pending(&po->tx_ring))));
2837 __packet_set_status(po, ph, status);
2842 mutex_unlock(&po->pg_vec_lock);
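/* Illustrative userspace TX-ring sketch (assumes a mapped PACKET_TX_RING,
 * TPACKET_V2, and tp_block_size a multiple of tp_frame_size so slots are
 * contiguous; error handling omitted): fill a slot, mark it, then kick the
 * kernel with a zero-length send, which lands in tpacket_snd() above:
 *
 *	struct tpacket2_hdr *hdr = ring + slot * req.tp_frame_size;
 *	void *data = (char *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *
 *	memcpy(data, frame, frame_len);
 *	hdr->tp_len = frame_len;
 *	__atomic_store_n(&hdr->tp_status, TP_STATUS_SEND_REQUEST,
 *			 __ATOMIC_RELEASE);
 *	send(fd, NULL, 0, 0);
 */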
2846 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2847 size_t reserve, size_t len,
2848 size_t linear, int noblock,
2851 struct sk_buff *skb;
2853 /* Under a page? Don't bother with paged skb. */
2854 if (prepad + len < PAGE_SIZE || !linear)
2857 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2862 skb_reserve(skb, reserve);
2863 skb_put(skb, linear);
2864 skb->data_len = len - linear;
2865 skb->len += len - linear;
2870 static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2872 struct sock *sk = sock->sk;
2873 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2874 struct sk_buff *skb;
2875 struct net_device *dev;
2877 unsigned char *addr = NULL;
2878 int err, reserve = 0;
2879 struct sockcm_cookie sockc;
2880 struct virtio_net_hdr vnet_hdr = { 0 };
2882 struct packet_sock *po = pkt_sk(sk);
2883 bool has_vnet_hdr = false;
2884 int hlen, tlen, linear;
2888 * Get and verify the address.
2891 if (likely(saddr == NULL)) {
2892 dev = packet_cached_dev_get(po);
2893 proto = READ_ONCE(po->num);
2896 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2898 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2900 proto = saddr->sll_protocol;
2901 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2902 if (sock->type == SOCK_DGRAM) {
2903 if (dev && msg->msg_namelen < dev->addr_len +
2904 offsetof(struct sockaddr_ll, sll_addr))
2906 addr = saddr->sll_addr;
2911 if (unlikely(dev == NULL))
2914 if (unlikely(!(dev->flags & IFF_UP)))
2917 sockcm_init(&sockc, sk);
2918 sockc.mark = sk->sk_mark;
2919 if (msg->msg_controllen) {
2920 err = sock_cmsg_send(sk, msg, &sockc);
2925 if (sock->type == SOCK_RAW)
2926 reserve = dev->hard_header_len;
2927 if (po->has_vnet_hdr) {
2928 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2931 has_vnet_hdr = true;
2934 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2935 if (!netif_supports_nofcs(dev)) {
2936 err = -EPROTONOSUPPORT;
2939 extra_len = 4; /* We're doing our own CRC */
2943 if (!vnet_hdr.gso_type &&
2944 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2948 hlen = LL_RESERVED_SPACE(dev);
2949 tlen = dev->needed_tailroom;
2950 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2951 linear = max(linear, min_t(int, len, dev->hard_header_len));
2952 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
2953 msg->msg_flags & MSG_DONTWAIT, &err);
2957 skb_reset_network_header(skb);
2960 if (sock->type == SOCK_DGRAM) {
2961 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
2962 if (unlikely(offset < 0))
2964 } else if (reserve) {
2965 skb_reserve(skb, -reserve);
2966 if (len < reserve + sizeof(struct ipv6hdr) &&
2967 dev->min_header_len != dev->hard_header_len)
2968 skb_reset_network_header(skb);
2971 /* Returns -EFAULT on error */
2972 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
2976 if ((sock->type == SOCK_RAW &&
2977 !dev_validate_header(dev, skb->data, len)) || !skb->len) {
2982 skb_setup_tx_timestamp(skb, sockc.tsflags);
2984 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
2985 !packet_extra_vlan_len_allowed(dev, skb)) {
2990 skb->protocol = proto;
2992 skb->priority = sk->sk_priority;
2993 skb->mark = sockc.mark;
2994 skb->tstamp = sockc.transmit_time;
2996 if (unlikely(extra_len == 4))
2999 packet_parse_headers(skb, sock);
3002 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
3005 len += sizeof(vnet_hdr);
3006 virtio_net_hdr_set_proto(skb, &vnet_hdr);
3009 /* Paired with WRITE_ONCE() in packet_setsockopt() */
3010 err = READ_ONCE(po->xmit)(skb);
3011 if (unlikely(err != 0)) {
3013 err = net_xmit_errno(err);
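/* Illustrative userspace sketch for the non-ring path above (hypothetical
 * values): on a SOCK_DGRAM packet socket the kernel builds the link-layer
 * header from the destination in sockaddr_ll, while SOCK_RAW expects the
 * caller to supply the full frame in the payload:
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_IP),	// example protocol
 *		.sll_ifindex  = ifindex,
 *		.sll_halen    = ETH_ALEN,
 *	};
 *	memcpy(sll.sll_addr, dest_mac, ETH_ALEN);
 *	sendto(fd, payload, payload_len, 0,
 *	       (struct sockaddr *)&sll, sizeof(sll));
 */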
3031 static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
3033 struct sock *sk = sock->sk;
3034 struct packet_sock *po = pkt_sk(sk);
3036 if (po->tx_ring.pg_vec)
3037 return tpacket_snd(po, msg);
3039 return packet_snd(sock, msg, len);
3043 * Close a PACKET socket. This is fairly simple. We immediately go
3044 * to 'closed' state and remove our protocol entry in the device list.
3047 static int packet_release(struct socket *sock)
3049 struct sock *sk = sock->sk;
3050 struct packet_sock *po;
3051 struct packet_fanout *f;
3053 union tpacket_req_u req_u;
3061 mutex_lock(&net->packet.sklist_lock);
3062 sk_del_node_init_rcu(sk);
3063 mutex_unlock(&net->packet.sklist_lock);
3066 sock_prot_inuse_add(net, sk->sk_prot, -1);
3069 spin_lock(&po->bind_lock);
3070 unregister_prot_hook(sk, false);
3071 packet_cached_dev_reset(po);
3073 if (po->prot_hook.dev) {
3074 dev_put(po->prot_hook.dev);
3075 po->prot_hook.dev = NULL;
3077 spin_unlock(&po->bind_lock);
3079 packet_flush_mclist(sk);
3082 if (po->rx_ring.pg_vec) {
3083 memset(&req_u, 0, sizeof(req_u));
3084 packet_set_ring(sk, &req_u, 1, 0);
3087 if (po->tx_ring.pg_vec) {
3088 memset(&req_u, 0, sizeof(req_u));
3089 packet_set_ring(sk, &req_u, 1, 1);
3093 f = fanout_release(sk);
3097 kfree(po->rollover);
3099 fanout_release_data(f);
3103 * Now the socket is dead. No more input will appear.
3110 skb_queue_purge(&sk->sk_receive_queue);
3111 packet_free_pending(po);
3112 sk_refcnt_debug_release(sk);
3119 * Attach a packet hook.
3122 static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3125 struct packet_sock *po = pkt_sk(sk);
3126 struct net_device *dev_curr;
3129 struct net_device *dev = NULL;
3131 bool unlisted = false;
3134 spin_lock(&po->bind_lock);
3146 dev = dev_get_by_name_rcu(sock_net(sk), name);
3151 } else if (ifindex) {
3152 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3162 proto_curr = po->prot_hook.type;
3163 dev_curr = po->prot_hook.dev;
3165 need_rehook = proto_curr != proto || dev_curr != dev;
3170 /* prevents packet_notifier() from calling
3171 * register_prot_hook()
3173 WRITE_ONCE(po->num, 0);
3174 __unregister_prot_hook(sk, true);
3176 dev_curr = po->prot_hook.dev;
3178 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3182 BUG_ON(po->running);
3183 WRITE_ONCE(po->num, proto);
3184 po->prot_hook.type = proto;
3186 if (unlikely(unlisted)) {
3188 po->prot_hook.dev = NULL;
3189 WRITE_ONCE(po->ifindex, -1);
3190 packet_cached_dev_reset(po);
3192 po->prot_hook.dev = dev;
3193 WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
3194 packet_cached_dev_assign(po, dev);
3200 if (proto == 0 || !need_rehook)
3203 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3204 register_prot_hook(sk);
3206 sk->sk_err = ENETDOWN;
3207 if (!sock_flag(sk, SOCK_DEAD))
3208 sk->sk_error_report(sk);
3213 spin_unlock(&po->bind_lock);
3219 * Bind a packet socket to a device
3222 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3225 struct sock *sk = sock->sk;
3226 char name[sizeof(uaddr->sa_data) + 1];
3232 if (addr_len != sizeof(struct sockaddr))
3234 /* uaddr->sa_data comes from user space; it is not guaranteed to be
3237 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3238 name[sizeof(uaddr->sa_data)] = 0;
3240 return packet_do_bind(sk, name, 0, 0);
3243 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3245 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3246 struct sock *sk = sock->sk;
3252 if (addr_len < sizeof(struct sockaddr_ll))
3254 if (sll->sll_family != AF_PACKET)
3257 return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol);
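/* Illustrative userspace sketch: binding restricts the socket to one device
 * and (re)arms the protocol hook set up in packet_do_bind() above, e.g.:
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),	// hypothetical device
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */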
3260 static struct proto packet_proto = {
3262 .owner = THIS_MODULE,
3263 .obj_size = sizeof(struct packet_sock),
3267 * Create a packet socket (SOCK_RAW, SOCK_DGRAM or SOCK_PACKET).
3270 static int packet_create(struct net *net, struct socket *sock, int protocol,
3274 struct packet_sock *po;
3275 __be16 proto = (__force __be16)protocol; /* weird, but documented */
3278 if (!ns_capable(net->user_ns, CAP_NET_RAW))
3280 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3281 sock->type != SOCK_PACKET)
3282 return -ESOCKTNOSUPPORT;
3284 sock->state = SS_UNCONNECTED;
3287 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3291 sock->ops = &packet_ops;
3292 if (sock->type == SOCK_PACKET)
3293 sock->ops = &packet_ops_spkt;
3295 sock_init_data(sock, sk);
3298 init_completion(&po->skb_completion);
3299 sk->sk_family = PF_PACKET;
3301 po->xmit = dev_queue_xmit;
3303 err = packet_alloc_pending(po);
3307 packet_cached_dev_reset(po);
3309 sk->sk_destruct = packet_sock_destruct;
3310 sk_refcnt_debug_inc(sk);
3313 * Attach a protocol block
3316 spin_lock_init(&po->bind_lock);
3317 mutex_init(&po->pg_vec_lock);
3318 po->rollover = NULL;
3319 po->prot_hook.func = packet_rcv;
3321 if (sock->type == SOCK_PACKET)
3322 po->prot_hook.func = packet_rcv_spkt;
3324 po->prot_hook.af_packet_priv = sk;
3325 po->prot_hook.af_packet_net = sock_net(sk);
3328 po->prot_hook.type = proto;
3329 __register_prot_hook(sk);
3332 mutex_lock(&net->packet.sklist_lock);
3333 sk_add_node_tail_rcu(sk, &net->packet.sklist);
3334 mutex_unlock(&net->packet.sklist_lock);
3337 sock_prot_inuse_add(net, &packet_proto, 1);
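/* Illustrative userspace sketch: packet_create() above is reached through the
 * ordinary socket() call, which requires CAP_NET_RAW, e.g.:
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 * Passing protocol 0 leaves the protocol hook unregistered, so nothing is
 * delivered until the socket is bound to a non-zero protocol.
 */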
3348 * Pull a packet from our receive queue and hand it to the user.
3349 * If necessary we block.
3352 static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3355 struct sock *sk = sock->sk;
3356 struct sk_buff *skb;
3358 int vnet_hdr_len = 0;
3359 unsigned int origlen = 0;
3362 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3366 /* What error should we return now? EUNATTACH? */
3367 if (pkt_sk(sk)->ifindex < 0)
3371 if (flags & MSG_ERRQUEUE) {
3372 err = sock_recv_errqueue(sk, msg, len,
3373 SOL_PACKET, PACKET_TX_TIMESTAMP);
3378 * Call the generic datagram receiver. This handles all sorts
3379 * of horrible races and re-entrancy so we can forget about it
3380 * in the protocol layers.
3382 * Now it will return ENETDOWN, if the device has just gone down,
3383 * but then it will block.
3386 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
3389 * An error occurred, so return it. Because skb_recv_datagram()
3390 * handles the blocking, we don't see or worry about blocking
3397 packet_rcv_try_clear_pressure(pkt_sk(sk));
3399 if (pkt_sk(sk)->has_vnet_hdr) {
3400 err = packet_rcv_vnet(msg, skb, &len);
3403 vnet_hdr_len = sizeof(struct virtio_net_hdr);
3406 /* You lose any data beyond the buffer you gave. If it worries
3407 * a user program, it can ask the device for its MTU
3413 msg->msg_flags |= MSG_TRUNC;
3416 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3420 if (sock->type != SOCK_PACKET) {
3421 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3423 /* Original length was stored in sockaddr_ll fields */
3424 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3425 sll->sll_family = AF_PACKET;
3426 sll->sll_protocol = skb->protocol;
3429 sock_recv_ts_and_drops(msg, sk, skb);
3431 if (msg->msg_name) {
3432 const size_t max_len = min(sizeof(skb->cb),
3433 sizeof(struct sockaddr_storage));
3436 /* If the address length field is there to be filled
3437 * in, we fill it in now.
3439 if (sock->type == SOCK_PACKET) {
3440 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
3441 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3442 copy_len = msg->msg_namelen;
3444 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3446 msg->msg_namelen = sll->sll_halen +
3447 offsetof(struct sockaddr_ll, sll_addr);
3448 copy_len = msg->msg_namelen;
3449 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3450 memset(msg->msg_name +
3451 offsetof(struct sockaddr_ll, sll_addr),
3452 0, sizeof(sll->sll_addr));
3453 msg->msg_namelen = sizeof(struct sockaddr_ll);
3456 if (WARN_ON_ONCE(copy_len > max_len)) {
3458 msg->msg_namelen = copy_len;
3460 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
3463 if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
3464 struct tpacket_auxdata aux;
3466 aux.tp_status = TP_STATUS_USER;
3467 if (skb->ip_summed == CHECKSUM_PARTIAL)
3468 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3469 else if (skb->pkt_type != PACKET_OUTGOING &&
3470 skb_csum_unnecessary(skb))
3471 aux.tp_status |= TP_STATUS_CSUM_VALID;
3473 aux.tp_len = origlen;
3474 aux.tp_snaplen = skb->len;
3476 aux.tp_net = skb_network_offset(skb);
3477 if (skb_vlan_tag_present(skb)) {
3478 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3479 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3480 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3482 aux.tp_vlan_tci = 0;
3483 aux.tp_vlan_tpid = 0;
3485 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3489 * Free or return the buffer as appropriate. Again this
3490 * hides all the races and re-entrancy issues from us.
3492 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3495 skb_free_datagram(sk, skb);
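/* Illustrative userspace sketch (assumes PACKET_AUXDATA was enabled with
 * setsockopt()): the per-packet metadata filled in above arrives as a control
 * message alongside the frame:
 *
 *	char frame[2048], cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { frame, sizeof(frame) };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm;
 *
 *	recvmsg(fd, &msg, 0);
 *	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
 *		if (cm->cmsg_level == SOL_PACKET &&
 *		    cm->cmsg_type == PACKET_AUXDATA)
 *			;	// CMSG_DATA(cm) holds struct tpacket_auxdata
 */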
3500 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3503 struct net_device *dev;
3504 struct sock *sk = sock->sk;
3509 uaddr->sa_family = AF_PACKET;
3510 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3512 dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
3514 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3517 return sizeof(*uaddr);
3520 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3523 struct net_device *dev;
3524 struct sock *sk = sock->sk;
3525 struct packet_sock *po = pkt_sk(sk);
3526 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3532 ifindex = READ_ONCE(po->ifindex);
3533 sll->sll_family = AF_PACKET;
3534 sll->sll_ifindex = ifindex;
3535 sll->sll_protocol = READ_ONCE(po->num);
3536 sll->sll_pkttype = 0;
3538 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3540 sll->sll_hatype = dev->type;
3541 sll->sll_halen = dev->addr_len;
3542 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3544 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3549 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3552 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3556 case PACKET_MR_MULTICAST:
3557 if (i->alen != dev->addr_len)
3560 return dev_mc_add(dev, i->addr);
3562 return dev_mc_del(dev, i->addr);
3564 case PACKET_MR_PROMISC:
3565 return dev_set_promiscuity(dev, what);
3566 case PACKET_MR_ALLMULTI:
3567 return dev_set_allmulti(dev, what);
3568 case PACKET_MR_UNICAST:
3569 if (i->alen != dev->addr_len)
3572 return dev_uc_add(dev, i->addr);
3574 return dev_uc_del(dev, i->addr);
3582 static void packet_dev_mclist_delete(struct net_device *dev,
3583 struct packet_mclist **mlp)
3585 struct packet_mclist *ml;
3587 while ((ml = *mlp) != NULL) {
3588 if (ml->ifindex == dev->ifindex) {
3589 packet_dev_mc(dev, ml, -1);
3597 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3599 struct packet_sock *po = pkt_sk(sk);
3600 struct packet_mclist *ml, *i;
3601 struct net_device *dev;
3607 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3612 if (mreq->mr_alen > dev->addr_len)
3616 i = kmalloc(sizeof(*i), GFP_KERNEL);
3621 for (ml = po->mclist; ml; ml = ml->next) {
3622 if (ml->ifindex == mreq->mr_ifindex &&
3623 ml->type == mreq->mr_type &&
3624 ml->alen == mreq->mr_alen &&
3625 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3627 /* Free the new element ... */
3633 i->type = mreq->mr_type;
3634 i->ifindex = mreq->mr_ifindex;
3635 i->alen = mreq->mr_alen;
3636 memcpy(i->addr, mreq->mr_address, i->alen);
3637 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3639 i->next = po->mclist;
3641 err = packet_dev_mc(dev, i, 1);
3643 po->mclist = i->next;
3652 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3654 struct packet_mclist *ml, **mlp;
3658 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3659 if (ml->ifindex == mreq->mr_ifindex &&
3660 ml->type == mreq->mr_type &&
3661 ml->alen == mreq->mr_alen &&
3662 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3663 if (--ml->count == 0) {
3664 struct net_device *dev;
3666 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3668 packet_dev_mc(dev, ml, -1);
3678 static void packet_flush_mclist(struct sock *sk)
3680 struct packet_sock *po = pkt_sk(sk);
3681 struct packet_mclist *ml;
3687 while ((ml = po->mclist) != NULL) {
3688 struct net_device *dev;
3690 po->mclist = ml->next;
3691 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3693 packet_dev_mc(dev, ml, -1);
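/* Illustrative userspace sketch: the membership helpers above back the
 * PACKET_ADD_MEMBERSHIP/PACKET_DROP_MEMBERSHIP options, e.g. putting the
 * interface into promiscuous mode for the lifetime of the socket:
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */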
3700 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3702 struct sock *sk = sock->sk;
3703 struct packet_sock *po = pkt_sk(sk);
3706 if (level != SOL_PACKET)
3707 return -ENOPROTOOPT;
3710 case PACKET_ADD_MEMBERSHIP:
3711 case PACKET_DROP_MEMBERSHIP:
3713 struct packet_mreq_max mreq;
3715 memset(&mreq, 0, sizeof(mreq));
3716 if (len < sizeof(struct packet_mreq))
3718 if (len > sizeof(mreq))
3720 if (copy_from_user(&mreq, optval, len))
3722 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3724 if (optname == PACKET_ADD_MEMBERSHIP)
3725 ret = packet_mc_add(sk, &mreq);
3727 ret = packet_mc_drop(sk, &mreq);
3731 case PACKET_RX_RING:
3732 case PACKET_TX_RING:
3734 union tpacket_req_u req_u;
3738 switch (po->tp_version) {
3741 len = sizeof(req_u.req);
3745 len = sizeof(req_u.req3);
3751 if (copy_from_user(&req_u.req, optval, len))
3754 ret = packet_set_ring(sk, &req_u, 0,
3755 optname == PACKET_TX_RING);
3760 case PACKET_COPY_THRESH:
3764 if (optlen != sizeof(val))
3766 if (copy_from_user(&val, optval, sizeof(val)))
3769 pkt_sk(sk)->copy_thresh = val;
3772 case PACKET_VERSION:
3776 if (optlen != sizeof(val))
3778 if (copy_from_user(&val, optval, sizeof(val)))
3789 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3792 po->tp_version = val;
3798 case PACKET_RESERVE:
3802 if (optlen != sizeof(val))
3804 if (copy_from_user(&val, optval, sizeof(val)))
3809 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3812 po->tp_reserve = val;
3822 if (optlen != sizeof(val))
3824 if (copy_from_user(&val, optval, sizeof(val)))
3828 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3831 po->tp_loss = !!val;
3837 case PACKET_AUXDATA:
3841 if (optlen < sizeof(val))
3843 if (copy_from_user(&val, optval, sizeof(val)))
3846 packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
3849 case PACKET_ORIGDEV:
3853 if (optlen < sizeof(val))
3855 if (copy_from_user(&val, optval, sizeof(val)))
3858 packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
3861 case PACKET_VNET_HDR:
3865 if (sock->type != SOCK_RAW)
3867 if (optlen < sizeof(val))
3869 if (copy_from_user(&val, optval, sizeof(val)))
3873 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3876 po->has_vnet_hdr = !!val;
3882 case PACKET_TIMESTAMP:
3886 if (optlen != sizeof(val))
3888 if (copy_from_user(&val, optval, sizeof(val)))
3891 po->tp_tstamp = val;
3898 if (optlen != sizeof(val))
3900 if (copy_from_user(&val, optval, sizeof(val)))
3903 return fanout_add(sk, val & 0xffff, val >> 16);
3905 case PACKET_FANOUT_DATA:
3907 /* Paired with the WRITE_ONCE() in fanout_add() */
3908 if (!READ_ONCE(po->fanout))
3911 return fanout_set_data(po, optval, optlen);
3913 case PACKET_IGNORE_OUTGOING:
3917 if (optlen != sizeof(val))
3919 if (copy_from_user(&val, optval, sizeof(val)))
3921 if (val < 0 || val > 1)
3924 po->prot_hook.ignore_outgoing = !!val;
3927 case PACKET_TX_HAS_OFF:
3931 if (optlen != sizeof(val))
3933 if (copy_from_user(&val, optval, sizeof(val)))
3937 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3940 po->tp_tx_has_off = !!val;
3946 case PACKET_QDISC_BYPASS:
3950 if (optlen != sizeof(val))
3952 if (copy_from_user(&val, optval, sizeof(val)))
3955 /* Paired with all lockless reads of po->xmit */
3956 WRITE_ONCE(po->xmit, val ? packet_direct_xmit : dev_queue_xmit);
3960 return -ENOPROTOOPT;
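/* Illustrative userspace sketch: several of the options above are ordering
 * sensitive - PACKET_VERSION, PACKET_RESERVE and PACKET_TX_HAS_OFF are
 * rejected once a ring exists, so they must be set before PACKET_RX_RING:
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 1 << 12,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 1 << 11,
 *		.tp_frame_nr   = 128,	// (4096 / 2048) * 64
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */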
3964 static int packet_getsockopt(struct socket *sock, int level, int optname,
3965 char __user *optval, int __user *optlen)
3968 int val, lv = sizeof(val);
3969 struct sock *sk = sock->sk;
3970 struct packet_sock *po = pkt_sk(sk);
3972 union tpacket_stats_u st;
3973 struct tpacket_rollover_stats rstats;
3976 if (level != SOL_PACKET)
3977 return -ENOPROTOOPT;
3979 if (get_user(len, optlen))
3986 case PACKET_STATISTICS:
3987 spin_lock_bh(&sk->sk_receive_queue.lock);
3988 memcpy(&st, &po->stats, sizeof(st));
3989 memset(&po->stats, 0, sizeof(po->stats));
3990 spin_unlock_bh(&sk->sk_receive_queue.lock);
3991 drops = atomic_xchg(&po->tp_drops, 0);
3993 if (po->tp_version == TPACKET_V3) {
3994 lv = sizeof(struct tpacket_stats_v3);
3995 st.stats3.tp_drops = drops;
3996 st.stats3.tp_packets += drops;
3999 lv = sizeof(struct tpacket_stats);
4000 st.stats1.tp_drops = drops;
4001 st.stats1.tp_packets += drops;
4006 case PACKET_AUXDATA:
4007 val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
4009 case PACKET_ORIGDEV:
4010 val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
4012 case PACKET_VNET_HDR:
4013 val = po->has_vnet_hdr;
4015 case PACKET_VERSION:
4016 val = po->tp_version;
4019 if (len > sizeof(int))
4021 if (len < sizeof(int))
4023 if (copy_from_user(&val, optval, len))
4027 val = sizeof(struct tpacket_hdr);
4030 val = sizeof(struct tpacket2_hdr);
4033 val = sizeof(struct tpacket3_hdr);
4039 case PACKET_RESERVE:
4040 val = po->tp_reserve;
4045 case PACKET_TIMESTAMP:
4046 val = po->tp_tstamp;
4050 ((u32)po->fanout->id |
4051 ((u32)po->fanout->type << 16) |
4052 ((u32)po->fanout->flags << 24)) :
4055 case PACKET_IGNORE_OUTGOING:
4056 val = po->prot_hook.ignore_outgoing;
4058 case PACKET_ROLLOVER_STATS:
4061 rstats.tp_all = atomic_long_read(&po->rollover->num);
4062 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4063 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4065 lv = sizeof(rstats);
4067 case PACKET_TX_HAS_OFF:
4068 val = po->tp_tx_has_off;
4070 case PACKET_QDISC_BYPASS:
4071 val = packet_use_direct_xmit(po);
4074 return -ENOPROTOOPT;
4079 if (put_user(len, optlen))
4081 if (copy_to_user(optval, data, len))
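/* Illustrative userspace sketch: PACKET_STATISTICS is read-and-clear - the
 * handler above copies the counters and zeroes them, so each call reports
 * packets and drops accumulated since the previous call:
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	// st.tp_packets includes st.tp_drops, per the accounting above
 */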
4087 #ifdef CONFIG_COMPAT
4088 static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
4089 char __user *optval, unsigned int optlen)
4091 struct packet_sock *po = pkt_sk(sock->sk);
4093 if (level != SOL_PACKET)
4094 return -ENOPROTOOPT;
4096 if (optname == PACKET_FANOUT_DATA &&
4097 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4098 optval = (char __user *)get_compat_bpf_fprog(optval);
4101 optlen = sizeof(struct sock_fprog);
4104 return packet_setsockopt(sock, level, optname, optval, optlen);
4108 static int packet_notifier(struct notifier_block *this,
4109 unsigned long msg, void *ptr)
4112 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4113 struct net *net = dev_net(dev);
4116 sk_for_each_rcu(sk, &net->packet.sklist) {
4117 struct packet_sock *po = pkt_sk(sk);
4120 case NETDEV_UNREGISTER:
4122 packet_dev_mclist_delete(dev, &po->mclist);
4126 if (dev->ifindex == po->ifindex) {
4127 spin_lock(&po->bind_lock);
4129 __unregister_prot_hook(sk, false);
4130 sk->sk_err = ENETDOWN;
4131 if (!sock_flag(sk, SOCK_DEAD))
4132 sk->sk_error_report(sk);
4134 if (msg == NETDEV_UNREGISTER) {
4135 packet_cached_dev_reset(po);
4136 WRITE_ONCE(po->ifindex, -1);
4137 if (po->prot_hook.dev)
4138 dev_put(po->prot_hook.dev);
4139 po->prot_hook.dev = NULL;
4141 spin_unlock(&po->bind_lock);
4145 if (dev->ifindex == po->ifindex) {
4146 spin_lock(&po->bind_lock);
4148 register_prot_hook(sk);
4149 spin_unlock(&po->bind_lock);
4159 static int packet_ioctl(struct socket *sock, unsigned int cmd,
4162 struct sock *sk = sock->sk;
4167 int amount = sk_wmem_alloc_get(sk);
4169 return put_user(amount, (int __user *)arg);
4173 struct sk_buff *skb;
4176 spin_lock_bh(&sk->sk_receive_queue.lock);
4177 skb = skb_peek(&sk->sk_receive_queue);
4180 spin_unlock_bh(&sk->sk_receive_queue.lock);
4181 return put_user(amount, (int __user *)arg);
4191 case SIOCGIFBRDADDR:
4192 case SIOCSIFBRDADDR:
4193 case SIOCGIFNETMASK:
4194 case SIOCSIFNETMASK:
4195 case SIOCGIFDSTADDR:
4196 case SIOCSIFDSTADDR:
4198 return inet_dgram_ops.ioctl(sock, cmd, arg);
4202 return -ENOIOCTLCMD;
4207 static __poll_t packet_poll(struct file *file, struct socket *sock,
4210 struct sock *sk = sock->sk;
4211 struct packet_sock *po = pkt_sk(sk);
4212 __poll_t mask = datagram_poll(file, sock, wait);
4214 spin_lock_bh(&sk->sk_receive_queue.lock);
4215 if (po->rx_ring.pg_vec) {
4216 if (!packet_previous_rx_frame(po, &po->rx_ring,
4218 mask |= EPOLLIN | EPOLLRDNORM;
4220 packet_rcv_try_clear_pressure(po);
4221 spin_unlock_bh(&sk->sk_receive_queue.lock);
4222 spin_lock_bh(&sk->sk_write_queue.lock);
4223 if (po->tx_ring.pg_vec) {
4224 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4225 mask |= EPOLLOUT | EPOLLWRNORM;
4227 spin_unlock_bh(&sk->sk_write_queue.lock);
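/* Illustrative userspace sketch (assumes a mapped PACKET_RX_RING, TPACKET_V2,
 * and tp_block_size a multiple of tp_frame_size so slots are contiguous):
 * packet_poll() above reports EPOLLIN while the current slot belongs to
 * userspace, so a typical receive loop is:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	struct tpacket2_hdr *hdr = ring + slot * req.tp_frame_size;
 *
 *	while (!(__atomic_load_n(&hdr->tp_status, __ATOMIC_ACQUIRE) &
 *		 TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	// frame data at (char *)hdr + hdr->tp_mac, length hdr->tp_snaplen
 *	__atomic_store_n(&hdr->tp_status, TP_STATUS_KERNEL, __ATOMIC_RELEASE);
 */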
4232 /* Dirty? Well, I still did not learn a better way to account
4236 static void packet_mm_open(struct vm_area_struct *vma)
4238 struct file *file = vma->vm_file;
4239 struct socket *sock = file->private_data;
4240 struct sock *sk = sock->sk;
4243 atomic_inc(&pkt_sk(sk)->mapped);
4246 static void packet_mm_close(struct vm_area_struct *vma)
4248 struct file *file = vma->vm_file;
4249 struct socket *sock = file->private_data;
4250 struct sock *sk = sock->sk;
4253 atomic_dec(&pkt_sk(sk)->mapped);
4256 static const struct vm_operations_struct packet_mmap_ops = {
4257 .open = packet_mm_open,
4258 .close = packet_mm_close,
4261 static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4266 for (i = 0; i < len; i++) {
4267 if (likely(pg_vec[i].buffer)) {
4268 if (is_vmalloc_addr(pg_vec[i].buffer))
4269 vfree(pg_vec[i].buffer);
4271 free_pages((unsigned long)pg_vec[i].buffer,
4273 pg_vec[i].buffer = NULL;
4279 static char *alloc_one_pg_vec_page(unsigned long order)
4282 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4283 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4285 buffer = (char *) __get_free_pages(gfp_flags, order);
4289 /* __get_free_pages failed, fall back to vmalloc */
4290 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4294 /* vmalloc failed, let's dig into swap here */
4295 gfp_flags &= ~__GFP_NORETRY;
4296 buffer = (char *) __get_free_pages(gfp_flags, order);
4300 /* complete and utter failure */
4304 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4306 unsigned int block_nr = req->tp_block_nr;
4310 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4311 if (unlikely(!pg_vec))
4314 for (i = 0; i < block_nr; i++) {
4315 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
4316 if (unlikely(!pg_vec[i].buffer))
4317 goto out_free_pgvec;
4324 free_pg_vec(pg_vec, order, block_nr);
4329 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4330 int closing, int tx_ring)
4332 struct pgv *pg_vec = NULL;
4333 struct packet_sock *po = pkt_sk(sk);
4334 unsigned long *rx_owner_map = NULL;
4335 int was_running, order = 0;
4336 struct packet_ring_buffer *rb;
4337 struct sk_buff_head *rb_queue;
4340 /* Alias to keep code churn minimal */
4341 struct tpacket_req *req = &req_u->req;
4343 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4344 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
4348 if (atomic_read(&po->mapped))
4350 if (packet_read_pending(rb))
4354 if (req->tp_block_nr) {
4355 unsigned int min_frame_size;
4357 /* Sanity tests and some calculations */
4359 if (unlikely(rb->pg_vec))
4362 switch (po->tp_version) {
4364 po->tp_hdrlen = TPACKET_HDRLEN;
4367 po->tp_hdrlen = TPACKET2_HDRLEN;
4370 po->tp_hdrlen = TPACKET3_HDRLEN;
4375 if (unlikely((int)req->tp_block_size <= 0))
4377 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4379 min_frame_size = po->tp_hdrlen + po->tp_reserve;
4380 if (po->tp_version >= TPACKET_V3 &&
4381 req->tp_block_size <
4382 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
4384 if (unlikely(req->tp_frame_size < min_frame_size))
4386 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
4389 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4390 if (unlikely(rb->frames_per_block == 0))
4392 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
4394 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4399 order = get_order(req->tp_block_size);
4400 pg_vec = alloc_pg_vec(req, order);
4401 if (unlikely(!pg_vec))
4403 switch (po->tp_version) {
4405 /* Block transmit is not supported yet */
4407 init_prb_bdqc(po, rb, pg_vec, req_u);
4409 struct tpacket_req3 *req3 = &req_u->req3;
4411 if (req3->tp_retire_blk_tov ||
4412 req3->tp_sizeof_priv ||
4413 req3->tp_feature_req_word) {
4415 goto out_free_pg_vec;
4421 rx_owner_map = bitmap_alloc(req->tp_frame_nr,
4422 GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
4424 goto out_free_pg_vec;
4432 if (unlikely(req->tp_frame_nr))
4437 /* Detach socket from network */
4438 spin_lock(&po->bind_lock);
4439 was_running = po->running;
4442 WRITE_ONCE(po->num, 0);
4443 __unregister_prot_hook(sk, false);
4445 spin_unlock(&po->bind_lock);
4450 mutex_lock(&po->pg_vec_lock);
4451 if (closing || atomic_read(&po->mapped) == 0) {
4453 spin_lock_bh(&rb_queue->lock);
4454 swap(rb->pg_vec, pg_vec);
4455 if (po->tp_version <= TPACKET_V2)
4456 swap(rb->rx_owner_map, rx_owner_map);
4457 rb->frame_max = (req->tp_frame_nr - 1);
4459 rb->frame_size = req->tp_frame_size;
4460 spin_unlock_bh(&rb_queue->lock);
4462 swap(rb->pg_vec_order, order);
4463 swap(rb->pg_vec_len, req->tp_block_nr);
4465 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4466 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4467 tpacket_rcv : packet_rcv;
4468 skb_queue_purge(rb_queue);
4469 if (atomic_read(&po->mapped))
4470 pr_err("packet_mmap: vma is busy: %d\n",
4471 atomic_read(&po->mapped));
4473 mutex_unlock(&po->pg_vec_lock);
4475 spin_lock(&po->bind_lock);
4477 WRITE_ONCE(po->num, num);
4478 register_prot_hook(sk);
4480 spin_unlock(&po->bind_lock);
4481 if (pg_vec && (po->tp_version > TPACKET_V2)) {
4482 /* Because we don't support block-based V3 on tx-ring */
4484 prb_shutdown_retire_blk_timer(po, rb_queue);
4489 bitmap_free(rx_owner_map);
4490 free_pg_vec(pg_vec, order, req->tp_block_nr);
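/* Illustrative sizing note for the checks above (hedged): with TPACKET_V3 the
 * block must also cover the block descriptor plus any per-block private area,
 * roughly
 *
 *	tp_block_size >= BLK_PLUS_PRIV(tp_sizeof_priv)
 *			 + tp_hdrlen + tp_reserve
 *
 * and, as for V1/V2, tp_block_size must be page-aligned, tp_frame_size a
 * multiple of TPACKET_ALIGNMENT, and tp_frame_nr must equal
 * (tp_block_size / tp_frame_size) * tp_block_nr.
 */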
4496 static int packet_mmap(struct file *file, struct socket *sock,
4497 struct vm_area_struct *vma)
4499 struct sock *sk = sock->sk;
4500 struct packet_sock *po = pkt_sk(sk);
4501 unsigned long size, expected_size;
4502 struct packet_ring_buffer *rb;
4503 unsigned long start;
4510 mutex_lock(&po->pg_vec_lock);
4513 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4515 expected_size += rb->pg_vec_len
4521 if (expected_size == 0)
4524 size = vma->vm_end - vma->vm_start;
4525 if (size != expected_size)
4528 start = vma->vm_start;
4529 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4530 if (rb->pg_vec == NULL)
4533 for (i = 0; i < rb->pg_vec_len; i++) {
4535 void *kaddr = rb->pg_vec[i].buffer;
4538 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4539 page = pgv_to_page(kaddr);
4540 err = vm_insert_page(vma, start, page);
4549 atomic_inc(&po->mapped);
4550 vma->vm_ops = &packet_mmap_ops;
4554 mutex_unlock(&po->pg_vec_lock);
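/* Illustrative userspace sketch: both rings are mapped with a single mmap()
 * whose length is the sum of the RX and TX ring sizes, matching the
 * expected_size computed above; the RX ring comes first in the mapping:
 *
 *	size_t rx_sz = rx_req.tp_block_size * rx_req.tp_block_nr;
 *	size_t tx_sz = tx_req.tp_block_size * tx_req.tp_block_nr;
 *	void *ring = mmap(NULL, rx_sz + tx_sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *	void *tx_ring = (char *)ring + rx_sz;	// TX slots follow RX slots
 */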
4558 static const struct proto_ops packet_ops_spkt = {
4559 .family = PF_PACKET,
4560 .owner = THIS_MODULE,
4561 .release = packet_release,
4562 .bind = packet_bind_spkt,
4563 .connect = sock_no_connect,
4564 .socketpair = sock_no_socketpair,
4565 .accept = sock_no_accept,
4566 .getname = packet_getname_spkt,
4567 .poll = datagram_poll,
4568 .ioctl = packet_ioctl,
4569 .gettstamp = sock_gettstamp,
4570 .listen = sock_no_listen,
4571 .shutdown = sock_no_shutdown,
4572 .setsockopt = sock_no_setsockopt,
4573 .getsockopt = sock_no_getsockopt,
4574 .sendmsg = packet_sendmsg_spkt,
4575 .recvmsg = packet_recvmsg,
4576 .mmap = sock_no_mmap,
4577 .sendpage = sock_no_sendpage,
4580 static const struct proto_ops packet_ops = {
4581 .family = PF_PACKET,
4582 .owner = THIS_MODULE,
4583 .release = packet_release,
4584 .bind = packet_bind,
4585 .connect = sock_no_connect,
4586 .socketpair = sock_no_socketpair,
4587 .accept = sock_no_accept,
4588 .getname = packet_getname,
4589 .poll = packet_poll,
4590 .ioctl = packet_ioctl,
4591 .gettstamp = sock_gettstamp,
4592 .listen = sock_no_listen,
4593 .shutdown = sock_no_shutdown,
4594 .setsockopt = packet_setsockopt,
4595 .getsockopt = packet_getsockopt,
4596 #ifdef CONFIG_COMPAT
4597 .compat_setsockopt = compat_packet_setsockopt,
4599 .sendmsg = packet_sendmsg,
4600 .recvmsg = packet_recvmsg,
4601 .mmap = packet_mmap,
4602 .sendpage = sock_no_sendpage,
4605 static const struct net_proto_family packet_family_ops = {
4606 .family = PF_PACKET,
4607 .create = packet_create,
4608 .owner = THIS_MODULE,
4611 static struct notifier_block packet_netdev_notifier = {
4612 .notifier_call = packet_notifier,
4615 #ifdef CONFIG_PROC_FS
4617 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
4620 struct net *net = seq_file_net(seq);
4623 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4626 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4628 struct net *net = seq_file_net(seq);
4629 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4632 static void packet_seq_stop(struct seq_file *seq, void *v)
4638 static int packet_seq_show(struct seq_file *seq, void *v)
4640 if (v == SEQ_START_TOKEN)
4641 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4643 struct sock *s = sk_entry(v);
4644 const struct packet_sock *po = pkt_sk(s);
4647 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
4649 refcount_read(&s->sk_refcnt),
4651 ntohs(READ_ONCE(po->num)),
4652 READ_ONCE(po->ifindex),
4654 atomic_read(&s->sk_rmem_alloc),
4655 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
4662 static const struct seq_operations packet_seq_ops = {
4663 .start = packet_seq_start,
4664 .next = packet_seq_next,
4665 .stop = packet_seq_stop,
4666 .show = packet_seq_show,
4670 static int __net_init packet_net_init(struct net *net)
4672 mutex_init(&net->packet.sklist_lock);
4673 INIT_HLIST_HEAD(&net->packet.sklist);
4675 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4676 sizeof(struct seq_net_private)))
4682 static void __net_exit packet_net_exit(struct net *net)
4684 remove_proc_entry("packet", net->proc_net);
4685 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
4688 static struct pernet_operations packet_net_ops = {
4689 .init = packet_net_init,
4690 .exit = packet_net_exit,
4694 static void __exit packet_exit(void)
4696 unregister_netdevice_notifier(&packet_netdev_notifier);
4697 unregister_pernet_subsys(&packet_net_ops);
4698 sock_unregister(PF_PACKET);
4699 proto_unregister(&packet_proto);
4702 static int __init packet_init(void)
4706 rc = proto_register(&packet_proto, 0);
4709 rc = sock_register(&packet_family_ops);
4712 rc = register_pernet_subsys(&packet_net_ops);
4715 rc = register_netdevice_notifier(&packet_netdev_notifier);
4722 unregister_pernet_subsys(&packet_net_ops);
4724 sock_unregister(PF_PACKET);
4726 proto_unregister(&packet_proto);
4731 module_init(packet_init);
4732 module_exit(packet_exit);
4733 MODULE_LICENSE("GPL");
4734 MODULE_ALIAS_NETPROTO(PF_PACKET);