GNU Linux-libre 4.4.285-gnu1: net/core/dev.c
1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/rwsem.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/stat.h>
102 #include <net/dst.h>
103 #include <net/dst_metadata.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/module.h>
110 #include <linux/netpoll.h>
111 #include <linux/rcupdate.h>
112 #include <linux/delay.h>
113 #include <net/iw_handler.h>
114 #include <asm/current.h>
115 #include <linux/audit.h>
116 #include <linux/dmaengine.h>
117 #include <linux/err.h>
118 #include <linux/ctype.h>
119 #include <linux/if_arp.h>
120 #include <linux/if_vlan.h>
121 #include <linux/ip.h>
122 #include <net/ip.h>
123 #include <net/mpls.h>
124 #include <linux/ipv6.h>
125 #include <linux/in.h>
126 #include <linux/jhash.h>
127 #include <linux/random.h>
128 #include <trace/events/napi.h>
129 #include <trace/events/net.h>
130 #include <trace/events/skb.h>
131 #include <linux/pci.h>
132 #include <linux/inetdevice.h>
133 #include <linux/cpu_rmap.h>
134 #include <linux/static_key.h>
135 #include <linux/hashtable.h>
136 #include <linux/vmalloc.h>
137 #include <linux/if_macvlan.h>
138 #include <linux/errqueue.h>
139 #include <linux/hrtimer.h>
140 #include <linux/netfilter_ingress.h>
141
142 #include "net-sysfs.h"
143
144 /* Instead of increasing this, you should create a hash table. */
145 #define MAX_GRO_SKBS 8
146
147 /* This should be increased if a protocol with a bigger head is added. */
148 #define GRO_MAX_HEAD (MAX_HEADER + 128)
149
150 static DEFINE_SPINLOCK(ptype_lock);
151 static DEFINE_SPINLOCK(offload_lock);
152 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
153 struct list_head ptype_all __read_mostly;       /* Taps */
154 static struct list_head offload_base __read_mostly;
155
156 static int netif_rx_internal(struct sk_buff *skb);
157 static int call_netdevice_notifiers_info(unsigned long val,
158                                          struct net_device *dev,
159                                          struct netdev_notifier_info *info);
160
161 /*
162  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
163  * semaphore.
164  *
165  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
166  *
167  * Writers must hold the rtnl semaphore while they loop through the
168  * dev_base_head list, and hold dev_base_lock for writing when they do the
169  * actual updates.  This allows pure readers to access the list even
170  * while a writer is preparing to update it.
171  *
172  * To put it another way, dev_base_lock is held for writing only to
173  * protect against pure readers; the rtnl semaphore provides the
174  * protection against other writers.
175  *
176  * See, for example usages, register_netdevice() and
177  * unregister_netdevice(), which must be called with the rtnl
178  * semaphore held.
179  */
180 DEFINE_RWLOCK(dev_base_lock);
181 EXPORT_SYMBOL(dev_base_lock);
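/*
 * Example (editor's sketch, not part of the original file): a pure reader
 * walking the device list under the rules described above.  The function
 * name show_devices() is hypothetical; for_each_netdev() and
 * for_each_netdev_rcu() are the iterators from <linux/netdevice.h>.
 */
#if 0	/* illustration only */
static void show_devices(struct net *net)
{
	struct net_device *dev;

	/* Pure reader variant 1: take dev_base_lock for reading. */
	read_lock(&dev_base_lock);
	for_each_netdev(net, dev)
		pr_info("%s: ifindex %d\n", dev->name, dev->ifindex);
	read_unlock(&dev_base_lock);

	/* Pure reader variant 2: rely on RCU instead of the lock. */
	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		pr_info("%s: ifindex %d\n", dev->name, dev->ifindex);
	rcu_read_unlock();
}
#endif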
182
183 /* protects napi_hash addition/deletion and napi_gen_id */
184 static DEFINE_SPINLOCK(napi_hash_lock);
185
186 static unsigned int napi_gen_id = NR_CPUS;
187 static DEFINE_HASHTABLE(napi_hash, 8);
188
189 static DECLARE_RWSEM(devnet_rename_sem);
190
191 static inline void dev_base_seq_inc(struct net *net)
192 {
193         while (++net->dev_base_seq == 0);
194 }
195
196 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
197 {
198         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
199
200         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
201 }
202
203 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
204 {
205         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
206 }
207
208 static inline void rps_lock(struct softnet_data *sd)
209 {
210 #ifdef CONFIG_RPS
211         spin_lock(&sd->input_pkt_queue.lock);
212 #endif
213 }
214
215 static inline void rps_unlock(struct softnet_data *sd)
216 {
217 #ifdef CONFIG_RPS
218         spin_unlock(&sd->input_pkt_queue.lock);
219 #endif
220 }
221
222 /* Device list insertion */
223 static void list_netdevice(struct net_device *dev)
224 {
225         struct net *net = dev_net(dev);
226
227         ASSERT_RTNL();
228
229         write_lock_bh(&dev_base_lock);
230         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
231         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
232         hlist_add_head_rcu(&dev->index_hlist,
233                            dev_index_hash(net, dev->ifindex));
234         write_unlock_bh(&dev_base_lock);
235
236         dev_base_seq_inc(net);
237 }
238
239 /* Device list removal
240  * caller must respect an RCU grace period before freeing/reusing dev
241  */
242 static void unlist_netdevice(struct net_device *dev)
243 {
244         ASSERT_RTNL();
245
246         /* Unlink dev from the device chain */
247         write_lock_bh(&dev_base_lock);
248         list_del_rcu(&dev->dev_list);
249         hlist_del_rcu(&dev->name_hlist);
250         hlist_del_rcu(&dev->index_hlist);
251         write_unlock_bh(&dev_base_lock);
252
253         dev_base_seq_inc(dev_net(dev));
254 }
255
256 /*
257  *      Our notifier list
258  */
259
260 static RAW_NOTIFIER_HEAD(netdev_chain);
261
262 /*
263  *      Device drivers call our routines to queue packets here. We empty the
264  *      queue in the local softnet handler.
265  */
266
267 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
268 EXPORT_PER_CPU_SYMBOL(softnet_data);
269
270 #ifdef CONFIG_LOCKDEP
271 /*
272  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
273  * according to dev->type
274  */
275 static const unsigned short netdev_lock_type[] =
276         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
277          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
278          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
279          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
280          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
281          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
282          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
283          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
284          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
285          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
286          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
287          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
288          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
289          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
290          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
291
292 static const char *const netdev_lock_name[] =
293         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
294          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
295          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
296          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
297          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
298          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
299          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
300          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
301          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
302          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
303          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
304          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
305          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
306          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
307          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
308
309 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
310 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
311
312 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
313 {
314         int i;
315
316         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
317                 if (netdev_lock_type[i] == dev_type)
318                         return i;
319         /* the last key is used by default */
320         return ARRAY_SIZE(netdev_lock_type) - 1;
321 }
322
323 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
324                                                  unsigned short dev_type)
325 {
326         int i;
327
328         i = netdev_lock_pos(dev_type);
329         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
330                                    netdev_lock_name[i]);
331 }
332
333 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
334 {
335         int i;
336
337         i = netdev_lock_pos(dev->type);
338         lockdep_set_class_and_name(&dev->addr_list_lock,
339                                    &netdev_addr_lock_key[i],
340                                    netdev_lock_name[i]);
341 }
342 #else
343 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
344                                                  unsigned short dev_type)
345 {
346 }
347 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
348 {
349 }
350 #endif
351
352 /*******************************************************************************
353
354                 Protocol management and registration routines
355
356 *******************************************************************************/
357
358 /*
359  *      Add a protocol ID to the list. Now that the input handler is
360  *      smarter we can dispense with all the messy stuff that used to be
361  *      here.
362  *
363  *      BEWARE!!! Protocol handlers, mangling input packets,
364  *      MUST BE last in hash buckets and checking protocol handlers
365  *      MUST start from promiscuous ptype_all chain in net_bh.
366  *      It is true now, do not change it.
367  *      Explanation: if a packet-mangling protocol handler were first
368  *      on the list, it could not tell that the packet is cloned and
369  *      should be copied-on-write, so it would modify the clone and
370  *      subsequent readers would see a broken packet.
371  *                                                      --ANK (980803)
372  */
373
374 static inline struct list_head *ptype_head(const struct packet_type *pt)
375 {
376         if (pt->type == htons(ETH_P_ALL))
377                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
378         else
379                 return pt->dev ? &pt->dev->ptype_specific :
380                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
381 }
382
383 /**
384  *      dev_add_pack - add packet handler
385  *      @pt: packet type declaration
386  *
387  *      Add a protocol handler to the networking stack. The passed &packet_type
388  *      is linked into kernel lists and may not be freed until it has been
389  *      removed from the kernel lists.
390  *
391  *      This call does not sleep, therefore it cannot guarantee that
392  *      all CPUs that are in the middle of receiving packets will see
393  *      the new packet type (until the next packet is received).
394  */
395
396 void dev_add_pack(struct packet_type *pt)
397 {
398         struct list_head *head = ptype_head(pt);
399
400         spin_lock(&ptype_lock);
401         list_add_rcu(&pt->list, head);
402         spin_unlock(&ptype_lock);
403 }
404 EXPORT_SYMBOL(dev_add_pack);
405
406 /**
407  *      __dev_remove_pack        - remove packet handler
408  *      @pt: packet type declaration
409  *
410  *      Remove a protocol handler that was previously added to the kernel
411  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
412  *      from the kernel lists and can be freed or reused once this function
413  *      returns.
414  *
415  *      The packet type might still be in use by receivers
416  *      and must not be freed until after all the CPUs have gone
417  *      through a quiescent state.
418  */
419 void __dev_remove_pack(struct packet_type *pt)
420 {
421         struct list_head *head = ptype_head(pt);
422         struct packet_type *pt1;
423
424         spin_lock(&ptype_lock);
425
426         list_for_each_entry(pt1, head, list) {
427                 if (pt == pt1) {
428                         list_del_rcu(&pt->list);
429                         goto out;
430                 }
431         }
432
433         pr_warn("dev_remove_pack: %p not found\n", pt);
434 out:
435         spin_unlock(&ptype_lock);
436 }
437 EXPORT_SYMBOL(__dev_remove_pack);
438
439 /**
440  *      dev_remove_pack  - remove packet handler
441  *      @pt: packet type declaration
442  *
443  *      Remove a protocol handler that was previously added to the kernel
444  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
445  *      from the kernel lists and can be freed or reused once this function
446  *      returns.
447  *
448  *      This call sleeps to guarantee that no CPU is looking at the packet
449  *      type after return.
450  */
451 void dev_remove_pack(struct packet_type *pt)
452 {
453         __dev_remove_pack(pt);
454
455         synchronize_net();
456 }
457 EXPORT_SYMBOL(dev_remove_pack);
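/*
 * Example (editor's sketch, not part of the original file): a minimal tap
 * registered with dev_add_pack() and torn down with dev_remove_pack().
 * The sample_* names are hypothetical; the field layout of struct
 * packet_type is the real one from <linux/netdevice.h>.
 */
#if 0	/* illustration only */
static int sample_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	/* Taps get their own reference on the skb; consume it here. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type sample_pt __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),		/* a tap: see all protocols */
	.func = sample_rcv,
};

static int __init sample_init(void)
{
	dev_add_pack(&sample_pt);
	return 0;
}

static void __exit sample_exit(void)
{
	/* dev_remove_pack() sleeps in synchronize_net(), so no CPU can
	 * still be running sample_rcv() once it returns. */
	dev_remove_pack(&sample_pt);
}
#endif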
458
459
460 /**
461  *      dev_add_offload - register offload handlers
462  *      @po: protocol offload declaration
463  *
464  *      Add protocol offload handlers to the networking stack. The passed
465  *      &proto_offload is linked into kernel lists and may not be freed until
466  *      it has been removed from the kernel lists.
467  *
468  *      This call does not sleep, therefore it cannot guarantee that
469  *      all CPUs that are in the middle of receiving packets will see
470  *      the new offload handlers (until the next packet is received).
471  */
472 void dev_add_offload(struct packet_offload *po)
473 {
474         struct packet_offload *elem;
475
476         spin_lock(&offload_lock);
477         list_for_each_entry(elem, &offload_base, list) {
478                 if (po->priority < elem->priority)
479                         break;
480         }
481         list_add_rcu(&po->list, elem->list.prev);
482         spin_unlock(&offload_lock);
483 }
484 EXPORT_SYMBOL(dev_add_offload);
485
486 /**
487  *      __dev_remove_offload     - remove offload handler
488  *      @po: packet offload declaration
489  *
490  *      Remove a protocol offload handler that was previously added to the
491  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
492  *      is removed from the kernel lists and can be freed or reused once this
493  *      function returns.
494  *
495  *      The packet type might still be in use by receivers
496  *      and must not be freed until after all the CPUs have gone
497  *      through a quiescent state.
498  */
499 static void __dev_remove_offload(struct packet_offload *po)
500 {
501         struct list_head *head = &offload_base;
502         struct packet_offload *po1;
503
504         spin_lock(&offload_lock);
505
506         list_for_each_entry(po1, head, list) {
507                 if (po == po1) {
508                         list_del_rcu(&po->list);
509                         goto out;
510                 }
511         }
512
513         pr_warn("dev_remove_offload: %p not found\n", po);
514 out:
515         spin_unlock(&offload_lock);
516 }
517
518 /**
519  *      dev_remove_offload       - remove packet offload handler
520  *      @po: packet offload declaration
521  *
522  *      Remove a packet offload handler that was previously added to the kernel
523  *      offload handlers by dev_add_offload(). The passed &offload_type is
524  *      removed from the kernel lists and can be freed or reused once this
525  *      function returns.
526  *
527  *      This call sleeps to guarantee that no CPU is looking at the packet
528  *      type after return.
529  */
530 void dev_remove_offload(struct packet_offload *po)
531 {
532         __dev_remove_offload(po);
533
534         synchronize_net();
535 }
536 EXPORT_SYMBOL(dev_remove_offload);
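/*
 * Example (editor's sketch, not part of the original file): registering a
 * protocol offload.  The sample_* names and the choice of ETH_P_IP are
 * hypothetical; the callback signatures follow struct offload_callbacks in
 * this kernel version.  dev_add_offload() keeps the list sorted by
 * ascending .priority, so lower values are consulted first.
 */
#if 0	/* illustration only */
static struct sk_buff **sample_gro_receive(struct sk_buff **head,
					   struct sk_buff *skb)
{
	/* A real handler would try to coalesce; this sketch declines GRO. */
	NAPI_GRO_CB(skb)->flush = 1;
	return NULL;
}

static int sample_gro_complete(struct sk_buff *skb, int nhoff)
{
	return 0;
}

static struct packet_offload sample_offload __read_mostly = {
	.type      = cpu_to_be16(ETH_P_IP),
	.priority  = 10,
	.callbacks = {
		.gro_receive  = sample_gro_receive,
		.gro_complete = sample_gro_complete,
	},
};

/* dev_add_offload(&sample_offload) at init time,
 * dev_remove_offload(&sample_offload) on teardown. */
#endif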
537
538 /******************************************************************************
539
540                       Device Boot-time Settings Routines
541
542 *******************************************************************************/
543
544 /* Boot time configuration table */
545 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
546
547 /**
548  *      netdev_boot_setup_add   - add new setup entry
549  *      @name: name of the device
550  *      @map: configured settings for the device
551  *
552  *      Adds a new setup entry to the dev_boot_setup list.  The function
553  *      returns 0 on error and 1 on success.  This is a generic routine
554  *      for all netdevices.
555  */
556 static int netdev_boot_setup_add(char *name, struct ifmap *map)
557 {
558         struct netdev_boot_setup *s;
559         int i;
560
561         s = dev_boot_setup;
562         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
563                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
564                         memset(s[i].name, 0, sizeof(s[i].name));
565                         strlcpy(s[i].name, name, IFNAMSIZ);
566                         memcpy(&s[i].map, map, sizeof(s[i].map));
567                         break;
568                 }
569         }
570
571         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
572 }
573
574 /**
575  *      netdev_boot_setup_check - check boot time settings
576  *      @dev: the netdevice
577  *
578  *      Check the boot time settings for the device.
579  *      Any settings found are applied to the device for use
580  *      later during device probing.
581  *      Returns 0 if no settings are found, 1 if they are.
582  */
583 int netdev_boot_setup_check(struct net_device *dev)
584 {
585         struct netdev_boot_setup *s = dev_boot_setup;
586         int i;
587
588         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
589                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
590                     !strcmp(dev->name, s[i].name)) {
591                         dev->irq        = s[i].map.irq;
592                         dev->base_addr  = s[i].map.base_addr;
593                         dev->mem_start  = s[i].map.mem_start;
594                         dev->mem_end    = s[i].map.mem_end;
595                         return 1;
596                 }
597         }
598         return 0;
599 }
600 EXPORT_SYMBOL(netdev_boot_setup_check);
601
602
603 /**
604  *      netdev_boot_base        - get address from boot time settings
605  *      @prefix: prefix for network device
606  *      @unit: id for network device
607  *
608  *      Check the boot time settings for the base address of the device.
609  *      Any settings found are applied to the device for use
610  *      later during device probing.
611  *      Returns 0 if no settings are found.
612  */
613 unsigned long netdev_boot_base(const char *prefix, int unit)
614 {
615         const struct netdev_boot_setup *s = dev_boot_setup;
616         char name[IFNAMSIZ];
617         int i;
618
619         sprintf(name, "%s%d", prefix, unit);
620
621         /*
622          * If the device is already registered then return a base of 1
623          * to indicate not to probe for this interface
624          */
625         if (__dev_get_by_name(&init_net, name))
626                 return 1;
627
628         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
629                 if (!strcmp(name, s[i].name))
630                         return s[i].map.base_addr;
631         return 0;
632 }
633
634 /*
635  * Saves the settings configured at boot time for any netdevice.
636  */
637 int __init netdev_boot_setup(char *str)
638 {
639         int ints[5];
640         struct ifmap map;
641
642         str = get_options(str, ARRAY_SIZE(ints), ints);
643         if (!str || !*str)
644                 return 0;
645
646         /* Save settings */
647         memset(&map, 0, sizeof(map));
648         if (ints[0] > 0)
649                 map.irq = ints[1];
650         if (ints[0] > 1)
651                 map.base_addr = ints[2];
652         if (ints[0] > 2)
653                 map.mem_start = ints[3];
654         if (ints[0] > 3)
655                 map.mem_end = ints[4];
656
657         /* Add new entry to the list */
658         return netdev_boot_setup_add(str, &map);
659 }
660
661 __setup("netdev=", netdev_boot_setup);
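/*
 * Example (editor's note, not part of the original file): given the parsing
 * above, a command line option such as
 *
 *	netdev=5,0x300,0,0,eth0
 *
 * records irq=5 and base_addr=0x300 (mem_start/mem_end stay 0) under the
 * name "eth0", to be picked up later by netdev_boot_setup_check().
 */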
662
663 /*******************************************************************************
664
665                             Device Interface Subroutines
666
667 *******************************************************************************/
668
669 /**
670  *      dev_get_iflink  - get 'iflink' value of an interface
671  *      @dev: targeted interface
672  *
673  *      Indicates the ifindex the interface is linked to.
674  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
675  */
676
677 int dev_get_iflink(const struct net_device *dev)
678 {
679         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
680                 return dev->netdev_ops->ndo_get_iflink(dev);
681
682         return dev->ifindex;
683 }
684 EXPORT_SYMBOL(dev_get_iflink);
685
686 /**
687  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
688  *      @dev: targeted interface
689  *      @skb: The packet.
690  *
691  *      For better visibility of tunnel traffic, OVS needs to retrieve
692  *      egress tunnel information for a packet. The following API allows
693  *      the user to get this info.
694  */
695 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
696 {
697         struct ip_tunnel_info *info;
698
699         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
700                 return -EINVAL;
701
702         info = skb_tunnel_info_unclone(skb);
703         if (!info)
704                 return -ENOMEM;
705         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
706                 return -EINVAL;
707
708         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
709 }
710 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
711
712 /**
713  *      __dev_get_by_name       - find a device by its name
714  *      @net: the applicable net namespace
715  *      @name: name to find
716  *
717  *      Find an interface by name. Must be called under the RTNL semaphore
718  *      or @dev_base_lock. If the name is found, a pointer to the device
719  *      is returned. If the name is not found then %NULL is returned. The
720  *      reference counters are not incremented so the caller must be
721  *      careful with locks.
722  */
723
724 struct net_device *__dev_get_by_name(struct net *net, const char *name)
725 {
726         struct net_device *dev;
727         struct hlist_head *head = dev_name_hash(net, name);
728
729         hlist_for_each_entry(dev, head, name_hlist)
730                 if (!strncmp(dev->name, name, IFNAMSIZ))
731                         return dev;
732
733         return NULL;
734 }
735 EXPORT_SYMBOL(__dev_get_by_name);
736
737 /**
738  *      dev_get_by_name_rcu     - find a device by its name
739  *      @net: the applicable net namespace
740  *      @name: name to find
741  *
742  *      Find an interface by name.
743  *      If the name is found, a pointer to the device is returned.
744  *      If the name is not found then %NULL is returned.
745  *      The reference counters are not incremented so the caller must be
746  *      careful with locks. The caller must hold the RCU read lock.
747  */
748
749 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
750 {
751         struct net_device *dev;
752         struct hlist_head *head = dev_name_hash(net, name);
753
754         hlist_for_each_entry_rcu(dev, head, name_hlist)
755                 if (!strncmp(dev->name, name, IFNAMSIZ))
756                         return dev;
757
758         return NULL;
759 }
760 EXPORT_SYMBOL(dev_get_by_name_rcu);
761
762 /**
763  *      dev_get_by_name         - find a device by its name
764  *      @net: the applicable net namespace
765  *      @name: name to find
766  *
767  *      Find an interface by name. This can be called from any
768  *      context and does its own locking. The returned handle has
769  *      the usage count incremented and the caller must use dev_put() to
770  *      release it when it is no longer needed. %NULL is returned if no
771  *      matching device is found.
772  */
773
774 struct net_device *dev_get_by_name(struct net *net, const char *name)
775 {
776         struct net_device *dev;
777
778         rcu_read_lock();
779         dev = dev_get_by_name_rcu(net, name);
780         if (dev)
781                 dev_hold(dev);
782         rcu_read_unlock();
783         return dev;
784 }
785 EXPORT_SYMBOL(dev_get_by_name);
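/*
 * Example (editor's sketch, not part of the original file): the two lookup
 * styles side by side.  sample_lookup() and the "eth0" name are
 * hypothetical.
 */
#if 0	/* illustration only */
static void sample_lookup(struct net *net)
{
	struct net_device *dev;

	/* RCU variant: no reference taken, the pointer is only valid
	 * inside the read-side critical section. */
	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, "eth0");
	if (dev)
		pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
	rcu_read_unlock();

	/* Refcounted variant: usable from any context, but must be
	 * balanced with dev_put(). */
	dev = dev_get_by_name(net, "eth0");
	if (dev) {
		pr_info("%s is %s\n", dev->name,
			netif_running(dev) ? "up" : "down");
		dev_put(dev);
	}
}
#endif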
786
787 /**
788  *      __dev_get_by_index - find a device by its ifindex
789  *      @net: the applicable net namespace
790  *      @ifindex: index of device
791  *
792  *      Search for an interface by index. Returns a pointer to the device,
793  *      or %NULL if the device is not found. The device has not
794  *      had its reference counter increased so the caller must be careful
795  *      about locking. The caller must hold either the RTNL semaphore
796  *      or @dev_base_lock.
797  */
798
799 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
800 {
801         struct net_device *dev;
802         struct hlist_head *head = dev_index_hash(net, ifindex);
803
804         hlist_for_each_entry(dev, head, index_hlist)
805                 if (dev->ifindex == ifindex)
806                         return dev;
807
808         return NULL;
809 }
810 EXPORT_SYMBOL(__dev_get_by_index);
811
812 /**
813  *      dev_get_by_index_rcu - find a device by its ifindex
814  *      @net: the applicable net namespace
815  *      @ifindex: index of device
816  *
817  *      Search for an interface by index. Returns a pointer to the device,
818  *      or %NULL if the device is not found. The device has not
819  *      had its reference counter increased so the caller must be careful
820  *      about locking. The caller must hold the RCU read lock.
821  */
822
823 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
824 {
825         struct net_device *dev;
826         struct hlist_head *head = dev_index_hash(net, ifindex);
827
828         hlist_for_each_entry_rcu(dev, head, index_hlist)
829                 if (dev->ifindex == ifindex)
830                         return dev;
831
832         return NULL;
833 }
834 EXPORT_SYMBOL(dev_get_by_index_rcu);
835
836
837 /**
838  *      dev_get_by_index - find a device by its ifindex
839  *      @net: the applicable net namespace
840  *      @ifindex: index of device
841  *
842  *      Search for an interface by index. Returns a pointer to the device,
843  *      or NULL if the device is not found. The device returned has
844  *      had a reference added and the pointer is safe until the user calls
845  *      dev_put to indicate they have finished with it.
846  */
847
848 struct net_device *dev_get_by_index(struct net *net, int ifindex)
849 {
850         struct net_device *dev;
851
852         rcu_read_lock();
853         dev = dev_get_by_index_rcu(net, ifindex);
854         if (dev)
855                 dev_hold(dev);
856         rcu_read_unlock();
857         return dev;
858 }
859 EXPORT_SYMBOL(dev_get_by_index);
860
861 /**
862  *      netdev_get_name - get a netdevice name, knowing its ifindex.
863  *      @net: network namespace
864  *      @name: a pointer to the buffer where the name will be stored.
865  *      @ifindex: the ifindex of the interface to get the name from.
866  */
867 int netdev_get_name(struct net *net, char *name, int ifindex)
868 {
869         struct net_device *dev;
870         int ret;
871
872         down_read(&devnet_rename_sem);
873         rcu_read_lock();
874
875         dev = dev_get_by_index_rcu(net, ifindex);
876         if (!dev) {
877                 ret = -ENODEV;
878                 goto out;
879         }
880
881         strcpy(name, dev->name);
882
883         ret = 0;
884 out:
885         rcu_read_unlock();
886         up_read(&devnet_rename_sem);
887         return ret;
888 }
889
890 /**
891  *      dev_getbyhwaddr_rcu - find a device by its hardware address
892  *      @net: the applicable net namespace
893  *      @type: media type of device
894  *      @ha: hardware address
895  *
896  *      Search for an interface by MAC address. Returns a pointer to the
897  *      device, or NULL if the device is not found.
898  *      The caller must hold RCU or RTNL.
899  *      The returned device has not had its ref count increased
900  *      and the caller must therefore be careful about locking.
901  *
902  */
903
904 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
905                                        const char *ha)
906 {
907         struct net_device *dev;
908
909         for_each_netdev_rcu(net, dev)
910                 if (dev->type == type &&
911                     !memcmp(dev->dev_addr, ha, dev->addr_len))
912                         return dev;
913
914         return NULL;
915 }
916 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
917
918 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
919 {
920         struct net_device *dev;
921
922         ASSERT_RTNL();
923         for_each_netdev(net, dev)
924                 if (dev->type == type)
925                         return dev;
926
927         return NULL;
928 }
929 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
930
931 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
932 {
933         struct net_device *dev, *ret = NULL;
934
935         rcu_read_lock();
936         for_each_netdev_rcu(net, dev)
937                 if (dev->type == type) {
938                         dev_hold(dev);
939                         ret = dev;
940                         break;
941                 }
942         rcu_read_unlock();
943         return ret;
944 }
945 EXPORT_SYMBOL(dev_getfirstbyhwtype);
946
947 /**
948  *      __dev_get_by_flags - find any device with given flags
949  *      @net: the applicable net namespace
950  *      @if_flags: IFF_* values
951  *      @mask: bitmask of bits in if_flags to check
952  *
953  *      Search for any interface with the given flags. Returns a pointer to
954  *      the first matching device, or NULL if none is found. Must be called
955  *      inside rtnl_lock(), and the result's refcount is unchanged.
956  */
957
958 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
959                                       unsigned short mask)
960 {
961         struct net_device *dev, *ret;
962
963         ASSERT_RTNL();
964
965         ret = NULL;
966         for_each_netdev(net, dev) {
967                 if (((dev->flags ^ if_flags) & mask) == 0) {
968                         ret = dev;
969                         break;
970                 }
971         }
972         return ret;
973 }
974 EXPORT_SYMBOL(__dev_get_by_flags);
975
976 /**
977  *      dev_valid_name - check if name is okay for network device
978  *      @name: name string
979  *
980  *      Network device names need to be valid file names
981  *      to allow sysfs to work.  We also disallow any kind of
982  *      whitespace.
983  */
984 bool dev_valid_name(const char *name)
985 {
986         if (*name == '\0')
987                 return false;
988         if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
989                 return false;
990         if (!strcmp(name, ".") || !strcmp(name, ".."))
991                 return false;
992
993         while (*name) {
994                 if (*name == '/' || *name == ':' || isspace(*name))
995                         return false;
996                 name++;
997         }
998         return true;
999 }
1000 EXPORT_SYMBOL(dev_valid_name);
1001
1002 /**
1003  *      __dev_alloc_name - allocate a name for a device
1004  *      @net: network namespace to allocate the device name in
1005  *      @name: name format string
1006  *      @buf:  scratch buffer and result name string
1007  *
1008  *      Passed a format string, e.g. "lt%d", it will try to find a suitable
1009  *      id. It scans the list of devices to build up a free map, then chooses
1010  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1011  *      while allocating the name and adding the device in order to avoid
1012  *      duplicates.
1013  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1014  *      Returns the number of the unit assigned or a negative errno code.
1015  */
1016
1017 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1018 {
1019         int i = 0;
1020         const char *p;
1021         const int max_netdevices = 8*PAGE_SIZE;
1022         unsigned long *inuse;
1023         struct net_device *d;
1024
1025         p = strnchr(name, IFNAMSIZ-1, '%');
1026         if (p) {
1027                 /*
1028                  * Verify the string as this thing may have come from
1029                  * the user.  There must be exactly one "%d" and no other "%"
1030                  * characters.
1031                  */
1032                 if (p[1] != 'd' || strchr(p + 2, '%'))
1033                         return -EINVAL;
1034
1035                 /* Use one page as a bit array of possible slots */
1036                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1037                 if (!inuse)
1038                         return -ENOMEM;
1039
1040                 for_each_netdev(net, d) {
1041                         if (!sscanf(d->name, name, &i))
1042                                 continue;
1043                         if (i < 0 || i >= max_netdevices)
1044                                 continue;
1045
1046                         /*  avoid cases where sscanf is not exact inverse of printf */
1047                         snprintf(buf, IFNAMSIZ, name, i);
1048                         if (!strncmp(buf, d->name, IFNAMSIZ))
1049                                 set_bit(i, inuse);
1050                 }
1051
1052                 i = find_first_zero_bit(inuse, max_netdevices);
1053                 free_page((unsigned long) inuse);
1054         }
1055
1056         if (buf != name)
1057                 snprintf(buf, IFNAMSIZ, name, i);
1058         if (!__dev_get_by_name(net, buf))
1059                 return i;
1060
1061         /* It is possible to run out of possible slots
1062          * when the name is long and there isn't enough space left
1063          * for the digits, or if all bits are used.
1064          */
1065         return -ENFILE;
1066 }
1067
1068 /**
1069  *      dev_alloc_name - allocate a name for a device
1070  *      @dev: device
1071  *      @name: name format string
1072  *
1073  *      Passed a format string, e.g. "lt%d", it will try to find a suitable
1074  *      id. It scans the list of devices to build up a free map, then chooses
1075  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1076  *      while allocating the name and adding the device in order to avoid
1077  *      duplicates.
1078  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1079  *      Returns the number of the unit assigned or a negative errno code.
1080  */
1081
1082 int dev_alloc_name(struct net_device *dev, const char *name)
1083 {
1084         char buf[IFNAMSIZ];
1085         struct net *net;
1086         int ret;
1087
1088         BUG_ON(!dev_net(dev));
1089         net = dev_net(dev);
1090         ret = __dev_alloc_name(net, name, buf);
1091         if (ret >= 0)
1092                 strlcpy(dev->name, buf, IFNAMSIZ);
1093         return ret;
1094 }
1095 EXPORT_SYMBOL(dev_alloc_name);
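/*
 * Example (editor's sketch, not part of the original file): asking for the
 * next free "foo%d" name.  sample_pick_name() and the "foo%d" template are
 * hypothetical; the device must already have its netns set and the caller
 * must hold the RTNL lock, as the comment above requires.
 */
#if 0	/* illustration only */
static int sample_pick_name(struct net_device *dev)
{
	int unit;

	rtnl_lock();
	unit = dev_alloc_name(dev, "foo%d");	/* lowest free "fooN" slot */
	rtnl_unlock();
	if (unit < 0)
		return unit;		/* -EINVAL, -ENOMEM or -ENFILE */
	netdev_info(dev, "assigned unit %d\n", unit);
	return 0;
}
#endif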
1096
1097 static int dev_alloc_name_ns(struct net *net,
1098                              struct net_device *dev,
1099                              const char *name)
1100 {
1101         char buf[IFNAMSIZ];
1102         int ret;
1103
1104         ret = __dev_alloc_name(net, name, buf);
1105         if (ret >= 0)
1106                 strlcpy(dev->name, buf, IFNAMSIZ);
1107         return ret;
1108 }
1109
1110 int dev_get_valid_name(struct net *net, struct net_device *dev,
1111                        const char *name)
1112 {
1113         BUG_ON(!net);
1114
1115         if (!dev_valid_name(name))
1116                 return -EINVAL;
1117
1118         if (strchr(name, '%'))
1119                 return dev_alloc_name_ns(net, dev, name);
1120         else if (__dev_get_by_name(net, name))
1121                 return -EEXIST;
1122         else if (dev->name != name)
1123                 strlcpy(dev->name, name, IFNAMSIZ);
1124
1125         return 0;
1126 }
1127 EXPORT_SYMBOL(dev_get_valid_name);
1128
1129 /**
1130  *      dev_change_name - change name of a device
1131  *      @dev: device
1132  *      @newname: name (or format string) must be at least IFNAMSIZ
1133  *
1134  *      Change the name of a device. A format string such as "eth%d"
1135  *      can be passed for wildcarding.
1136  */
1137 int dev_change_name(struct net_device *dev, const char *newname)
1138 {
1139         unsigned char old_assign_type;
1140         char oldname[IFNAMSIZ];
1141         int err = 0;
1142         int ret;
1143         struct net *net;
1144
1145         ASSERT_RTNL();
1146         BUG_ON(!dev_net(dev));
1147
1148         net = dev_net(dev);
1149         if (dev->flags & IFF_UP)
1150                 return -EBUSY;
1151
1152         down_write(&devnet_rename_sem);
1153
1154         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1155                 up_write(&devnet_rename_sem);
1156                 return 0;
1157         }
1158
1159         memcpy(oldname, dev->name, IFNAMSIZ);
1160
1161         err = dev_get_valid_name(net, dev, newname);
1162         if (err < 0) {
1163                 up_write(&devnet_rename_sem);
1164                 return err;
1165         }
1166
1167         if (oldname[0] && !strchr(oldname, '%'))
1168                 netdev_info(dev, "renamed from %s\n", oldname);
1169
1170         old_assign_type = dev->name_assign_type;
1171         dev->name_assign_type = NET_NAME_RENAMED;
1172
1173 rollback:
1174         ret = device_rename(&dev->dev, dev->name);
1175         if (ret) {
1176                 memcpy(dev->name, oldname, IFNAMSIZ);
1177                 dev->name_assign_type = old_assign_type;
1178                 up_write(&devnet_rename_sem);
1179                 return ret;
1180         }
1181
1182         up_write(&devnet_rename_sem);
1183
1184         netdev_adjacent_rename_links(dev, oldname);
1185
1186         write_lock_bh(&dev_base_lock);
1187         hlist_del_rcu(&dev->name_hlist);
1188         write_unlock_bh(&dev_base_lock);
1189
1190         synchronize_rcu();
1191
1192         write_lock_bh(&dev_base_lock);
1193         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1194         write_unlock_bh(&dev_base_lock);
1195
1196         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1197         ret = notifier_to_errno(ret);
1198
1199         if (ret) {
1200                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1201                 if (err >= 0) {
1202                         err = ret;
1203                         down_write(&devnet_rename_sem);
1204                         memcpy(dev->name, oldname, IFNAMSIZ);
1205                         memcpy(oldname, newname, IFNAMSIZ);
1206                         dev->name_assign_type = old_assign_type;
1207                         old_assign_type = NET_NAME_RENAMED;
1208                         goto rollback;
1209                 } else {
1210                         pr_err("%s: name change rollback failed: %d\n",
1211                                dev->name, ret);
1212                 }
1213         }
1214
1215         return err;
1216 }
1217
1218 /**
1219  *      dev_set_alias - change ifalias of a device
1220  *      @dev: device
1221  *      @alias: name up to IFALIASZ
1222  *      @len: limit of bytes to copy from info
1223  *
1224  *      Set the ifalias for a device.
1225  */
1226 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1227 {
1228         char *new_ifalias;
1229
1230         ASSERT_RTNL();
1231
1232         if (len >= IFALIASZ)
1233                 return -EINVAL;
1234
1235         if (!len) {
1236                 kfree(dev->ifalias);
1237                 dev->ifalias = NULL;
1238                 return 0;
1239         }
1240
1241         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1242         if (!new_ifalias)
1243                 return -ENOMEM;
1244         dev->ifalias = new_ifalias;
1245         memcpy(dev->ifalias, alias, len);
1246         dev->ifalias[len] = 0;
1247
1248         return len;
1249 }
1250
1251
1252 /**
1253  *      netdev_features_change - device changes features
1254  *      @dev: device to cause notification
1255  *
1256  *      Called to indicate a device has changed features.
1257  */
1258 void netdev_features_change(struct net_device *dev)
1259 {
1260         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1261 }
1262 EXPORT_SYMBOL(netdev_features_change);
1263
1264 /**
1265  *      netdev_state_change - device changes state
1266  *      @dev: device to cause notification
1267  *
1268  *      Called to indicate a device has changed state. This function calls
1269  *      the notifier chains for netdev_chain and sends a NEWLINK message
1270  *      to the routing socket.
1271  */
1272 void netdev_state_change(struct net_device *dev)
1273 {
1274         if (dev->flags & IFF_UP) {
1275                 struct netdev_notifier_change_info change_info;
1276
1277                 change_info.flags_changed = 0;
1278                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1279                                               &change_info.info);
1280                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1281         }
1282 }
1283 EXPORT_SYMBOL(netdev_state_change);
1284
1285 /**
1286  *      netdev_notify_peers - notify network peers about existence of @dev
1287  *      @dev: network device
1288  *
1289  * Generate traffic such that interested network peers are aware of
1290  * @dev, such as by generating a gratuitous ARP. This may be used when
1291  * a device wants to inform the rest of the network about some sort of
1292  * reconfiguration such as a failover event or virtual machine
1293  * migration.
1294  */
1295 void netdev_notify_peers(struct net_device *dev)
1296 {
1297         rtnl_lock();
1298         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1299         call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1300         rtnl_unlock();
1301 }
1302 EXPORT_SYMBOL(netdev_notify_peers);
1303
1304 static int __dev_open(struct net_device *dev)
1305 {
1306         const struct net_device_ops *ops = dev->netdev_ops;
1307         int ret;
1308
1309         ASSERT_RTNL();
1310
1311         if (!netif_device_present(dev))
1312                 return -ENODEV;
1313
1314         /* Block netpoll from trying to do any rx path servicing.
1315          * If we don't do this there is a chance ndo_poll_controller
1316          * or ndo_poll may be running while we open the device
1317          */
1318         netpoll_poll_disable(dev);
1319
1320         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1321         ret = notifier_to_errno(ret);
1322         if (ret)
1323                 return ret;
1324
1325         set_bit(__LINK_STATE_START, &dev->state);
1326
1327         if (ops->ndo_validate_addr)
1328                 ret = ops->ndo_validate_addr(dev);
1329
1330         if (!ret && ops->ndo_open)
1331                 ret = ops->ndo_open(dev);
1332
1333         netpoll_poll_enable(dev);
1334
1335         if (ret)
1336                 clear_bit(__LINK_STATE_START, &dev->state);
1337         else {
1338                 dev->flags |= IFF_UP;
1339                 dev_set_rx_mode(dev);
1340                 dev_activate(dev);
1341                 add_device_randomness(dev->dev_addr, dev->addr_len);
1342         }
1343
1344         return ret;
1345 }
1346
1347 /**
1348  *      dev_open        - prepare an interface for use.
1349  *      @dev:   device to open
1350  *
1351  *      Takes a device from down to up state. The device's private open
1352  *      function is invoked and then the multicast lists are loaded. Finally
1353  *      the device is moved into the up state and a %NETDEV_UP message is
1354  *      sent to the netdev notifier chain.
1355  *
1356  *      Calling this function on an active interface is a nop. On a failure
1357  *      a negative errno code is returned.
1358  */
1359 int dev_open(struct net_device *dev)
1360 {
1361         int ret;
1362
1363         if (dev->flags & IFF_UP)
1364                 return 0;
1365
1366         ret = __dev_open(dev);
1367         if (ret < 0)
1368                 return ret;
1369
1370         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1371         call_netdevice_notifiers(NETDEV_UP, dev);
1372
1373         return ret;
1374 }
1375 EXPORT_SYMBOL(dev_open);
1376
1377 static int __dev_close_many(struct list_head *head)
1378 {
1379         struct net_device *dev;
1380
1381         ASSERT_RTNL();
1382         might_sleep();
1383
1384         list_for_each_entry(dev, head, close_list) {
1385                 /* Temporarily disable netpoll until the interface is down */
1386                 netpoll_poll_disable(dev);
1387
1388                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1389
1390                 clear_bit(__LINK_STATE_START, &dev->state);
1391
1392                 /* Synchronize to the scheduled poll. We cannot touch the poll list;
1393                  * it can even be on a different CPU. So just clear netif_running().
1394                  *
1395                  * dev->stop() will invoke napi_disable() on all of its
1396                  * napi_struct instances on this device.
1397                  */
1398                 smp_mb__after_atomic(); /* Commit netif_running(). */
1399         }
1400
1401         dev_deactivate_many(head);
1402
1403         list_for_each_entry(dev, head, close_list) {
1404                 const struct net_device_ops *ops = dev->netdev_ops;
1405
1406                 /*
1407                  *      Call the device-specific close. This cannot fail and is
1408                  *      done only if the device is UP.
1409                  *
1410                  *      We allow it to be called even after a DETACH hot-plug
1411                  *      event.
1412                  */
1413                 if (ops->ndo_stop)
1414                         ops->ndo_stop(dev);
1415
1416                 dev->flags &= ~IFF_UP;
1417                 netpoll_poll_enable(dev);
1418         }
1419
1420         return 0;
1421 }
1422
1423 static int __dev_close(struct net_device *dev)
1424 {
1425         int retval;
1426         LIST_HEAD(single);
1427
1428         list_add(&dev->close_list, &single);
1429         retval = __dev_close_many(&single);
1430         list_del(&single);
1431
1432         return retval;
1433 }
1434
1435 int dev_close_many(struct list_head *head, bool unlink)
1436 {
1437         struct net_device *dev, *tmp;
1438
1439         /* Remove the devices that don't need to be closed */
1440         list_for_each_entry_safe(dev, tmp, head, close_list)
1441                 if (!(dev->flags & IFF_UP))
1442                         list_del_init(&dev->close_list);
1443
1444         __dev_close_many(head);
1445
1446         list_for_each_entry_safe(dev, tmp, head, close_list) {
1447                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1448                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1449                 if (unlink)
1450                         list_del_init(&dev->close_list);
1451         }
1452
1453         return 0;
1454 }
1455 EXPORT_SYMBOL(dev_close_many);
1456
1457 /**
1458  *      dev_close - shutdown an interface.
1459  *      @dev: device to shutdown
1460  *
1461  *      This function moves an active device into down state. A
1462  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1463  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1464  *      chain.
1465  */
1466 int dev_close(struct net_device *dev)
1467 {
1468         if (dev->flags & IFF_UP) {
1469                 LIST_HEAD(single);
1470
1471                 list_add(&dev->close_list, &single);
1472                 dev_close_many(&single, true);
1473                 list_del(&single);
1474         }
1475         return 0;
1476 }
1477 EXPORT_SYMBOL(dev_close);
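/*
 * Example (editor's sketch, not part of the original file): bouncing an
 * interface from kernel code.  sample_bounce() is a hypothetical name;
 * both dev_close() and dev_open() expect the caller to hold RTNL.
 */
#if 0	/* illustration only */
static int sample_bounce(struct net_device *dev)
{
	int err;

	rtnl_lock();
	dev_close(dev);			/* harmless if already down */
	err = dev_open(dev);		/* 0 on success or if already up */
	rtnl_unlock();
	return err;
}
#endif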
1478
1479
1480 /**
1481  *      dev_disable_lro - disable Large Receive Offload on a device
1482  *      @dev: device
1483  *
1484  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1485  *      called under RTNL.  This is needed if received packets may be
1486  *      forwarded to another interface.
1487  */
1488 void dev_disable_lro(struct net_device *dev)
1489 {
1490         struct net_device *lower_dev;
1491         struct list_head *iter;
1492
1493         dev->wanted_features &= ~NETIF_F_LRO;
1494         netdev_update_features(dev);
1495
1496         if (unlikely(dev->features & NETIF_F_LRO))
1497                 netdev_WARN(dev, "failed to disable LRO!\n");
1498
1499         netdev_for_each_lower_dev(dev, lower_dev, iter)
1500                 dev_disable_lro(lower_dev);
1501 }
1502 EXPORT_SYMBOL(dev_disable_lro);
1503
1504 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1505                                    struct net_device *dev)
1506 {
1507         struct netdev_notifier_info info;
1508
1509         netdev_notifier_info_init(&info, dev);
1510         return nb->notifier_call(nb, val, &info);
1511 }
1512
1513 static int dev_boot_phase = 1;
1514
1515 /**
1516  *      register_netdevice_notifier - register a network notifier block
1517  *      @nb: notifier
1518  *
1519  *      Register a notifier to be called when network device events occur.
1520  *      The notifier passed is linked into the kernel structures and must
1521  *      not be reused until it has been unregistered. A negative errno code
1522  *      is returned on a failure.
1523  *
1524  *      When registered, all registration and up events are replayed
1525  *      to the new notifier to give it a race-free
1526  *      view of the network device list.
1527  */
1528
1529 int register_netdevice_notifier(struct notifier_block *nb)
1530 {
1531         struct net_device *dev;
1532         struct net_device *last;
1533         struct net *net;
1534         int err;
1535
1536         rtnl_lock();
1537         err = raw_notifier_chain_register(&netdev_chain, nb);
1538         if (err)
1539                 goto unlock;
1540         if (dev_boot_phase)
1541                 goto unlock;
1542         for_each_net(net) {
1543                 for_each_netdev(net, dev) {
1544                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1545                         err = notifier_to_errno(err);
1546                         if (err)
1547                                 goto rollback;
1548
1549                         if (!(dev->flags & IFF_UP))
1550                                 continue;
1551
1552                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1553                 }
1554         }
1555
1556 unlock:
1557         rtnl_unlock();
1558         return err;
1559
1560 rollback:
1561         last = dev;
1562         for_each_net(net) {
1563                 for_each_netdev(net, dev) {
1564                         if (dev == last)
1565                                 goto outroll;
1566
1567                         if (dev->flags & IFF_UP) {
1568                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1569                                                         dev);
1570                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1571                         }
1572                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1573                 }
1574         }
1575
1576 outroll:
1577         raw_notifier_chain_unregister(&netdev_chain, nb);
1578         goto unlock;
1579 }
1580 EXPORT_SYMBOL(register_netdevice_notifier);
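/*
 * Example: a sketch of a client of this interface. A module can watch for
 * device events by registering a notifier block; the names below
 * (example_netdev_event, example_nb) are hypothetical.
 *
 *      static int example_netdev_event(struct notifier_block *nb,
 *                                      unsigned long event, void *ptr)
 *      {
 *              struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *              if (event == NETDEV_UP)
 *                      pr_info("%s is up\n", dev->name);
 *              return NOTIFY_DONE;
 *      }
 *
 *      static struct notifier_block example_nb = {
 *              .notifier_call = example_netdev_event,
 *      };
 *
 *      register_netdevice_notifier(&example_nb);     (from module init)
 *      unregister_netdevice_notifier(&example_nb);   (from module exit)
 *
 * Because registration and up events are replayed, the handler above also
 * sees devices that already existed when it was registered.
 */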
1581
1582 /**
1583  *      unregister_netdevice_notifier - unregister a network notifier block
1584  *      @nb: notifier
1585  *
1586  *      Unregister a notifier previously registered by
1587  *      register_netdevice_notifier(). The notifier is unlinked from the
1588  *      kernel structures and may then be reused. A negative errno code
1589  *      is returned on a failure.
1590  *
1591  *      After unregistering, unregister and down device events are synthesized
1592  *      for all devices on the device list and sent to the removed notifier,
1593  *      removing the need for special-case cleanup code.
1594  */
1595
1596 int unregister_netdevice_notifier(struct notifier_block *nb)
1597 {
1598         struct net_device *dev;
1599         struct net *net;
1600         int err;
1601
1602         rtnl_lock();
1603         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1604         if (err)
1605                 goto unlock;
1606
1607         for_each_net(net) {
1608                 for_each_netdev(net, dev) {
1609                         if (dev->flags & IFF_UP) {
1610                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1611                                                         dev);
1612                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1613                         }
1614                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1615                 }
1616         }
1617 unlock:
1618         rtnl_unlock();
1619         return err;
1620 }
1621 EXPORT_SYMBOL(unregister_netdevice_notifier);
1622
1623 /**
1624  *      call_netdevice_notifiers_info - call all network notifier blocks
1625  *      @val: value passed unmodified to notifier function
1626  *      @dev: net_device pointer passed unmodified to notifier function
1627  *      @info: notifier information data
1628  *
1629  *      Call all network notifier blocks.  Parameters and return value
1630  *      are as for raw_notifier_call_chain().
1631  */
1632
1633 static int call_netdevice_notifiers_info(unsigned long val,
1634                                          struct net_device *dev,
1635                                          struct netdev_notifier_info *info)
1636 {
1637         ASSERT_RTNL();
1638         netdev_notifier_info_init(info, dev);
1639         return raw_notifier_call_chain(&netdev_chain, val, info);
1640 }
1641
1642 /**
1643  *      call_netdevice_notifiers - call all network notifier blocks
1644  *      @val: value passed unmodified to notifier function
1645  *      @dev: net_device pointer passed unmodified to notifier function
1646  *
1647  *      Call all network notifier blocks.  Parameters and return value
1648  *      are as for raw_notifier_call_chain().
1649  */
1650
1651 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1652 {
1653         struct netdev_notifier_info info;
1654
1655         return call_netdevice_notifiers_info(val, dev, &info);
1656 }
1657 EXPORT_SYMBOL(call_netdevice_notifiers);
1658
1659 /**
1660  *      call_netdevice_notifiers_mtu - call all network notifier blocks
1661  *      @val: value passed unmodified to notifier function
1662  *      @dev: net_device pointer passed unmodified to notifier function
1663  *      @arg: additional u32 argument passed to the notifier function
1664  *
1665  *      Call all network notifier blocks.  Parameters and return value
1666  *      are as for raw_notifier_call_chain().
1667  */
1668 static int call_netdevice_notifiers_mtu(unsigned long val,
1669                                         struct net_device *dev, u32 arg)
1670 {
1671         struct netdev_notifier_info_ext info = {
1672                 .info.dev = dev,
1673                 .ext.mtu = arg,
1674         };
1675
1676         BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
1677
1678         return call_netdevice_notifiers_info(val, dev, &info.info);
1679 }
1680
1681 #ifdef CONFIG_NET_INGRESS
1682 static struct static_key ingress_needed __read_mostly;
1683
1684 void net_inc_ingress_queue(void)
1685 {
1686         static_key_slow_inc(&ingress_needed);
1687 }
1688 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1689
1690 void net_dec_ingress_queue(void)
1691 {
1692         static_key_slow_dec(&ingress_needed);
1693 }
1694 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1695 #endif
1696
1697 static struct static_key netstamp_needed __read_mostly;
1698 #ifdef HAVE_JUMP_LABEL
1699 static atomic_t netstamp_needed_deferred;
1700 static atomic_t netstamp_wanted;
1701 static void netstamp_clear(struct work_struct *work)
1702 {
1703         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1704         int wanted;
1705
1706         wanted = atomic_add_return(deferred, &netstamp_wanted);
1707         if (wanted > 0)
1708                 static_key_enable(&netstamp_needed);
1709         else
1710                 static_key_disable(&netstamp_needed);
1711 }
1712 static DECLARE_WORK(netstamp_work, netstamp_clear);
1713 #endif
1714
1715 void net_enable_timestamp(void)
1716 {
1717 #ifdef HAVE_JUMP_LABEL
1718         int wanted;
1719
1720         while (1) {
1721                 wanted = atomic_read(&netstamp_wanted);
1722                 if (wanted <= 0)
1723                         break;
1724                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1725                         return;
1726         }
1727         atomic_inc(&netstamp_needed_deferred);
1728         schedule_work(&netstamp_work);
1729 #else
1730         static_key_slow_inc(&netstamp_needed);
1731 #endif
1732 }
1733 EXPORT_SYMBOL(net_enable_timestamp);
1734
1735 void net_disable_timestamp(void)
1736 {
1737 #ifdef HAVE_JUMP_LABEL
1738         int wanted;
1739
1740         while (1) {
1741                 wanted = atomic_read(&netstamp_wanted);
1742                 if (wanted <= 1)
1743                         break;
1744                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1745                         return;
1746         }
1747         atomic_dec(&netstamp_needed_deferred);
1748         schedule_work(&netstamp_work);
1749 #else
1750         static_key_slow_dec(&netstamp_needed);
1751 #endif
1752 }
1753 EXPORT_SYMBOL(net_disable_timestamp);
1754
1755 static inline void net_timestamp_set(struct sk_buff *skb)
1756 {
1757         skb->tstamp.tv64 = 0;
1758         if (static_key_false(&netstamp_needed))
1759                 __net_timestamp(skb);
1760 }
1761
1762 #define net_timestamp_check(COND, SKB)                  \
1763         if (static_key_false(&netstamp_needed)) {               \
1764                 if ((COND) && !(SKB)->tstamp.tv64)      \
1765                         __net_timestamp(SKB);           \
1766         }                                               \
1767
1768 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1769 {
1770         unsigned int len;
1771
1772         if (!(dev->flags & IFF_UP))
1773                 return false;
1774
1775         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1776         if (skb->len <= len)
1777                 return true;
1778
1779         /* if the skb is GSO, we don't care about the length, as the packet
1780          * could be forwarded without being segmented first
1781          */
1782         if (skb_is_gso(skb))
1783                 return true;
1784
1785         return false;
1786 }
1787 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1788
1789 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1790 {
1791         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1792             unlikely(!is_skb_forwardable(dev, skb))) {
1793                 atomic_long_inc(&dev->rx_dropped);
1794                 kfree_skb(skb);
1795                 return NET_RX_DROP;
1796         }
1797
1798         skb_scrub_packet(skb, true);
1799         skb->priority = 0;
1800         skb->protocol = eth_type_trans(skb, dev);
1801         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1802
1803         return 0;
1804 }
1805 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1806
1807 /**
1808  * dev_forward_skb - loopback an skb to another netif
1809  *
1810  * @dev: destination network device
1811  * @skb: buffer to forward
1812  *
1813  * return values:
1814  *      NET_RX_SUCCESS  (no congestion)
1815  *      NET_RX_DROP     (packet was dropped, but freed)
1816  *
1817  * dev_forward_skb can be used for injecting an skb from the
1818  * start_xmit function of one device into the receive queue
1819  * of another device.
1820  *
1821  * The receiving device may be in another namespace, so
1822  * we have to clear all information in the skb that could
1823  * impact namespace isolation.
1824  */
1825 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1826 {
1827         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1828 }
1829 EXPORT_SYMBOL_GPL(dev_forward_skb);
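/*
 * Example: a sketch of the usage described above, injecting an skb from one
 * device's ndo_start_xmit into its peer's receive path (as a veth-like
 * driver would). "example_get_peer" is a hypothetical helper.
 *
 *      static netdev_tx_t example_xmit(struct sk_buff *skb,
 *                                      struct net_device *dev)
 *      {
 *              struct net_device *peer = example_get_peer(dev);
 *
 *              if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *                      dev->stats.tx_dropped++;
 *              return NETDEV_TX_OK;
 *      }
 *
 * Note that the skb is always consumed: on NET_RX_DROP it has already been
 * freed, so the caller must not touch it afterwards.
 */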
1830
1831 static inline int deliver_skb(struct sk_buff *skb,
1832                               struct packet_type *pt_prev,
1833                               struct net_device *orig_dev)
1834 {
1835         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1836                 return -ENOMEM;
1837         atomic_inc(&skb->users);
1838         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1839 }
1840
1841 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1842                                           struct packet_type **pt,
1843                                           struct net_device *orig_dev,
1844                                           __be16 type,
1845                                           struct list_head *ptype_list)
1846 {
1847         struct packet_type *ptype, *pt_prev = *pt;
1848
1849         list_for_each_entry_rcu(ptype, ptype_list, list) {
1850                 if (ptype->type != type)
1851                         continue;
1852                 if (pt_prev)
1853                         deliver_skb(skb, pt_prev, orig_dev);
1854                 pt_prev = ptype;
1855         }
1856         *pt = pt_prev;
1857 }
1858
1859 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1860 {
1861         if (!ptype->af_packet_priv || !skb->sk)
1862                 return false;
1863
1864         if (ptype->id_match)
1865                 return ptype->id_match(ptype, skb->sk);
1866         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1867                 return true;
1868
1869         return false;
1870 }
1871
1872 /*
1873  *      Support routine. Sends outgoing frames to any network
1874  *      taps currently in use.
1875  */
1876
1877 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1878 {
1879         struct packet_type *ptype;
1880         struct sk_buff *skb2 = NULL;
1881         struct packet_type *pt_prev = NULL;
1882         struct list_head *ptype_list = &ptype_all;
1883
1884         rcu_read_lock();
1885 again:
1886         list_for_each_entry_rcu(ptype, ptype_list, list) {
1887                 /* Never send packets back to the socket
1888                  * they originated from - MvS (miquels@drinkel.ow.org)
1889                  */
1890                 if (skb_loop_sk(ptype, skb))
1891                         continue;
1892
1893                 if (pt_prev) {
1894                         deliver_skb(skb2, pt_prev, skb->dev);
1895                         pt_prev = ptype;
1896                         continue;
1897                 }
1898
1899                 /* need to clone skb, done only once */
1900                 skb2 = skb_clone(skb, GFP_ATOMIC);
1901                 if (!skb2)
1902                         goto out_unlock;
1903
1904                 net_timestamp_set(skb2);
1905
1906                 /* The network header should already be
1907                  * set correctly by the sender, so the check below is
1908                  * just protection against buggy protocols.
1909                  */
1910                 skb_reset_mac_header(skb2);
1911
1912                 if (skb_network_header(skb2) < skb2->data ||
1913                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1914                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1915                                              ntohs(skb2->protocol),
1916                                              dev->name);
1917                         skb_reset_network_header(skb2);
1918                 }
1919
1920                 skb2->transport_header = skb2->network_header;
1921                 skb2->pkt_type = PACKET_OUTGOING;
1922                 pt_prev = ptype;
1923         }
1924
1925         if (ptype_list == &ptype_all) {
1926                 ptype_list = &dev->ptype_all;
1927                 goto again;
1928         }
1929 out_unlock:
1930         if (pt_prev)
1931                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1932         rcu_read_unlock();
1933 }
1934
1935 /**
1936  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1937  * @dev: Network device
1938  * @txq: number of queues available
1939  *
1940  * If real_num_tx_queues is changed the tc mappings may no longer be
1941  * valid. To resolve this, verify that each tc mapping remains valid
1942  * and, if not, reset that mapping to zero. Once no priorities map to
1943  * an offset/count pair it will no longer be used. In the worst case,
1944  * if TC0 itself is invalid, nothing can be done, so priority mappings
1945  * are disabled entirely. It is expected that drivers will fix this
1946  * mapping, if they can, before calling netif_set_real_num_tx_queues.
1947  */
1948 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1949 {
1950         int i;
1951         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1952
1953         /* If TC0 is invalidated disable TC mapping */
1954         if (tc->offset + tc->count > txq) {
1955                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1956                 dev->num_tc = 0;
1957                 return;
1958         }
1959
1960         /* Invalidated prio to tc mappings set to TC0 */
1961         for (i = 1; i < TC_BITMASK + 1; i++) {
1962                 int q = netdev_get_prio_tc_map(dev, i);
1963
1964                 tc = &dev->tc_to_txq[q];
1965                 if (tc->offset + tc->count > txq) {
1966                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1967                                 i, q);
1968                         netdev_set_prio_tc_map(dev, i, 0);
1969                 }
1970         }
1971 }
1972
1973 #ifdef CONFIG_XPS
1974 static DEFINE_MUTEX(xps_map_mutex);
1975 #define xmap_dereference(P)             \
1976         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1977
1978 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1979                                         int cpu, u16 index)
1980 {
1981         struct xps_map *map = NULL;
1982         int pos;
1983
1984         if (dev_maps)
1985                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1986
1987         for (pos = 0; map && pos < map->len; pos++) {
1988                 if (map->queues[pos] == index) {
1989                         if (map->len > 1) {
1990                                 map->queues[pos] = map->queues[--map->len];
1991                         } else {
1992                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1993                                 kfree_rcu(map, rcu);
1994                                 map = NULL;
1995                         }
1996                         break;
1997                 }
1998         }
1999
2000         return map;
2001 }
2002
2003 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2004 {
2005         struct xps_dev_maps *dev_maps;
2006         int cpu, i;
2007         bool active = false;
2008
2009         mutex_lock(&xps_map_mutex);
2010         dev_maps = xmap_dereference(dev->xps_maps);
2011
2012         if (!dev_maps)
2013                 goto out_no_maps;
2014
2015         for_each_possible_cpu(cpu) {
2016                 for (i = index; i < dev->num_tx_queues; i++) {
2017                         if (!remove_xps_queue(dev_maps, cpu, i))
2018                                 break;
2019                 }
2020                 if (i == dev->num_tx_queues)
2021                         active = true;
2022         }
2023
2024         if (!active) {
2025                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2026                 kfree_rcu(dev_maps, rcu);
2027         }
2028
2029         for (i = index; i < dev->num_tx_queues; i++)
2030                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2031                                              NUMA_NO_NODE);
2032
2033 out_no_maps:
2034         mutex_unlock(&xps_map_mutex);
2035 }
2036
2037 static struct xps_map *expand_xps_map(struct xps_map *map,
2038                                       int cpu, u16 index)
2039 {
2040         struct xps_map *new_map;
2041         int alloc_len = XPS_MIN_MAP_ALLOC;
2042         int i, pos;
2043
2044         for (pos = 0; map && pos < map->len; pos++) {
2045                 if (map->queues[pos] != index)
2046                         continue;
2047                 return map;
2048         }
2049
2050         /* Need to add queue to this CPU's existing map */
2051         if (map) {
2052                 if (pos < map->alloc_len)
2053                         return map;
2054
2055                 alloc_len = map->alloc_len * 2;
2056         }
2057
2058         /* Need to allocate a new map to store the queue for this CPU */
2059         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2060                                cpu_to_node(cpu));
2061         if (!new_map)
2062                 return NULL;
2063
2064         for (i = 0; i < pos; i++)
2065                 new_map->queues[i] = map->queues[i];
2066         new_map->alloc_len = alloc_len;
2067         new_map->len = pos;
2068
2069         return new_map;
2070 }
2071
2072 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2073                         u16 index)
2074 {
2075         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2076         struct xps_map *map, *new_map;
2077         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2078         int cpu, numa_node_id = -2;
2079         bool active = false;
2080
2081         mutex_lock(&xps_map_mutex);
2082
2083         dev_maps = xmap_dereference(dev->xps_maps);
2084
2085         /* allocate memory for queue storage */
2086         for_each_online_cpu(cpu) {
2087                 if (!cpumask_test_cpu(cpu, mask))
2088                         continue;
2089
2090                 if (!new_dev_maps)
2091                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2092                 if (!new_dev_maps) {
2093                         mutex_unlock(&xps_map_mutex);
2094                         return -ENOMEM;
2095                 }
2096
2097                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2098                                  NULL;
2099
2100                 map = expand_xps_map(map, cpu, index);
2101                 if (!map)
2102                         goto error;
2103
2104                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2105         }
2106
2107         if (!new_dev_maps)
2108                 goto out_no_new_maps;
2109
2110         for_each_possible_cpu(cpu) {
2111                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2112                         /* add queue to CPU maps */
2113                         int pos = 0;
2114
2115                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2116                         while ((pos < map->len) && (map->queues[pos] != index))
2117                                 pos++;
2118
2119                         if (pos == map->len)
2120                                 map->queues[map->len++] = index;
2121 #ifdef CONFIG_NUMA
2122                         if (numa_node_id == -2)
2123                                 numa_node_id = cpu_to_node(cpu);
2124                         else if (numa_node_id != cpu_to_node(cpu))
2125                                 numa_node_id = -1;
2126 #endif
2127                 } else if (dev_maps) {
2128                         /* fill in the new device map from the old device map */
2129                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2130                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2131                 }
2132
2133         }
2134
2135         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2136
2137         /* Cleanup old maps */
2138         if (dev_maps) {
2139                 for_each_possible_cpu(cpu) {
2140                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2141                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2142                         if (map && map != new_map)
2143                                 kfree_rcu(map, rcu);
2144                 }
2145
2146                 kfree_rcu(dev_maps, rcu);
2147         }
2148
2149         dev_maps = new_dev_maps;
2150         active = true;
2151
2152 out_no_new_maps:
2153         /* update Tx queue numa node */
2154         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2155                                      (numa_node_id >= 0) ? numa_node_id :
2156                                      NUMA_NO_NODE);
2157
2158         if (!dev_maps)
2159                 goto out_no_maps;
2160
2161         /* removes queue from unused CPUs */
2162         for_each_possible_cpu(cpu) {
2163                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2164                         continue;
2165
2166                 if (remove_xps_queue(dev_maps, cpu, index))
2167                         active = true;
2168         }
2169
2170         /* free map if not active */
2171         if (!active) {
2172                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2173                 kfree_rcu(dev_maps, rcu);
2174         }
2175
2176 out_no_maps:
2177         mutex_unlock(&xps_map_mutex);
2178
2179         return 0;
2180 error:
2181         /* remove any maps that we added */
2182         for_each_possible_cpu(cpu) {
2183                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2184                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2185                                  NULL;
2186                 if (new_map && new_map != map)
2187                         kfree(new_map);
2188         }
2189
2190         mutex_unlock(&xps_map_mutex);
2191
2192         kfree(new_dev_maps);
2193         return -ENOMEM;
2194 }
2195 EXPORT_SYMBOL(netif_set_xps_queue);
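/*
 * Example: a sketch of how a driver might pin a TX queue to a CPU with XPS,
 * here mapping queue index "qid" to CPU "cpu" (both hypothetical), typically
 * while the device is being configured (e.g. under RTNL):
 *
 *      cpumask_var_t mask;
 *      int err;
 *
 *      if (!alloc_cpumask_var(&mask, GFP_KERNEL))
 *              return -ENOMEM;
 *      cpumask_set_cpu(cpu, mask);
 *      err = netif_set_xps_queue(dev, mask, qid);
 *      free_cpumask_var(mask);
 *
 * The same mapping can also be set from user space through
 * /sys/class/net/<dev>/queues/tx-<n>/xps_cpus.
 */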
2196
2197 #endif
2198 /*
2199  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2200  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2201  */
2202 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2203 {
2204         bool disabling;
2205         int rc;
2206
2207         disabling = txq < dev->real_num_tx_queues;
2208
2209         if (txq < 1 || txq > dev->num_tx_queues)
2210                 return -EINVAL;
2211
2212         if (dev->reg_state == NETREG_REGISTERED ||
2213             dev->reg_state == NETREG_UNREGISTERING) {
2214                 ASSERT_RTNL();
2215
2216                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2217                                                   txq);
2218                 if (rc)
2219                         return rc;
2220
2221                 if (dev->num_tc)
2222                         netif_setup_tc(dev, txq);
2223
2224                 dev->real_num_tx_queues = txq;
2225
2226                 if (disabling) {
2227                         synchronize_net();
2228                         qdisc_reset_all_tx_gt(dev, txq);
2229 #ifdef CONFIG_XPS
2230                         netif_reset_xps_queues_gt(dev, txq);
2231 #endif
2232                 }
2233         } else {
2234                 dev->real_num_tx_queues = txq;
2235         }
2236
2237         return 0;
2238 }
2239 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
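/*
 * Example: a sketch of a driver shrinking or growing its active TX queue set
 * at runtime (for instance when its channel count changes). The device was
 * allocated with dev->num_tx_queues as the upper bound; the RTNL must be
 * held once the device is registered.
 *
 *      unsigned int txq = min_t(unsigned int, num_online_cpus(),
 *                               dev->num_tx_queues);
 *      int err;
 *
 *      err = netif_set_real_num_tx_queues(dev, txq);
 *      if (err)
 *              return err;
 *
 * When the count is reduced, the function above takes care of flushing stale
 * skbs from the now-unused qdiscs and dropping their XPS mappings.
 */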
2240
2241 #ifdef CONFIG_SYSFS
2242 /**
2243  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2244  *      @dev: Network device
2245  *      @rxq: Actual number of RX queues
2246  *
2247  *      This must be called either with the rtnl_lock held or before
2248  *      registration of the net device.  Returns 0 on success, or a
2249  *      negative error code.  If called before registration, it always
2250  *      succeeds.
2251  */
2252 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2253 {
2254         int rc;
2255
2256         if (rxq < 1 || rxq > dev->num_rx_queues)
2257                 return -EINVAL;
2258
2259         if (dev->reg_state == NETREG_REGISTERED) {
2260                 ASSERT_RTNL();
2261
2262                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2263                                                   rxq);
2264                 if (rc)
2265                         return rc;
2266         }
2267
2268         dev->real_num_rx_queues = rxq;
2269         return 0;
2270 }
2271 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2272 #endif
2273
2274 /**
2275  * netif_get_num_default_rss_queues - default number of RSS queues
2276  *
2277  * This routine should set an upper limit on the number of RSS queues
2278  * used by default by multiqueue devices.
2279  */
2280 int netif_get_num_default_rss_queues(void)
2281 {
2282         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2283 }
2284 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2285
2286 static inline void __netif_reschedule(struct Qdisc *q)
2287 {
2288         struct softnet_data *sd;
2289         unsigned long flags;
2290
2291         local_irq_save(flags);
2292         sd = this_cpu_ptr(&softnet_data);
2293         q->next_sched = NULL;
2294         *sd->output_queue_tailp = q;
2295         sd->output_queue_tailp = &q->next_sched;
2296         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2297         local_irq_restore(flags);
2298 }
2299
2300 void __netif_schedule(struct Qdisc *q)
2301 {
2302         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2303                 __netif_reschedule(q);
2304 }
2305 EXPORT_SYMBOL(__netif_schedule);
2306
2307 struct dev_kfree_skb_cb {
2308         enum skb_free_reason reason;
2309 };
2310
2311 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2312 {
2313         return (struct dev_kfree_skb_cb *)skb->cb;
2314 }
2315
2316 void netif_schedule_queue(struct netdev_queue *txq)
2317 {
2318         rcu_read_lock();
2319         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2320                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2321
2322                 __netif_schedule(q);
2323         }
2324         rcu_read_unlock();
2325 }
2326 EXPORT_SYMBOL(netif_schedule_queue);
2327
2328 /**
2329  *      netif_wake_subqueue - allow sending packets on subqueue
2330  *      @dev: network device
2331  *      @queue_index: sub queue index
2332  *
2333  * Resume individual transmit queue of a device with multiple transmit queues.
2334  */
2335 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2336 {
2337         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2338
2339         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2340                 struct Qdisc *q;
2341
2342                 rcu_read_lock();
2343                 q = rcu_dereference(txq->qdisc);
2344                 __netif_schedule(q);
2345                 rcu_read_unlock();
2346         }
2347 }
2348 EXPORT_SYMBOL(netif_wake_subqueue);
2349
2350 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2351 {
2352         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2353                 struct Qdisc *q;
2354
2355                 rcu_read_lock();
2356                 q = rcu_dereference(dev_queue->qdisc);
2357                 __netif_schedule(q);
2358                 rcu_read_unlock();
2359         }
2360 }
2361 EXPORT_SYMBOL(netif_tx_wake_queue);
2362
2363 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2364 {
2365         unsigned long flags;
2366
2367         if (unlikely(!skb))
2368                 return;
2369
2370         if (likely(atomic_read(&skb->users) == 1)) {
2371                 smp_rmb();
2372                 atomic_set(&skb->users, 0);
2373         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2374                 return;
2375         }
2376         get_kfree_skb_cb(skb)->reason = reason;
2377         local_irq_save(flags);
2378         skb->next = __this_cpu_read(softnet_data.completion_queue);
2379         __this_cpu_write(softnet_data.completion_queue, skb);
2380         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2381         local_irq_restore(flags);
2382 }
2383 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2384
2385 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2386 {
2387         if (in_irq() || irqs_disabled())
2388                 __dev_kfree_skb_irq(skb, reason);
2389         else
2390                 dev_kfree_skb(skb);
2391 }
2392 EXPORT_SYMBOL(__dev_kfree_skb_any);
2393
2394
2395 /**
2396  * netif_device_detach - mark device as removed
2397  * @dev: network device
2398  *
2399  * Mark device as removed from system and therefore no longer available.
2400  */
2401 void netif_device_detach(struct net_device *dev)
2402 {
2403         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2404             netif_running(dev)) {
2405                 netif_tx_stop_all_queues(dev);
2406         }
2407 }
2408 EXPORT_SYMBOL(netif_device_detach);
2409
2410 /**
2411  * netif_device_attach - mark device as attached
2412  * @dev: network device
2413  *
2414  * Mark device as attached to the system and restart if needed.
2415  */
2416 void netif_device_attach(struct net_device *dev)
2417 {
2418         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2419             netif_running(dev)) {
2420                 netif_tx_wake_all_queues(dev);
2421                 __netdev_watchdog_up(dev);
2422         }
2423 }
2424 EXPORT_SYMBOL(netif_device_attach);
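/*
 * Example: netif_device_detach()/netif_device_attach() are typically paired
 * in a driver's suspend/resume (or surprise-removal) paths; a rough sketch,
 * with the example_* names and power helpers being hypothetical:
 *
 *      static int example_suspend(struct device *d)
 *      {
 *              struct net_device *dev = dev_get_drvdata(d);
 *
 *              netif_device_detach(dev);
 *              example_hw_power_down(dev);             (hypothetical)
 *              return 0;
 *      }
 *
 *      static int example_resume(struct device *d)
 *      {
 *              struct net_device *dev = dev_get_drvdata(d);
 *
 *              example_hw_power_up(dev);               (hypothetical)
 *              netif_device_attach(dev);
 *              return 0;
 *      }
 */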
2425
2426 /*
2427  * Returns a Tx hash based on the given packet descriptor and the number
2428  * of Tx queues to be used as a distribution range.
2429  */
2430 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2431                   unsigned int num_tx_queues)
2432 {
2433         u32 hash;
2434         u16 qoffset = 0;
2435         u16 qcount = num_tx_queues;
2436
2437         if (skb_rx_queue_recorded(skb)) {
2438                 hash = skb_get_rx_queue(skb);
2439                 while (unlikely(hash >= num_tx_queues))
2440                         hash -= num_tx_queues;
2441                 return hash;
2442         }
2443
2444         if (dev->num_tc) {
2445                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2446                 qoffset = dev->tc_to_txq[tc].offset;
2447                 qcount = dev->tc_to_txq[tc].count;
2448         }
2449
2450         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2451 }
2452 EXPORT_SYMBOL(__skb_tx_hash);
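/*
 * Worked example of the reciprocal_scale() mapping above (no traffic
 * classes, so qoffset = 0 and qcount = num_tx_queues = 4): for a flow hash
 * of 0x9e3779b9 the selected queue is ((u64)0x9e3779b9 * 4) >> 32 = 2, i.e.
 * the hash is scaled into [0, qcount) without a division.
 */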
2453
2454 static void skb_warn_bad_offload(const struct sk_buff *skb)
2455 {
2456         static const netdev_features_t null_features = 0;
2457         struct net_device *dev = skb->dev;
2458         const char *name = "";
2459
2460         if (!net_ratelimit())
2461                 return;
2462
2463         if (dev) {
2464                 if (dev->dev.parent)
2465                         name = dev_driver_string(dev->dev.parent);
2466                 else
2467                         name = netdev_name(dev);
2468         }
2469         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2470              "gso_type=%d ip_summed=%d\n",
2471              name, dev ? &dev->features : &null_features,
2472              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2473              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2474              skb_shinfo(skb)->gso_type, skb->ip_summed);
2475 }
2476
2477 /*
2478  * Invalidate hardware checksum when packet is to be mangled, and
2479  * complete checksum manually on outgoing path.
2480  */
2481 int skb_checksum_help(struct sk_buff *skb)
2482 {
2483         __wsum csum;
2484         int ret = 0, offset;
2485
2486         if (skb->ip_summed == CHECKSUM_COMPLETE)
2487                 goto out_set_summed;
2488
2489         if (unlikely(skb_shinfo(skb)->gso_size)) {
2490                 skb_warn_bad_offload(skb);
2491                 return -EINVAL;
2492         }
2493
2494         /* Before computing a checksum, we should make sure no frag could
2495          * be modified by an external entity: the checksum could be wrong.
2496          */
2497         if (skb_has_shared_frag(skb)) {
2498                 ret = __skb_linearize(skb);
2499                 if (ret)
2500                         goto out;
2501         }
2502
2503         offset = skb_checksum_start_offset(skb);
2504         BUG_ON(offset >= skb_headlen(skb));
2505         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2506
2507         offset += skb->csum_offset;
2508         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2509
2510         if (skb_cloned(skb) &&
2511             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2512                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2513                 if (ret)
2514                         goto out;
2515         }
2516
2517         *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2518 out_set_summed:
2519         skb->ip_summed = CHECKSUM_NONE;
2520 out:
2521         return ret;
2522 }
2523 EXPORT_SYMBOL(skb_checksum_help);
2524
2525 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2526 {
2527         __be16 type = skb->protocol;
2528
2529         /* Tunnel gso handlers can set protocol to ethernet. */
2530         if (type == htons(ETH_P_TEB)) {
2531                 struct ethhdr *eth;
2532
2533                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2534                         return 0;
2535
2536                 eth = (struct ethhdr *)skb->data;
2537                 type = eth->h_proto;
2538         }
2539
2540         return __vlan_get_protocol(skb, type, depth);
2541 }
2542
2543 /**
2544  *      skb_mac_gso_segment - mac layer segmentation handler.
2545  *      @skb: buffer to segment
2546  *      @features: features for the output path (see dev->features)
2547  */
2548 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2549                                     netdev_features_t features)
2550 {
2551         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2552         struct packet_offload *ptype;
2553         int vlan_depth = skb->mac_len;
2554         __be16 type = skb_network_protocol(skb, &vlan_depth);
2555
2556         if (unlikely(!type))
2557                 return ERR_PTR(-EINVAL);
2558
2559         __skb_pull(skb, vlan_depth);
2560
2561         rcu_read_lock();
2562         list_for_each_entry_rcu(ptype, &offload_base, list) {
2563                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2564                         segs = ptype->callbacks.gso_segment(skb, features);
2565                         break;
2566                 }
2567         }
2568         rcu_read_unlock();
2569
2570         __skb_push(skb, skb->data - skb_mac_header(skb));
2571
2572         return segs;
2573 }
2574 EXPORT_SYMBOL(skb_mac_gso_segment);
2575
2576
2577 /* openvswitch calls this on rx path, so we need a different check.
2578  */
2579 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2580 {
2581         if (tx_path)
2582                 return skb->ip_summed != CHECKSUM_PARTIAL &&
2583                        skb->ip_summed != CHECKSUM_UNNECESSARY;
2584
2585         return skb->ip_summed == CHECKSUM_NONE;
2586 }
2587
2588 /**
2589  *      __skb_gso_segment - Perform segmentation on skb.
2590  *      @skb: buffer to segment
2591  *      @features: features for the output path (see dev->features)
2592  *      @tx_path: whether it is called in TX path
2593  *
2594  *      This function segments the given skb and returns a list of segments.
2595  *
2596  *      It may return NULL if the skb requires no segmentation.  This is
2597  *      only possible when GSO is used for verifying header integrity.
2598  *
2599  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2600  */
2601 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2602                                   netdev_features_t features, bool tx_path)
2603 {
2604         struct sk_buff *segs;
2605
2606         if (unlikely(skb_needs_check(skb, tx_path))) {
2607                 int err;
2608
2609                 /* We're going to init ->check field in TCP or UDP header */
2610                 err = skb_cow_head(skb, 0);
2611                 if (err < 0)
2612                         return ERR_PTR(err);
2613         }
2614
2615         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2616                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2617
2618         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2619         SKB_GSO_CB(skb)->encap_level = 0;
2620
2621         skb_reset_mac_header(skb);
2622         skb_reset_mac_len(skb);
2623
2624         segs = skb_mac_gso_segment(skb, features);
2625
2626         if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
2627                 skb_warn_bad_offload(skb);
2628
2629         return segs;
2630 }
2631 EXPORT_SYMBOL(__skb_gso_segment);
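/*
 * Example: a sketch of how a caller consumes the returned segment list (this
 * mirrors what validate_xmit_skb() below does via skb_gso_segment()). The
 * "drop" label and the transmit step are placeholders for caller code.
 *
 *      segs = __skb_gso_segment(skb, features, true);
 *      if (IS_ERR(segs))
 *              goto drop;
 *      if (segs) {
 *              consume_skb(skb);       (original replaced by the segments)
 *              skb = segs;
 *      }
 *      while (skb) {
 *              struct sk_buff *next = skb->next;
 *
 *              skb->next = NULL;
 *              ... transmit skb ...
 *              skb = next;
 *      }
 */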
2632
2633 /* Take action when hardware reception checksum errors are detected. */
2634 #ifdef CONFIG_BUG
2635 void netdev_rx_csum_fault(struct net_device *dev)
2636 {
2637         if (net_ratelimit()) {
2638                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2639                 dump_stack();
2640         }
2641 }
2642 EXPORT_SYMBOL(netdev_rx_csum_fault);
2643 #endif
2644
2645 /* Actually, we should eliminate this check as soon as we know that:
2646  * 1. An IOMMU is present and is able to map all the memory.
2647  * 2. No high memory really exists on this machine.
2648  */
2649
2650 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2651 {
2652 #ifdef CONFIG_HIGHMEM
2653         int i;
2654         if (!(dev->features & NETIF_F_HIGHDMA)) {
2655                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2656                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2657                         if (PageHighMem(skb_frag_page(frag)))
2658                                 return 1;
2659                 }
2660         }
2661
2662         if (PCI_DMA_BUS_IS_PHYS) {
2663                 struct device *pdev = dev->dev.parent;
2664
2665                 if (!pdev)
2666                         return 0;
2667                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2668                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2669                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2670                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2671                                 return 1;
2672                 }
2673         }
2674 #endif
2675         return 0;
2676 }
2677
2678 /* If this is an MPLS offload request, verify we are testing hardware
2679  * MPLS features instead of the standard features for the netdev.
2680  */
2681 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2682 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2683                                            netdev_features_t features,
2684                                            __be16 type)
2685 {
2686         if (eth_p_mpls(type))
2687                 features &= skb->dev->mpls_features;
2688
2689         return features;
2690 }
2691 #else
2692 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2693                                            netdev_features_t features,
2694                                            __be16 type)
2695 {
2696         return features;
2697 }
2698 #endif
2699
2700 static netdev_features_t harmonize_features(struct sk_buff *skb,
2701         netdev_features_t features)
2702 {
2703         int tmp;
2704         __be16 type;
2705
2706         type = skb_network_protocol(skb, &tmp);
2707         features = net_mpls_features(skb, features, type);
2708
2709         if (skb->ip_summed != CHECKSUM_NONE &&
2710             !can_checksum_protocol(features, type)) {
2711                 features &= ~NETIF_F_ALL_CSUM;
2712         }
2713         if (illegal_highdma(skb->dev, skb))
2714                 features &= ~NETIF_F_SG;
2715
2716         return features;
2717 }
2718
2719 netdev_features_t passthru_features_check(struct sk_buff *skb,
2720                                           struct net_device *dev,
2721                                           netdev_features_t features)
2722 {
2723         return features;
2724 }
2725 EXPORT_SYMBOL(passthru_features_check);
2726
2727 static netdev_features_t dflt_features_check(struct sk_buff *skb,
2728                                              struct net_device *dev,
2729                                              netdev_features_t features)
2730 {
2731         return vlan_features_check(skb, features);
2732 }
2733
2734 netdev_features_t netif_skb_features(struct sk_buff *skb)
2735 {
2736         struct net_device *dev = skb->dev;
2737         netdev_features_t features = dev->features;
2738         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2739
2740         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2741                 features &= ~NETIF_F_GSO_MASK;
2742
2743         /* If this is an encapsulation offload request, verify we are
2744          * testing hardware encapsulation features instead of the
2745          * standard features for the netdev.
2746          */
2747         if (skb->encapsulation)
2748                 features &= dev->hw_enc_features;
2749
2750         if (skb_vlan_tagged(skb))
2751                 features = netdev_intersect_features(features,
2752                                                      dev->vlan_features |
2753                                                      NETIF_F_HW_VLAN_CTAG_TX |
2754                                                      NETIF_F_HW_VLAN_STAG_TX);
2755
2756         if (dev->netdev_ops->ndo_features_check)
2757                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2758                                                                 features);
2759         else
2760                 features &= dflt_features_check(skb, dev, features);
2761
2762         return harmonize_features(skb, features);
2763 }
2764 EXPORT_SYMBOL(netif_skb_features);
2765
2766 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2767                     struct netdev_queue *txq, bool more)
2768 {
2769         unsigned int len;
2770         int rc;
2771
2772         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2773                 dev_queue_xmit_nit(skb, dev);
2774
2775         len = skb->len;
2776         trace_net_dev_start_xmit(skb, dev);
2777         rc = netdev_start_xmit(skb, dev, txq, more);
2778         trace_net_dev_xmit(skb, rc, dev, len);
2779
2780         return rc;
2781 }
2782
2783 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2784                                     struct netdev_queue *txq, int *ret)
2785 {
2786         struct sk_buff *skb = first;
2787         int rc = NETDEV_TX_OK;
2788
2789         while (skb) {
2790                 struct sk_buff *next = skb->next;
2791
2792                 skb->next = NULL;
2793                 rc = xmit_one(skb, dev, txq, next != NULL);
2794                 if (unlikely(!dev_xmit_complete(rc))) {
2795                         skb->next = next;
2796                         goto out;
2797                 }
2798
2799                 skb = next;
2800                 if (netif_tx_queue_stopped(txq) && skb) {
2801                         rc = NETDEV_TX_BUSY;
2802                         break;
2803                 }
2804         }
2805
2806 out:
2807         *ret = rc;
2808         return skb;
2809 }
2810
2811 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2812                                           netdev_features_t features)
2813 {
2814         if (skb_vlan_tag_present(skb) &&
2815             !vlan_hw_offload_capable(features, skb->vlan_proto))
2816                 skb = __vlan_hwaccel_push_inside(skb);
2817         return skb;
2818 }
2819
2820 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2821 {
2822         netdev_features_t features;
2823
2824         if (skb->next)
2825                 return skb;
2826
2827         features = netif_skb_features(skb);
2828         skb = validate_xmit_vlan(skb, features);
2829         if (unlikely(!skb))
2830                 goto out_null;
2831
2832         if (netif_needs_gso(skb, features)) {
2833                 struct sk_buff *segs;
2834
2835                 segs = skb_gso_segment(skb, features);
2836                 if (IS_ERR(segs)) {
2837                         goto out_kfree_skb;
2838                 } else if (segs) {
2839                         consume_skb(skb);
2840                         skb = segs;
2841                 }
2842         } else {
2843                 if (skb_needs_linearize(skb, features) &&
2844                     __skb_linearize(skb))
2845                         goto out_kfree_skb;
2846
2847                 /* If packet is not checksummed and device does not
2848                  * support checksumming for this protocol, complete
2849                  * checksumming here.
2850                  */
2851                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2852                         if (skb->encapsulation)
2853                                 skb_set_inner_transport_header(skb,
2854                                                                skb_checksum_start_offset(skb));
2855                         else
2856                                 skb_set_transport_header(skb,
2857                                                          skb_checksum_start_offset(skb));
2858                         if (!(features & NETIF_F_ALL_CSUM) &&
2859                             skb_checksum_help(skb))
2860                                 goto out_kfree_skb;
2861                 }
2862         }
2863
2864         return skb;
2865
2866 out_kfree_skb:
2867         kfree_skb(skb);
2868 out_null:
2869         return NULL;
2870 }
2871
2872 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2873 {
2874         struct sk_buff *next, *head = NULL, *tail;
2875
2876         for (; skb != NULL; skb = next) {
2877                 next = skb->next;
2878                 skb->next = NULL;
2879
2880                 /* in case skb won't be segmented, point it at itself */
2881                 skb->prev = skb;
2882
2883                 skb = validate_xmit_skb(skb, dev);
2884                 if (!skb)
2885                         continue;
2886
2887                 if (!head)
2888                         head = skb;
2889                 else
2890                         tail->next = skb;
2891                 /* If skb was segmented, skb->prev points to
2892                  * the last segment. If not, it still points to skb itself.
2893                  */
2894                 tail = skb->prev;
2895         }
2896         return head;
2897 }
2898 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
2899
2900 static void qdisc_pkt_len_init(struct sk_buff *skb)
2901 {
2902         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2903
2904         qdisc_skb_cb(skb)->pkt_len = skb->len;
2905
2906         /* To get a more precise estimate of the bytes sent on the wire,
2907          * we add to pkt_len the header size of all segments
2908          */
2909         if (shinfo->gso_size)  {
2910                 unsigned int hdr_len;
2911                 u16 gso_segs = shinfo->gso_segs;
2912
2913                 /* mac layer + network layer */
2914                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2915
2916                 /* + transport layer */
2917                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
2918                         const struct tcphdr *th;
2919                         struct tcphdr _tcphdr;
2920
2921                         th = skb_header_pointer(skb, skb_transport_offset(skb),
2922                                                 sizeof(_tcphdr), &_tcphdr);
2923                         if (likely(th))
2924                                 hdr_len += __tcp_hdrlen(th);
2925                 } else {
2926                         struct udphdr _udphdr;
2927
2928                         if (skb_header_pointer(skb, skb_transport_offset(skb),
2929                                                sizeof(_udphdr), &_udphdr))
2930                                 hdr_len += sizeof(struct udphdr);
2931                 }
2932
2933                 if (shinfo->gso_type & SKB_GSO_DODGY)
2934                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2935                                                 shinfo->gso_size);
2936
2937                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2938         }
2939 }
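/*
 * Worked example for the adjustment above: a 3000-byte TSO skb with
 * gso_size = 1448 and 66 bytes of Ethernet + IP + TCP headers is reported as
 * 3 segments, so pkt_len becomes 3000 + (3 - 1) * 66 = 3132 bytes, which
 * matches what actually hits the wire once the headers are replicated for
 * every segment (ignoring link-layer framing overhead).
 */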
2940
2941 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2942                                  struct net_device *dev,
2943                                  struct netdev_queue *txq)
2944 {
2945         spinlock_t *root_lock = qdisc_lock(q);
2946         bool contended;
2947         int rc;
2948
2949         qdisc_pkt_len_init(skb);
2950         qdisc_calculate_pkt_len(skb, q);
2951         /*
2952          * Heuristic to force contended enqueues to serialize on a
2953          * separate lock before trying to get the qdisc main lock.
2954          * This permits the __QDISC___STATE_RUNNING owner to get the lock
2955          * more often and dequeue packets faster.
2956          */
2957         contended = qdisc_is_running(q);
2958         if (unlikely(contended))
2959                 spin_lock(&q->busylock);
2960
2961         spin_lock(root_lock);
2962         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2963                 kfree_skb(skb);
2964                 rc = NET_XMIT_DROP;
2965         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2966                    qdisc_run_begin(q)) {
2967                 /*
2968                  * This is a work-conserving queue; there are no old skbs
2969                  * waiting to be sent out; and the qdisc is not running -
2970                  * xmit the skb directly.
2971                  */
2972
2973                 qdisc_bstats_update(q, skb);
2974
2975                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2976                         if (unlikely(contended)) {
2977                                 spin_unlock(&q->busylock);
2978                                 contended = false;
2979                         }
2980                         __qdisc_run(q);
2981                 } else
2982                         qdisc_run_end(q);
2983
2984                 rc = NET_XMIT_SUCCESS;
2985         } else {
2986                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2987                 if (qdisc_run_begin(q)) {
2988                         if (unlikely(contended)) {
2989                                 spin_unlock(&q->busylock);
2990                                 contended = false;
2991                         }
2992                         __qdisc_run(q);
2993                 }
2994         }
2995         spin_unlock(root_lock);
2996         if (unlikely(contended))
2997                 spin_unlock(&q->busylock);
2998         return rc;
2999 }
3000
3001 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3002 static void skb_update_prio(struct sk_buff *skb)
3003 {
3004         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3005
3006         if (!skb->priority && skb->sk && map) {
3007                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
3008
3009                 if (prioidx < map->priomap_len)
3010                         skb->priority = map->priomap[prioidx];
3011         }
3012 }
3013 #else
3014 #define skb_update_prio(skb)
3015 #endif
3016
3017 DEFINE_PER_CPU(int, xmit_recursion);
3018 EXPORT_SYMBOL(xmit_recursion);
3019
3020 #define RECURSION_LIMIT 8
3021
3022 /**
3023  *      dev_loopback_xmit - loop back @skb
3024  *      @net: network namespace this loopback is happening in
3025  *      @sk:  sk needed to be a netfilter okfn
3026  *      @skb: buffer to transmit
3027  */
3028 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3029 {
3030         skb_reset_mac_header(skb);
3031         __skb_pull(skb, skb_network_offset(skb));
3032         skb->pkt_type = PACKET_LOOPBACK;
3033         skb->ip_summed = CHECKSUM_UNNECESSARY;
3034         WARN_ON(!skb_dst(skb));
3035         skb_dst_force(skb);
3036         netif_rx_ni(skb);
3037         return 0;
3038 }
3039 EXPORT_SYMBOL(dev_loopback_xmit);
3040
3041 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3042 {
3043 #ifdef CONFIG_XPS
3044         struct xps_dev_maps *dev_maps;
3045         struct xps_map *map;
3046         int queue_index = -1;
3047
3048         rcu_read_lock();
3049         dev_maps = rcu_dereference(dev->xps_maps);
3050         if (dev_maps) {
3051                 map = rcu_dereference(
3052                     dev_maps->cpu_map[skb->sender_cpu - 1]);
3053                 if (map) {
3054                         if (map->len == 1)
3055                                 queue_index = map->queues[0];
3056                         else
3057                                 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3058                                                                            map->len)];
3059                         if (unlikely(queue_index >= dev->real_num_tx_queues))
3060                                 queue_index = -1;
3061                 }
3062         }
3063         rcu_read_unlock();
3064
3065         return queue_index;
3066 #else
3067         return -1;
3068 #endif
3069 }
3070
3071 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3072 {
3073         struct sock *sk = skb->sk;
3074         int queue_index = sk_tx_queue_get(sk);
3075
3076         if (queue_index < 0 || skb->ooo_okay ||
3077             queue_index >= dev->real_num_tx_queues) {
3078                 int new_index = get_xps_queue(dev, skb);
3079                 if (new_index < 0)
3080                         new_index = skb_tx_hash(dev, skb);
3081
3082                 if (queue_index != new_index && sk &&
3083                     sk_fullsock(sk) &&
3084                     rcu_access_pointer(sk->sk_dst_cache))
3085                         sk_tx_queue_set(sk, new_index);
3086
3087                 queue_index = new_index;
3088         }
3089
3090         return queue_index;
3091 }
3092
3093 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3094                                     struct sk_buff *skb,
3095                                     void *accel_priv)
3096 {
3097         int queue_index = 0;
3098
3099 #ifdef CONFIG_XPS
3100         u32 sender_cpu = skb->sender_cpu - 1;
3101
3102         if (sender_cpu >= (u32)NR_CPUS)
3103                 skb->sender_cpu = raw_smp_processor_id() + 1;
3104 #endif
3105
3106         if (dev->real_num_tx_queues != 1) {
3107                 const struct net_device_ops *ops = dev->netdev_ops;
3108                 if (ops->ndo_select_queue)
3109                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3110                                                             __netdev_pick_tx);
3111                 else
3112                         queue_index = __netdev_pick_tx(dev, skb);
3113
3114                 if (!accel_priv)
3115                         queue_index = netdev_cap_txqueue(dev, queue_index);
3116         }
3117
3118         skb_set_queue_mapping(skb, queue_index);
3119         return netdev_get_tx_queue(dev, queue_index);
3120 }
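
/*
 * Illustrative sketch, not part of dev.c: a driver can override TX queue
 * selection by implementing ndo_select_queue(); netdev_pick_tx() above then
 * calls it with __netdev_pick_tx as the fallback.  The example_ names below
 * are hypothetical; only the ndo hook and the fallback type are kernel API.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
				void *accel_priv,
				select_queue_fallback_t fallback)
{
	/* Hypothetical policy: keep all ARP traffic on queue 0. */
	if (skb->protocol == htons(ETH_P_ARP))
		return 0;

	/* Otherwise defer to the stack's default (XPS/hash based) choice. */
	return fallback(dev, skb);
}

static const struct net_device_ops example_netdev_ops = {
	.ndo_select_queue	= example_select_queue,
	/* .ndo_start_xmit and the other callbacks are omitted in this sketch */
};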
3121
3122 /**
3123  *      __dev_queue_xmit - transmit a buffer
3124  *      @skb: buffer to transmit
3125  *      @accel_priv: private data used for L2 forwarding offload
3126  *
3127  *      Queue a buffer for transmission to a network device. The caller must
3128  *      have set the device and priority and built the buffer before calling
3129  *      this function. The function can be called from an interrupt.
3130  *
3131  *      A negative errno code is returned on a failure. A success does not
3132  *      guarantee the frame will be transmitted as it may be dropped due
3133  *      to congestion or traffic shaping.
3134  *
3135  * -----------------------------------------------------------------------------------
3136  *      I notice this method can also return errors from the queue disciplines,
3137  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3138  *      be positive.
3139  *
3140  *      Regardless of the return value, the skb is consumed, so it is currently
3141  *      difficult to retry a send to this method.  (You can bump the ref count
3142  *      before sending to hold a reference for retry if you are careful.)
3143  *
3144  *      When calling this method, interrupts MUST be enabled.  This is because
3145  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3146  *          --BLG
3147  */
3148 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3149 {
3150         struct net_device *dev = skb->dev;
3151         struct netdev_queue *txq;
3152         struct Qdisc *q;
3153         int rc = -ENOMEM;
3154
3155         skb_reset_mac_header(skb);
3156
3157         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3158                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3159
3160         /* Disable soft irqs for various locks below. Also
3161          * stops preemption for RCU.
3162          */
3163         rcu_read_lock_bh();
3164
3165         skb_update_prio(skb);
3166
3167         /* If the device/qdisc doesn't need skb->dst, release it right now
3168          * while it's still hot in this CPU's cache.
3169          */
3170         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3171                 skb_dst_drop(skb);
3172         else
3173                 skb_dst_force(skb);
3174
3175 #ifdef CONFIG_NET_SWITCHDEV
3176         /* Don't forward if the offload device has already forwarded it */
3177         if (skb->offload_fwd_mark &&
3178             skb->offload_fwd_mark == dev->offload_fwd_mark) {
3179                 consume_skb(skb);
3180                 rc = NET_XMIT_SUCCESS;
3181                 goto out;
3182         }
3183 #endif
3184
3185         txq = netdev_pick_tx(dev, skb, accel_priv);
3186         q = rcu_dereference_bh(txq->qdisc);
3187
3188 #ifdef CONFIG_NET_CLS_ACT
3189         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3190 #endif
3191         trace_net_dev_queue(skb);
3192         if (q->enqueue) {
3193                 rc = __dev_xmit_skb(skb, q, dev, txq);
3194                 goto out;
3195         }
3196
3197         /* The device has no queue. This is the common case for software
3198            devices: loopback, all sorts of tunnels...
3199
3200            Really, it is unlikely that netif_tx_lock protection is necessary
3201            here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
3202            counters.)
3203            However, it is possible that they rely on the protection
3204            we take here.
3205
3206            Check this and shoot the lock. It is not prone to deadlocks.
3207            Either shoot the noqueue qdisc, which is even simpler 8)
3208          */
3209         if (dev->flags & IFF_UP) {
3210                 int cpu = smp_processor_id(); /* ok because BHs are off */
3211
3212                 if (txq->xmit_lock_owner != cpu) {
3213
3214                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3215                                 goto recursion_alert;
3216
3217                         skb = validate_xmit_skb(skb, dev);
3218                         if (!skb)
3219                                 goto drop;
3220
3221                         HARD_TX_LOCK(dev, txq, cpu);
3222
3223                         if (!netif_xmit_stopped(txq)) {
3224                                 __this_cpu_inc(xmit_recursion);
3225                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3226                                 __this_cpu_dec(xmit_recursion);
3227                                 if (dev_xmit_complete(rc)) {
3228                                         HARD_TX_UNLOCK(dev, txq);
3229                                         goto out;
3230                                 }
3231                         }
3232                         HARD_TX_UNLOCK(dev, txq);
3233                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3234                                              dev->name);
3235                 } else {
3236                         /* Recursion is detected! It is possible,
3237                          * unfortunately
3238                          */
3239 recursion_alert:
3240                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3241                                              dev->name);
3242                 }
3243         }
3244
3245         rc = -ENETDOWN;
3246 drop:
3247         rcu_read_unlock_bh();
3248
3249         atomic_long_inc(&dev->tx_dropped);
3250         kfree_skb_list(skb);
3251         return rc;
3252 out:
3253         rcu_read_unlock_bh();
3254         return rc;
3255 }
3256
3257 int dev_queue_xmit(struct sk_buff *skb)
3258 {
3259         return __dev_queue_xmit(skb, NULL);
3260 }
3261 EXPORT_SYMBOL(dev_queue_xmit);
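
/*
 * Illustrative sketch, not part of dev.c: a minimal sender that hands a
 * ready-made Ethernet frame to dev_queue_xmit().  The skb is consumed by the
 * call whether or not the frame is ultimately transmitted.  The example_
 * name is hypothetical; the skb helpers are regular kernel APIs.
 */
static int example_xmit_frame(struct net_device *dev,
			      const void *frame, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), frame, len);	/* frame already carries its L2 header */

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);	/* hypothetical: an IPv4 payload */

	/* May return a negative errno or a positive NET_XMIT_* code, see above. */
	return dev_queue_xmit(skb);
}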
3262
3263 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3264 {
3265         return __dev_queue_xmit(skb, accel_priv);
3266 }
3267 EXPORT_SYMBOL(dev_queue_xmit_accel);
3268
3269
3270 /*=======================================================================
3271                         Receiver routines
3272   =======================================================================*/
3273
3274 int netdev_max_backlog __read_mostly = 1000;
3275 EXPORT_SYMBOL(netdev_max_backlog);
3276
3277 int netdev_tstamp_prequeue __read_mostly = 1;
3278 int netdev_budget __read_mostly = 300;
3279 int weight_p __read_mostly = 64;            /* old backlog weight */
3280
3281 /* Called with irq disabled */
3282 static inline void ____napi_schedule(struct softnet_data *sd,
3283                                      struct napi_struct *napi)
3284 {
3285         list_add_tail(&napi->poll_list, &sd->poll_list);
3286         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3287 }
3288
3289 #ifdef CONFIG_RPS
3290
3291 /* One global table that all flow-based protocols share. */
3292 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3293 EXPORT_SYMBOL(rps_sock_flow_table);
3294 u32 rps_cpu_mask __read_mostly;
3295 EXPORT_SYMBOL(rps_cpu_mask);
3296
3297 struct static_key rps_needed __read_mostly;
3298
3299 static struct rps_dev_flow *
3300 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3301             struct rps_dev_flow *rflow, u16 next_cpu)
3302 {
3303         if (next_cpu < nr_cpu_ids) {
3304 #ifdef CONFIG_RFS_ACCEL
3305                 struct netdev_rx_queue *rxqueue;
3306                 struct rps_dev_flow_table *flow_table;
3307                 struct rps_dev_flow *old_rflow;
3308                 u32 flow_id;
3309                 u16 rxq_index;
3310                 int rc;
3311
3312                 /* Should we steer this flow to a different hardware queue? */
3313                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3314                     !(dev->features & NETIF_F_NTUPLE))
3315                         goto out;
3316                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3317                 if (rxq_index == skb_get_rx_queue(skb))
3318                         goto out;
3319
3320                 rxqueue = dev->_rx + rxq_index;
3321                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3322                 if (!flow_table)
3323                         goto out;
3324                 flow_id = skb_get_hash(skb) & flow_table->mask;
3325                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3326                                                         rxq_index, flow_id);
3327                 if (rc < 0)
3328                         goto out;
3329                 old_rflow = rflow;
3330                 rflow = &flow_table->flows[flow_id];
3331                 rflow->filter = rc;
3332                 if (old_rflow->filter == rflow->filter)
3333                         old_rflow->filter = RPS_NO_FILTER;
3334         out:
3335 #endif
3336                 rflow->last_qtail =
3337                         per_cpu(softnet_data, next_cpu).input_queue_head;
3338         }
3339
3340         rflow->cpu = next_cpu;
3341         return rflow;
3342 }
3343
3344 /*
3345  * get_rps_cpu is called from netif_receive_skb and returns the target
3346  * CPU from the RPS map of the receiving queue for a given skb.
3347  * rcu_read_lock must be held on entry.
3348  */
3349 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3350                        struct rps_dev_flow **rflowp)
3351 {
3352         const struct rps_sock_flow_table *sock_flow_table;
3353         struct netdev_rx_queue *rxqueue = dev->_rx;
3354         struct rps_dev_flow_table *flow_table;
3355         struct rps_map *map;
3356         int cpu = -1;
3357         u32 tcpu;
3358         u32 hash;
3359
3360         if (skb_rx_queue_recorded(skb)) {
3361                 u16 index = skb_get_rx_queue(skb);
3362
3363                 if (unlikely(index >= dev->real_num_rx_queues)) {
3364                         WARN_ONCE(dev->real_num_rx_queues > 1,
3365                                   "%s received packet on queue %u, but number "
3366                                   "of RX queues is %u\n",
3367                                   dev->name, index, dev->real_num_rx_queues);
3368                         goto done;
3369                 }
3370                 rxqueue += index;
3371         }
3372
3373         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3374
3375         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3376         map = rcu_dereference(rxqueue->rps_map);
3377         if (!flow_table && !map)
3378                 goto done;
3379
3380         skb_reset_network_header(skb);
3381         hash = skb_get_hash(skb);
3382         if (!hash)
3383                 goto done;
3384
3385         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3386         if (flow_table && sock_flow_table) {
3387                 struct rps_dev_flow *rflow;
3388                 u32 next_cpu;
3389                 u32 ident;
3390
3391                 /* First check the global flow table for a match */
3392                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3393                 if ((ident ^ hash) & ~rps_cpu_mask)
3394                         goto try_rps;
3395
3396                 next_cpu = ident & rps_cpu_mask;
3397
3398                 /* OK, now we know there is a match,
3399                  * we can look at the local (per receive queue) flow table
3400                  */
3401                 rflow = &flow_table->flows[hash & flow_table->mask];
3402                 tcpu = rflow->cpu;
3403
3404                 /*
3405                  * If the desired CPU (where last recvmsg was done) is
3406                  * different from current CPU (one in the rx-queue flow
3407                  * table entry), switch if one of the following holds:
3408                  *   - Current CPU is unset (>= nr_cpu_ids).
3409                  *   - Current CPU is offline.
3410                  *   - The current CPU's queue tail has advanced beyond the
3411                  *     last packet that was enqueued using this table entry.
3412                  *     This guarantees that all previous packets for the flow
3413                  *     have been dequeued, thus preserving in order delivery.
3414                  */
3415                 if (unlikely(tcpu != next_cpu) &&
3416                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3417                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3418                       rflow->last_qtail)) >= 0)) {
3419                         tcpu = next_cpu;
3420                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3421                 }
3422
3423                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3424                         *rflowp = rflow;
3425                         cpu = tcpu;
3426                         goto done;
3427                 }
3428         }
3429
3430 try_rps:
3431
3432         if (map) {
3433                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3434                 if (cpu_online(tcpu)) {
3435                         cpu = tcpu;
3436                         goto done;
3437                 }
3438         }
3439
3440 done:
3441         return cpu;
3442 }
3443
3444 #ifdef CONFIG_RFS_ACCEL
3445
3446 /**
3447  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3448  * @dev: Device on which the filter was set
3449  * @rxq_index: RX queue index
3450  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3451  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3452  *
3453  * Drivers that implement ndo_rx_flow_steer() should periodically call
3454  * this function for each installed filter and remove the filters for
3455  * which it returns %true.
3456  */
3457 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3458                          u32 flow_id, u16 filter_id)
3459 {
3460         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3461         struct rps_dev_flow_table *flow_table;
3462         struct rps_dev_flow *rflow;
3463         bool expire = true;
3464         unsigned int cpu;
3465
3466         rcu_read_lock();
3467         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3468         if (flow_table && flow_id <= flow_table->mask) {
3469                 rflow = &flow_table->flows[flow_id];
3470                 cpu = ACCESS_ONCE(rflow->cpu);
3471                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3472                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3473                            rflow->last_qtail) <
3474                      (int)(10 * flow_table->mask)))
3475                         expire = false;
3476         }
3477         rcu_read_unlock();
3478         return expire;
3479 }
3480 EXPORT_SYMBOL(rps_may_expire_flow);
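
/*
 * Illustrative sketch, not part of dev.c: a driver that installs accelerated
 * RFS filters via ndo_rx_flow_steer() is expected to scan them periodically
 * and ask rps_may_expire_flow() whether each one may be removed.  The
 * example_ structure and its bookkeeping fields below are hypothetical.
 */
struct example_arfs_filter {
	bool	installed;
	u16	rxq_index;	/* RX queue the filter steers to */
	u32	flow_id;	/* flow_id passed to ndo_rx_flow_steer() */
	u16	filter_id;	/* value that ndo_rx_flow_steer() returned */
};

static void example_expire_arfs_filters(struct net_device *dev,
					struct example_arfs_filter *filters,
					unsigned int count)
{
	unsigned int i;

	for (i = 0; i < count; i++) {
		if (!filters[i].installed)
			continue;
		if (rps_may_expire_flow(dev, filters[i].rxq_index,
					filters[i].flow_id,
					filters[i].filter_id)) {
			/* Hypothetical hardware filter teardown would go here. */
			filters[i].installed = false;
		}
	}
}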
3481
3482 #endif /* CONFIG_RFS_ACCEL */
3483
3484 /* Called from hardirq (IPI) context */
3485 static void rps_trigger_softirq(void *data)
3486 {
3487         struct softnet_data *sd = data;
3488
3489         ____napi_schedule(sd, &sd->backlog);
3490         sd->received_rps++;
3491 }
3492
3493 #endif /* CONFIG_RPS */
3494
3495 /*
3496  * Check whether this softnet_data structure belongs to another CPU.
3497  * If so, queue it on our IPI list and return 1.
3498  * If not, return 0.
3499  */
3500 static int rps_ipi_queued(struct softnet_data *sd)
3501 {
3502 #ifdef CONFIG_RPS
3503         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3504
3505         if (sd != mysd) {
3506                 sd->rps_ipi_next = mysd->rps_ipi_list;
3507                 mysd->rps_ipi_list = sd;
3508
3509                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3510                 return 1;
3511         }
3512 #endif /* CONFIG_RPS */
3513         return 0;
3514 }
3515
3516 #ifdef CONFIG_NET_FLOW_LIMIT
3517 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3518 #endif
3519
3520 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3521 {
3522 #ifdef CONFIG_NET_FLOW_LIMIT
3523         struct sd_flow_limit *fl;
3524         struct softnet_data *sd;
3525         unsigned int old_flow, new_flow;
3526
3527         if (qlen < (netdev_max_backlog >> 1))
3528                 return false;
3529
3530         sd = this_cpu_ptr(&softnet_data);
3531
3532         rcu_read_lock();
3533         fl = rcu_dereference(sd->flow_limit);
3534         if (fl) {
3535                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3536                 old_flow = fl->history[fl->history_head];
3537                 fl->history[fl->history_head] = new_flow;
3538
3539                 fl->history_head++;
3540                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3541
3542                 if (likely(fl->buckets[old_flow]))
3543                         fl->buckets[old_flow]--;
3544
3545                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3546                         fl->count++;
3547                         rcu_read_unlock();
3548                         return true;
3549                 }
3550         }
3551         rcu_read_unlock();
3552 #endif
3553         return false;
3554 }
3555
3556 /*
3557  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3558  * queue (may be a remote CPU queue).
3559  */
3560 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3561                               unsigned int *qtail)
3562 {
3563         struct softnet_data *sd;
3564         unsigned long flags;
3565         unsigned int qlen;
3566
3567         sd = &per_cpu(softnet_data, cpu);
3568
3569         local_irq_save(flags);
3570
3571         rps_lock(sd);
3572         if (!netif_running(skb->dev))
3573                 goto drop;
3574         qlen = skb_queue_len(&sd->input_pkt_queue);
3575         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3576                 if (qlen) {
3577 enqueue:
3578                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3579                         input_queue_tail_incr_save(sd, qtail);
3580                         rps_unlock(sd);
3581                         local_irq_restore(flags);
3582                         return NET_RX_SUCCESS;
3583                 }
3584
3585                 /* Schedule NAPI for the backlog device.
3586                  * We can use a non-atomic operation since we own the queue lock.
3587                  */
3588                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3589                         if (!rps_ipi_queued(sd))
3590                                 ____napi_schedule(sd, &sd->backlog);
3591                 }
3592                 goto enqueue;
3593         }
3594
3595 drop:
3596         sd->dropped++;
3597         rps_unlock(sd);
3598
3599         local_irq_restore(flags);
3600
3601         atomic_long_inc(&skb->dev->rx_dropped);
3602         kfree_skb(skb);
3603         return NET_RX_DROP;
3604 }
3605
3606 static int netif_rx_internal(struct sk_buff *skb)
3607 {
3608         int ret;
3609
3610         net_timestamp_check(netdev_tstamp_prequeue, skb);
3611
3612         trace_netif_rx(skb);
3613 #ifdef CONFIG_RPS
3614         if (static_key_false(&rps_needed)) {
3615                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3616                 int cpu;
3617
3618                 preempt_disable();
3619                 rcu_read_lock();
3620
3621                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3622                 if (cpu < 0)
3623                         cpu = smp_processor_id();
3624
3625                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3626
3627                 rcu_read_unlock();
3628                 preempt_enable();
3629         } else
3630 #endif
3631         {
3632                 unsigned int qtail;
3633                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3634                 put_cpu();
3635         }
3636         return ret;
3637 }
3638
3639 /**
3640  *      netif_rx        -       post buffer to the network code
3641  *      @skb: buffer to post
3642  *
3643  *      This function receives a packet from a device driver and queues it for
3644  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3645  *      may be dropped during processing for congestion control or by the
3646  *      protocol layers.
3647  *
3648  *      return values:
3649  *      NET_RX_SUCCESS  (no congestion)
3650  *      NET_RX_DROP     (packet was dropped)
3651  *
3652  */
3653
3654 int netif_rx(struct sk_buff *skb)
3655 {
3656         trace_netif_rx_entry(skb);
3657
3658         return netif_rx_internal(skb);
3659 }
3660 EXPORT_SYMBOL(netif_rx);
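
/*
 * Illustrative sketch, not part of dev.c: how a simple non-NAPI driver would
 * hand a received frame to the stack with netif_rx() from its interrupt
 * handler.  The example_ helper name is hypothetical.
 */
static void example_legacy_rx(struct net_device *dev,
			      const void *frame, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), frame, len);
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */

	netif_rx(skb);	/* queues to the per-CPU backlog; the skb is always consumed */
}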
3661
3662 int netif_rx_ni(struct sk_buff *skb)
3663 {
3664         int err;
3665
3666         trace_netif_rx_ni_entry(skb);
3667
3668         preempt_disable();
3669         err = netif_rx_internal(skb);
3670         if (local_softirq_pending())
3671                 do_softirq();
3672         preempt_enable();
3673
3674         return err;
3675 }
3676 EXPORT_SYMBOL(netif_rx_ni);
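
/*
 * Illustrative sketch, not part of dev.c: a tun-like virtual device injecting
 * a packet from process context (e.g. a write() syscall) uses netif_rx_ni()
 * so that any NET_RX softirq it raises gets a chance to run immediately.
 * The example_ helper is hypothetical.
 */
static int example_inject_from_user(struct net_device *dev,
				    const void __user *buf, size_t len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, len);
	if (!skb)
		return -ENOMEM;

	if (copy_from_user(skb_put(skb, len), buf, len)) {
		kfree_skb(skb);
		return -EFAULT;
	}

	skb->protocol = eth_type_trans(skb, dev);
	return netif_rx_ni(skb);
}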
3677
3678 static void net_tx_action(struct softirq_action *h)
3679 {
3680         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3681
3682         if (sd->completion_queue) {
3683                 struct sk_buff *clist;
3684
3685                 local_irq_disable();
3686                 clist = sd->completion_queue;
3687                 sd->completion_queue = NULL;
3688                 local_irq_enable();
3689
3690                 while (clist) {
3691                         struct sk_buff *skb = clist;
3692                         clist = clist->next;
3693
3694                         WARN_ON(atomic_read(&skb->users));
3695                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3696                                 trace_consume_skb(skb);
3697                         else
3698                                 trace_kfree_skb(skb, net_tx_action);
3699                         __kfree_skb(skb);
3700                 }
3701         }
3702
3703         if (sd->output_queue) {
3704                 struct Qdisc *head;
3705
3706                 local_irq_disable();
3707                 head = sd->output_queue;
3708                 sd->output_queue = NULL;
3709                 sd->output_queue_tailp = &sd->output_queue;
3710                 local_irq_enable();
3711
3712                 while (head) {
3713                         struct Qdisc *q = head;
3714                         spinlock_t *root_lock;
3715
3716                         head = head->next_sched;
3717
3718                         root_lock = qdisc_lock(q);
3719                         if (spin_trylock(root_lock)) {
3720                                 smp_mb__before_atomic();
3721                                 clear_bit(__QDISC_STATE_SCHED,
3722                                           &q->state);
3723                                 qdisc_run(q);
3724                                 spin_unlock(root_lock);
3725                         } else {
3726                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3727                                               &q->state)) {
3728                                         __netif_reschedule(q);
3729                                 } else {
3730                                         smp_mb__before_atomic();
3731                                         clear_bit(__QDISC_STATE_SCHED,
3732                                                   &q->state);
3733                                 }
3734                         }
3735                 }
3736         }
3737 }
3738
3739 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3740     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3741 /* This hook is defined here for ATM LANE */
3742 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3743                              unsigned char *addr) __read_mostly;
3744 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3745 #endif
3746
3747 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3748                                          struct packet_type **pt_prev,
3749                                          int *ret, struct net_device *orig_dev)
3750 {
3751 #ifdef CONFIG_NET_CLS_ACT
3752         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3753         struct tcf_result cl_res;
3754
3755         /* If there's at least one ingress qdisc present somewhere (so
3756          * we get here via the enabled static key), remaining devices
3757          * that are not configured with an ingress qdisc will bail
3758          * out here.
3759          */
3760         if (!cl)
3761                 return skb;
3762         if (*pt_prev) {
3763                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3764                 *pt_prev = NULL;
3765         }
3766
3767         qdisc_skb_cb(skb)->pkt_len = skb->len;
3768         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3769         qdisc_bstats_cpu_update(cl->q, skb);
3770
3771         switch (tc_classify(skb, cl, &cl_res, false)) {
3772         case TC_ACT_OK:
3773         case TC_ACT_RECLASSIFY:
3774                 skb->tc_index = TC_H_MIN(cl_res.classid);
3775                 break;
3776         case TC_ACT_SHOT:
3777                 qdisc_qstats_cpu_drop(cl->q);
3778         case TC_ACT_STOLEN:
3779         case TC_ACT_QUEUED:
3780                 kfree_skb(skb);
3781                 return NULL;
3782         case TC_ACT_REDIRECT:
3783                 /* skb_mac_header check was done by cls/act_bpf, so
3784                  * we can safely push the L2 header back before
3785                  * redirecting to another netdev
3786                  */
3787                 __skb_push(skb, skb->mac_len);
3788                 skb_do_redirect(skb);
3789                 return NULL;
3790         default:
3791                 break;
3792         }
3793 #endif /* CONFIG_NET_CLS_ACT */
3794         return skb;
3795 }
3796
3797 /**
3798  *      netdev_is_rx_handler_busy - check if receive handler is registered
3799  *      @dev: device to check
3800  *
3801  *      Check if a receive handler is already registered for a given device.
3802  *      Return true if there is one.
3803  *
3804  *      The caller must hold the rtnl_mutex.
3805  */
3806 bool netdev_is_rx_handler_busy(struct net_device *dev)
3807 {
3808         ASSERT_RTNL();
3809         return dev && rtnl_dereference(dev->rx_handler);
3810 }
3811 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3812
3813 /**
3814  *      netdev_rx_handler_register - register receive handler
3815  *      @dev: device to register a handler for
3816  *      @rx_handler: receive handler to register
3817  *      @rx_handler_data: data pointer that is used by rx handler
3818  *
3819  *      Register a receive handler for a device. This handler will then be
3820  *      called from __netif_receive_skb. A negative errno code is returned
3821  *      on a failure.
3822  *
3823  *      The caller must hold the rtnl_mutex.
3824  *
3825  *      For a general description of rx_handler, see enum rx_handler_result.
3826  */
3827 int netdev_rx_handler_register(struct net_device *dev,
3828                                rx_handler_func_t *rx_handler,
3829                                void *rx_handler_data)
3830 {
3831         ASSERT_RTNL();
3832
3833         if (dev->rx_handler)
3834                 return -EBUSY;
3835
3836         /* Note: rx_handler_data must be set before rx_handler */
3837         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3838         rcu_assign_pointer(dev->rx_handler, rx_handler);
3839
3840         return 0;
3841 }
3842 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
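
/*
 * Illustrative sketch, not part of dev.c: registering a receive handler the
 * way bridge/macvlan-style upper devices do.  The handler below simply lets
 * every frame continue up the stack; the example_ names are hypothetical.
 */
static rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
	/* A real handler might consume, redirect or retag *pskb here. */
	return RX_HANDLER_PASS;
}

static int example_attach_handler(struct net_device *dev, void *priv)
{
	int err;

	rtnl_lock();
	if (netdev_is_rx_handler_busy(dev))
		err = -EBUSY;
	else
		err = netdev_rx_handler_register(dev, example_rx_handler, priv);
	rtnl_unlock();

	return err;
}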
3843
3844 /**
3845  *      netdev_rx_handler_unregister - unregister receive handler
3846  *      @dev: device to unregister a handler from
3847  *
3848  *      Unregister a receive handler from a device.
3849  *
3850  *      The caller must hold the rtnl_mutex.
3851  */
3852 void netdev_rx_handler_unregister(struct net_device *dev)
3853 {
3854
3855         ASSERT_RTNL();
3856         RCU_INIT_POINTER(dev->rx_handler, NULL);
3857         /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
3858          * section is guaranteed to see a non-NULL rx_handler_data
3859          * as well.
3860          */
3861         synchronize_net();
3862         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3863 }
3864 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3865
3866 /*
3867  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3868  * the special handling of PFMEMALLOC skbs.
3869  */
3870 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3871 {
3872         switch (skb->protocol) {
3873         case htons(ETH_P_ARP):
3874         case htons(ETH_P_IP):
3875         case htons(ETH_P_IPV6):
3876         case htons(ETH_P_8021Q):
3877         case htons(ETH_P_8021AD):
3878                 return true;
3879         default:
3880                 return false;
3881         }
3882 }
3883
3884 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3885                              int *ret, struct net_device *orig_dev)
3886 {
3887 #ifdef CONFIG_NETFILTER_INGRESS
3888         if (nf_hook_ingress_active(skb)) {
3889                 if (*pt_prev) {
3890                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
3891                         *pt_prev = NULL;
3892                 }
3893
3894                 return nf_hook_ingress(skb);
3895         }
3896 #endif /* CONFIG_NETFILTER_INGRESS */
3897         return 0;
3898 }
3899
3900 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3901 {
3902         struct packet_type *ptype, *pt_prev;
3903         rx_handler_func_t *rx_handler;
3904         struct net_device *orig_dev;
3905         bool deliver_exact = false;
3906         int ret = NET_RX_DROP;
3907         __be16 type;
3908
3909         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3910
3911         trace_netif_receive_skb(skb);
3912
3913         orig_dev = skb->dev;
3914
3915         skb_reset_network_header(skb);
3916         if (!skb_transport_header_was_set(skb))
3917                 skb_reset_transport_header(skb);
3918         skb_reset_mac_len(skb);
3919
3920         pt_prev = NULL;
3921
3922 another_round:
3923         skb->skb_iif = skb->dev->ifindex;
3924
3925         __this_cpu_inc(softnet_data.processed);
3926
3927         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3928             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3929                 skb = skb_vlan_untag(skb);
3930                 if (unlikely(!skb))
3931                         goto out;
3932         }
3933
3934 #ifdef CONFIG_NET_CLS_ACT
3935         if (skb->tc_verd & TC_NCLS) {
3936                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3937                 goto ncls;
3938         }
3939 #endif
3940
3941         if (pfmemalloc)
3942                 goto skip_taps;
3943
3944         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3945                 if (pt_prev)
3946                         ret = deliver_skb(skb, pt_prev, orig_dev);
3947                 pt_prev = ptype;
3948         }
3949
3950         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3951                 if (pt_prev)
3952                         ret = deliver_skb(skb, pt_prev, orig_dev);
3953                 pt_prev = ptype;
3954         }
3955
3956 skip_taps:
3957 #ifdef CONFIG_NET_INGRESS
3958         if (static_key_false(&ingress_needed)) {
3959                 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3960                 if (!skb)
3961                         goto out;
3962
3963                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
3964                         goto out;
3965         }
3966 #endif
3967 #ifdef CONFIG_NET_CLS_ACT
3968         skb->tc_verd = 0;
3969 ncls:
3970 #endif
3971         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3972                 goto drop;
3973
3974         if (skb_vlan_tag_present(skb)) {
3975                 if (pt_prev) {
3976                         ret = deliver_skb(skb, pt_prev, orig_dev);
3977                         pt_prev = NULL;
3978                 }
3979                 if (vlan_do_receive(&skb))
3980                         goto another_round;
3981                 else if (unlikely(!skb))
3982                         goto out;
3983         }
3984
3985         rx_handler = rcu_dereference(skb->dev->rx_handler);
3986         if (rx_handler) {
3987                 if (pt_prev) {
3988                         ret = deliver_skb(skb, pt_prev, orig_dev);
3989                         pt_prev = NULL;
3990                 }
3991                 switch (rx_handler(&skb)) {
3992                 case RX_HANDLER_CONSUMED:
3993                         ret = NET_RX_SUCCESS;
3994                         goto out;
3995                 case RX_HANDLER_ANOTHER:
3996                         goto another_round;
3997                 case RX_HANDLER_EXACT:
3998                         deliver_exact = true;
3999                 case RX_HANDLER_PASS:
4000                         break;
4001                 default:
4002                         BUG();
4003                 }
4004         }
4005
4006         if (unlikely(skb_vlan_tag_present(skb))) {
4007                 if (skb_vlan_tag_get_id(skb))
4008                         skb->pkt_type = PACKET_OTHERHOST;
4009                 /* Note: we might in the future use prio bits
4010                  * and set skb->priority like in vlan_do_receive()
4011                  * For the time being, just ignore Priority Code Point
4012                  */
4013                 skb->vlan_tci = 0;
4014         }
4015
4016         type = skb->protocol;
4017
4018         /* deliver only exact match when indicated */
4019         if (likely(!deliver_exact)) {
4020                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4021                                        &ptype_base[ntohs(type) &
4022                                                    PTYPE_HASH_MASK]);
4023         }
4024
4025         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4026                                &orig_dev->ptype_specific);
4027
4028         if (unlikely(skb->dev != orig_dev)) {
4029                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4030                                        &skb->dev->ptype_specific);
4031         }
4032
4033         if (pt_prev) {
4034                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4035                         goto drop;
4036                 else
4037                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4038         } else {
4039 drop:
4040                 atomic_long_inc(&skb->dev->rx_dropped);
4041                 kfree_skb(skb);
4042                 /* Jamal, now you will not be able to escape explaining
4043                  * to me how you were going to use this. :-)
4044                  */
4045                 ret = NET_RX_DROP;
4046         }
4047
4048 out:
4049         return ret;
4050 }
4051
4052 static int __netif_receive_skb(struct sk_buff *skb)
4053 {
4054         int ret;
4055
4056         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4057                 unsigned long pflags = current->flags;
4058
4059                 /*
4060                  * PFMEMALLOC skbs are special, they should
4061                  * - be delivered to SOCK_MEMALLOC sockets only
4062                  * - stay away from userspace
4063                  * - have bounded memory usage
4064                  *
4065                  * Use PF_MEMALLOC as this saves us from propagating the allocation
4066                  * context down to all allocation sites.
4067                  */
4068                 current->flags |= PF_MEMALLOC;
4069                 ret = __netif_receive_skb_core(skb, true);
4070                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4071         } else
4072                 ret = __netif_receive_skb_core(skb, false);
4073
4074         return ret;
4075 }
4076
4077 static int netif_receive_skb_internal(struct sk_buff *skb)
4078 {
4079         int ret;
4080
4081         net_timestamp_check(netdev_tstamp_prequeue, skb);
4082
4083         if (skb_defer_rx_timestamp(skb))
4084                 return NET_RX_SUCCESS;
4085
4086         rcu_read_lock();
4087
4088 #ifdef CONFIG_RPS
4089         if (static_key_false(&rps_needed)) {
4090                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4091                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4092
4093                 if (cpu >= 0) {
4094                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4095                         rcu_read_unlock();
4096                         return ret;
4097                 }
4098         }
4099 #endif
4100         ret = __netif_receive_skb(skb);
4101         rcu_read_unlock();
4102         return ret;
4103 }
4104
4105 /**
4106  *      netif_receive_skb - process receive buffer from network
4107  *      @skb: buffer to process
4108  *
4109  *      netif_receive_skb() is the main receive data processing function.
4110  *      It always succeeds. The buffer may be dropped during processing
4111  *      for congestion control or by the protocol layers.
4112  *
4113  *      This function may only be called from softirq context and interrupts
4114  *      should be enabled.
4115  *
4116  *      Return values (usually ignored):
4117  *      NET_RX_SUCCESS: no congestion
4118  *      NET_RX_DROP: packet was dropped
4119  */
4120 int netif_receive_skb(struct sk_buff *skb)
4121 {
4122         trace_netif_receive_skb_entry(skb);
4123
4124         return netif_receive_skb_internal(skb);
4125 }
4126 EXPORT_SYMBOL(netif_receive_skb);
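
/*
 * Illustrative sketch, not part of dev.c: the usual shape of a NAPI poll
 * callback that delivers completed receive buffers with netif_receive_skb()
 * from softirq context.  example_fetch_rx_skb() and example_enable_rx_irq()
 * are hypothetical stand-ins for real descriptor-ring handling.
 */
static struct sk_buff *example_fetch_rx_skb(struct napi_struct *napi)
{
	return NULL;	/* stub: a real driver reaps its RX ring here */
}

static void example_enable_rx_irq(struct napi_struct *napi)
{
	/* stub: a real driver re-enables its RX interrupt here */
}

static int example_poll(struct napi_struct *napi, int budget)
{
	int done = 0;

	while (done < budget) {
		struct sk_buff *skb = example_fetch_rx_skb(napi);

		if (!skb)
			break;
		netif_receive_skb(skb);
		done++;
	}

	if (done < budget) {
		napi_complete(napi);
		example_enable_rx_irq(napi);
	}

	return done;
}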
4127
4128 /* The network device is going away; flush any packets still pending.
4129  * Called with irqs disabled.
4130  */
4131 static void flush_backlog(void *arg)
4132 {
4133         struct net_device *dev = arg;
4134         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4135         struct sk_buff *skb, *tmp;
4136
4137         rps_lock(sd);
4138         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4139                 if (skb->dev == dev) {
4140                         __skb_unlink(skb, &sd->input_pkt_queue);
4141                         kfree_skb(skb);
4142                         input_queue_head_incr(sd);
4143                 }
4144         }
4145         rps_unlock(sd);
4146
4147         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4148                 if (skb->dev == dev) {
4149                         __skb_unlink(skb, &sd->process_queue);
4150                         kfree_skb(skb);
4151                         input_queue_head_incr(sd);
4152                 }
4153         }
4154 }
4155
4156 static int napi_gro_complete(struct sk_buff *skb)
4157 {
4158         struct packet_offload *ptype;
4159         __be16 type = skb->protocol;
4160         struct list_head *head = &offload_base;
4161         int err = -ENOENT;
4162
4163         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4164
4165         if (NAPI_GRO_CB(skb)->count == 1) {
4166                 skb_shinfo(skb)->gso_size = 0;
4167                 goto out;
4168         }
4169
4170         rcu_read_lock();
4171         list_for_each_entry_rcu(ptype, head, list) {
4172                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4173                         continue;
4174
4175                 err = ptype->callbacks.gro_complete(skb, 0);
4176                 break;
4177         }
4178         rcu_read_unlock();
4179
4180         if (err) {
4181                 WARN_ON(&ptype->list == head);
4182                 kfree_skb(skb);
4183                 return NET_RX_SUCCESS;
4184         }
4185
4186 out:
4187         return netif_receive_skb_internal(skb);
4188 }
4189
4190 /* napi->gro_list contains packets ordered by age, with the
4191  * youngest packets at the head of the list.
4192  * Complete skbs in reverse order to reduce latencies.
4193  */
4194 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4195 {
4196         struct sk_buff *skb, *prev = NULL;
4197
4198         /* scan list and build reverse chain */
4199         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4200                 skb->prev = prev;
4201                 prev = skb;
4202         }
4203
4204         for (skb = prev; skb; skb = prev) {
4205                 skb->next = NULL;
4206
4207                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4208                         return;
4209
4210                 prev = skb->prev;
4211                 napi_gro_complete(skb);
4212                 napi->gro_count--;
4213         }
4214
4215         napi->gro_list = NULL;
4216 }
4217 EXPORT_SYMBOL(napi_gro_flush);
4218
4219 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4220 {
4221         struct sk_buff *p;
4222         unsigned int maclen = skb->dev->hard_header_len;
4223         u32 hash = skb_get_hash_raw(skb);
4224
4225         for (p = napi->gro_list; p; p = p->next) {
4226                 unsigned long diffs;
4227
4228                 NAPI_GRO_CB(p)->flush = 0;
4229
4230                 if (hash != skb_get_hash_raw(p)) {
4231                         NAPI_GRO_CB(p)->same_flow = 0;
4232                         continue;
4233                 }
4234
4235                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4236                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4237                 diffs |= skb_metadata_dst_cmp(p, skb);
4238                 if (maclen == ETH_HLEN)
4239                         diffs |= compare_ether_header(skb_mac_header(p),
4240                                                       skb_mac_header(skb));
4241                 else if (!diffs)
4242                         diffs = memcmp(skb_mac_header(p),
4243                                        skb_mac_header(skb),
4244                                        maclen);
4245                 NAPI_GRO_CB(p)->same_flow = !diffs;
4246         }
4247 }
4248
4249 static void skb_gro_reset_offset(struct sk_buff *skb)
4250 {
4251         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4252         const skb_frag_t *frag0 = &pinfo->frags[0];
4253
4254         NAPI_GRO_CB(skb)->data_offset = 0;
4255         NAPI_GRO_CB(skb)->frag0 = NULL;
4256         NAPI_GRO_CB(skb)->frag0_len = 0;
4257
4258         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4259             pinfo->nr_frags &&
4260             !PageHighMem(skb_frag_page(frag0))) {
4261                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4262                 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4263                                                     skb_frag_size(frag0),
4264                                                     skb->end - skb->tail);
4265         }
4266 }
4267
4268 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4269 {
4270         struct skb_shared_info *pinfo = skb_shinfo(skb);
4271
4272         BUG_ON(skb->end - skb->tail < grow);
4273
4274         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4275
4276         skb->data_len -= grow;
4277         skb->tail += grow;
4278
4279         pinfo->frags[0].page_offset += grow;
4280         skb_frag_size_sub(&pinfo->frags[0], grow);
4281
4282         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4283                 skb_frag_unref(skb, 0);
4284                 memmove(pinfo->frags, pinfo->frags + 1,
4285                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4286         }
4287 }
4288
4289 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4290 {
4291         struct sk_buff **pp = NULL;
4292         struct packet_offload *ptype;
4293         __be16 type = skb->protocol;
4294         struct list_head *head = &offload_base;
4295         int same_flow;
4296         enum gro_result ret;
4297         int grow;
4298
4299         if (!(skb->dev->features & NETIF_F_GRO))
4300                 goto normal;
4301
4302         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4303                 goto normal;
4304
4305         gro_list_prepare(napi, skb);
4306
4307         rcu_read_lock();
4308         list_for_each_entry_rcu(ptype, head, list) {
4309                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4310                         continue;
4311
4312                 skb_set_network_header(skb, skb_gro_offset(skb));
4313                 skb_reset_mac_len(skb);
4314                 NAPI_GRO_CB(skb)->same_flow = 0;
4315                 NAPI_GRO_CB(skb)->flush = 0;
4316                 NAPI_GRO_CB(skb)->free = 0;
4317                 NAPI_GRO_CB(skb)->encap_mark = 0;
4318                 NAPI_GRO_CB(skb)->recursion_counter = 0;
4319                 NAPI_GRO_CB(skb)->is_fou = 0;
4320                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4321
4322                 /* Setup for GRO checksum validation */
4323                 switch (skb->ip_summed) {
4324                 case CHECKSUM_COMPLETE:
4325                         NAPI_GRO_CB(skb)->csum = skb->csum;
4326                         NAPI_GRO_CB(skb)->csum_valid = 1;
4327                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4328                         break;
4329                 case CHECKSUM_UNNECESSARY:
4330                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4331                         NAPI_GRO_CB(skb)->csum_valid = 0;
4332                         break;
4333                 default:
4334                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4335                         NAPI_GRO_CB(skb)->csum_valid = 0;
4336                 }
4337
4338                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4339                 break;
4340         }
4341         rcu_read_unlock();
4342
4343         if (&ptype->list == head)
4344                 goto normal;
4345
4346         same_flow = NAPI_GRO_CB(skb)->same_flow;
4347         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4348
4349         if (pp) {
4350                 struct sk_buff *nskb = *pp;
4351
4352                 *pp = nskb->next;
4353                 nskb->next = NULL;
4354                 napi_gro_complete(nskb);
4355                 napi->gro_count--;
4356         }
4357
4358         if (same_flow)
4359                 goto ok;
4360
4361         if (NAPI_GRO_CB(skb)->flush)
4362                 goto normal;
4363
4364         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4365                 struct sk_buff *nskb = napi->gro_list;
4366
4367                 /* locate the end of the list to select the 'oldest' flow */
4368                 while (nskb->next) {
4369                         pp = &nskb->next;
4370                         nskb = *pp;
4371                 }
4372                 *pp = NULL;
4373                 nskb->next = NULL;
4374                 napi_gro_complete(nskb);
4375         } else {
4376                 napi->gro_count++;
4377         }
4378         NAPI_GRO_CB(skb)->count = 1;
4379         NAPI_GRO_CB(skb)->age = jiffies;
4380         NAPI_GRO_CB(skb)->last = skb;
4381         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4382         skb->next = napi->gro_list;
4383         napi->gro_list = skb;
4384         ret = GRO_HELD;
4385
4386 pull:
4387         grow = skb_gro_offset(skb) - skb_headlen(skb);
4388         if (grow > 0)
4389                 gro_pull_from_frag0(skb, grow);
4390 ok:
4391         return ret;
4392
4393 normal:
4394         ret = GRO_NORMAL;
4395         goto pull;
4396 }
4397
4398 struct packet_offload *gro_find_receive_by_type(__be16 type)
4399 {
4400         struct list_head *offload_head = &offload_base;
4401         struct packet_offload *ptype;
4402
4403         list_for_each_entry_rcu(ptype, offload_head, list) {
4404                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4405                         continue;
4406                 return ptype;
4407         }
4408         return NULL;
4409 }
4410 EXPORT_SYMBOL(gro_find_receive_by_type);
4411
4412 struct packet_offload *gro_find_complete_by_type(__be16 type)
4413 {
4414         struct list_head *offload_head = &offload_base;
4415         struct packet_offload *ptype;
4416
4417         list_for_each_entry_rcu(ptype, offload_head, list) {
4418                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4419                         continue;
4420                 return ptype;
4421         }
4422         return NULL;
4423 }
4424 EXPORT_SYMBOL(gro_find_complete_by_type);
4425
4426 static void napi_skb_free_stolen_head(struct sk_buff *skb)
4427 {
4428         skb_dst_drop(skb);
4429         kmem_cache_free(skbuff_head_cache, skb);
4430 }
4431
4432 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4433 {
4434         switch (ret) {
4435         case GRO_NORMAL:
4436                 if (netif_receive_skb_internal(skb))
4437                         ret = GRO_DROP;
4438                 break;
4439
4440         case GRO_DROP:
4441                 kfree_skb(skb);
4442                 break;
4443
4444         case GRO_MERGED_FREE:
4445                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4446                         napi_skb_free_stolen_head(skb);
4447                 else
4448                         __kfree_skb(skb);
4449                 break;
4450
4451         case GRO_HELD:
4452         case GRO_MERGED:
4453                 break;
4454         }
4455
4456         return ret;
4457 }
4458
4459 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4460 {
4461         trace_napi_gro_receive_entry(skb);
4462
4463         skb_gro_reset_offset(skb);
4464
4465         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4466 }
4467 EXPORT_SYMBOL(napi_gro_receive);
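
/*
 * Illustrative sketch, not part of dev.c: a GRO-capable driver delivers
 * frames at the same point of its poll loop as in the netif_receive_skb()
 * sketch earlier, but through napi_gro_receive() so that consecutive
 * segments of one flow can be coalesced before reaching the protocol layers.
 * The example_ helper is hypothetical.
 */
static void example_gro_deliver(struct napi_struct *napi, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);
}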
4468
4469 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4470 {
4471         if (unlikely(skb->pfmemalloc)) {
4472                 consume_skb(skb);
4473                 return;
4474         }
4475         __skb_pull(skb, skb_headlen(skb));
4476         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4477         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4478         skb->vlan_tci = 0;
4479         skb->dev = napi->dev;
4480         skb->skb_iif = 0;
4481
4482         /* eth_type_trans() assumes pkt_type is PACKET_HOST */
4483         skb->pkt_type = PACKET_HOST;
4484
4485         skb->encapsulation = 0;
4486         skb_shinfo(skb)->gso_type = 0;
4487         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4488
4489         napi->skb = skb;
4490 }
4491
4492 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4493 {
4494         struct sk_buff *skb = napi->skb;
4495
4496         if (!skb) {
4497                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4498                 napi->skb = skb;
4499         }
4500         return skb;
4501 }
4502 EXPORT_SYMBOL(napi_get_frags);
4503
4504 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4505                                       struct sk_buff *skb,
4506                                       gro_result_t ret)
4507 {
4508         switch (ret) {
4509         case GRO_NORMAL:
4510         case GRO_HELD:
4511                 __skb_push(skb, ETH_HLEN);
4512                 skb->protocol = eth_type_trans(skb, skb->dev);
4513                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4514                         ret = GRO_DROP;
4515                 break;
4516
4517         case GRO_DROP:
4518                 napi_reuse_skb(napi, skb);
4519                 break;
4520
4521         case GRO_MERGED_FREE:
4522                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4523                         napi_skb_free_stolen_head(skb);
4524                 else
4525                         napi_reuse_skb(napi, skb);
4526                 break;
4527
4528         case GRO_MERGED:
4529                 break;
4530         }
4531
4532         return ret;
4533 }
4534
4535 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4536  * Drivers may call both napi_gro_frags() and napi_gro_receive().
4537  * We copy the Ethernet header into skb->data to have a common layout.
4538  */
4539 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4540 {
4541         struct sk_buff *skb = napi->skb;
4542         const struct ethhdr *eth;
4543         unsigned int hlen = sizeof(*eth);
4544
4545         napi->skb = NULL;
4546
4547         skb_reset_mac_header(skb);
4548         skb_gro_reset_offset(skb);
4549
4550         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4551                 eth = skb_gro_header_slow(skb, hlen, 0);
4552                 if (unlikely(!eth)) {
4553                         napi_reuse_skb(napi, skb);
4554                         return NULL;
4555                 }
4556         } else {
4557                 eth = (const struct ethhdr *)skb->data;
4558                 gro_pull_from_frag0(skb, hlen);
4559                 NAPI_GRO_CB(skb)->frag0 += hlen;
4560                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4561         }
4562         __skb_pull(skb, hlen);
4563
4564         /*
4565          * This works because the only protocols we care about don't require
4566          * special handling.
4567          * We'll fix it up properly in napi_frags_finish()
4568          */
4569         skb->protocol = eth->h_proto;
4570
4571         return skb;
4572 }
4573
4574 gro_result_t napi_gro_frags(struct napi_struct *napi)
4575 {
4576         struct sk_buff *skb = napi_frags_skb(napi);
4577
4578         if (!skb)
4579                 return GRO_DROP;
4580
4581         trace_napi_gro_frags_entry(skb);
4582
4583         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4584 }
4585 EXPORT_SYMBOL(napi_gro_frags);
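
/*
 * Illustrative sketch, not part of dev.c: the napi_get_frags()/napi_gro_frags()
 * pattern used by drivers that receive directly into pages.  The page, offset
 * and length would come from the driver's RX descriptor; using PAGE_SIZE as
 * the truesize is a simplifying assumption, as is the example_ name.
 */
static void example_gro_frags_rx(struct napi_struct *napi, struct page *page,
				 unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb) {
		put_page(page);		/* nothing to attach the page to */
		return;
	}

	/* Attach the received data as a page fragment; the payload is not copied. */
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, len,
			PAGE_SIZE);

	/* napi_gro_frags() pulls the Ethernet header and runs GRO on the skb. */
	napi_gro_frags(napi);
}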
4586
4587 /* Compute the checksum from gro_offset and return the folded value
4588  * after adding in any pseudo checksum.
4589  */
4590 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4591 {
4592         __wsum wsum;
4593         __sum16 sum;
4594
4595         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4596
4597         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4598         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4599         if (likely(!sum)) {
4600                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4601                     !skb->csum_complete_sw)
4602                         netdev_rx_csum_fault(skb->dev);
4603         }
4604
4605         NAPI_GRO_CB(skb)->csum = wsum;
4606         NAPI_GRO_CB(skb)->csum_valid = 1;
4607
4608         return sum;
4609 }
4610 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4611
4612 /*
4613  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4614  * Note: called with local irq disabled, but exits with local irq enabled.
4615  */
4616 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4617 {
4618 #ifdef CONFIG_RPS
4619         struct softnet_data *remsd = sd->rps_ipi_list;
4620
4621         if (remsd) {
4622                 sd->rps_ipi_list = NULL;
4623
4624                 local_irq_enable();
4625
4626                 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4627                 while (remsd) {
4628                         struct softnet_data *next = remsd->rps_ipi_next;
4629
4630                         if (cpu_online(remsd->cpu))
4631                                 smp_call_function_single_async(remsd->cpu,
4632                                                            &remsd->csd);
4633                         remsd = next;
4634                 }
4635         } else
4636 #endif
4637                 local_irq_enable();
4638 }
4639
4640 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4641 {
4642 #ifdef CONFIG_RPS
4643         return sd->rps_ipi_list != NULL;
4644 #else
4645         return false;
4646 #endif
4647 }
4648
4649 static int process_backlog(struct napi_struct *napi, int quota)
4650 {
4651         int work = 0;
4652         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4653
4654         /* Check if we have pending IPIs; it's better to send them now
4655          * rather than wait for net_rx_action() to end.
4656          */
4657         if (sd_has_rps_ipi_waiting(sd)) {
4658                 local_irq_disable();
4659                 net_rps_action_and_irq_enable(sd);
4660         }
4661
4662         napi->weight = weight_p;
4663         local_irq_disable();
4664         while (1) {
4665                 struct sk_buff *skb;
4666
4667                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4668                         rcu_read_lock();
4669                         local_irq_enable();
4670                         __netif_receive_skb(skb);
4671                         rcu_read_unlock();
4672                         local_irq_disable();
4673                         input_queue_head_incr(sd);
4674                         if (++work >= quota) {
4675                                 local_irq_enable();
4676                                 return work;
4677                         }
4678                 }
4679
4680                 rps_lock(sd);
4681                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4682                         /*
4683                          * Inline a custom version of __napi_complete().
4684                          * Only the current CPU owns and manipulates this napi,
4685                          * and NAPI_STATE_SCHED is the only possible flag set
4686                          * on backlog.
4687                          * We can use a plain write instead of clear_bit(),
4688                          * and we don't need an smp_mb() memory barrier.
4689                          */
4690                         napi->state = 0;
4691                         rps_unlock(sd);
4692
4693                         break;
4694                 }
4695
4696                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4697                                            &sd->process_queue);
4698                 rps_unlock(sd);
4699         }
4700         local_irq_enable();
4701
4702         return work;
4703 }
4704
4705 /**
4706  * __napi_schedule - schedule for receive
4707  * @n: entry to schedule
4708  *
4709  * The entry's receive function will be scheduled to run.
4710  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4711  */
4712 void __napi_schedule(struct napi_struct *n)
4713 {
4714         unsigned long flags;
4715
4716         local_irq_save(flags);
4717         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4718         local_irq_restore(flags);
4719 }
4720 EXPORT_SYMBOL(__napi_schedule);
4721
4722 /**
4723  * __napi_schedule_irqoff - schedule for receive
4724  * @n: entry to schedule
4725  *
4726  * Variant of __napi_schedule() assuming hard irqs are masked.
4727  *
4728  * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
4729  * because the interrupt disabled assumption might not be true
4730  * due to force-threaded interrupts and spinlock substitution.
4731  */
4732 void __napi_schedule_irqoff(struct napi_struct *n)
4733 {
4734         if (!IS_ENABLED(CONFIG_PREEMPT_RT))
4735                 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4736         else
4737                 __napi_schedule(n);
4738 }
4739 EXPORT_SYMBOL(__napi_schedule_irqoff);
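/* Example (hypothetical driver sketch): the canonical hard-irq -> NAPI
 * hand-off.  The interrupt handler masks further RX interrupts on the device
 * and defers the real work to the poll routine by scheduling NAPI.  The
 * "example_*" names are assumptions, not real kernel symbols.
 */
struct example_irq_priv {
        struct napi_struct napi;
        /* device-specific interrupt mask state would live here */
};

static irqreturn_t __maybe_unused example_isr(int irq, void *dev_id)
{
        struct example_irq_priv *priv = dev_id;

        /* A real driver would mask the device's RX interrupt here so the
         * poll routine can run without being re-entered.
         */
        if (napi_schedule_prep(&priv->napi))
                __napi_schedule(&priv->napi);

        return IRQ_HANDLED;
}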
4740
4741 void __napi_complete(struct napi_struct *n)
4742 {
4743         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4744
4745         list_del_init(&n->poll_list);
4746         smp_mb__before_atomic();
4747         clear_bit(NAPI_STATE_SCHED, &n->state);
4748 }
4749 EXPORT_SYMBOL(__napi_complete);
4750
4751 void napi_complete_done(struct napi_struct *n, int work_done)
4752 {
4753         unsigned long flags;
4754
4755         /*
4756          * Don't let napi dequeue from the cpu poll list
4757          * just in case it's running on a different cpu.
4758          */
4759         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4760                 return;
4761
4762         if (n->gro_list) {
4763                 unsigned long timeout = 0;
4764
4765                 if (work_done)
4766                         timeout = n->dev->gro_flush_timeout;
4767
4768                 if (timeout)
4769                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4770                                       HRTIMER_MODE_REL_PINNED);
4771                 else
4772                         napi_gro_flush(n, false);
4773         }
4774         if (likely(list_empty(&n->poll_list))) {
4775                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4776         } else {
4777                 /* If n->poll_list is not empty, we need to mask irqs */
4778                 local_irq_save(flags);
4779                 __napi_complete(n);
4780                 local_irq_restore(flags);
4781         }
4782 }
4783 EXPORT_SYMBOL(napi_complete_done);
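/* Example (hypothetical driver sketch): the poll routine that pairs with the
 * scheduling above.  When less than the full budget is used, the driver
 * completes NAPI via napi_complete_done() (which may arm the GRO flush timer
 * configured through gro_flush_timeout) and re-enables device interrupts.
 * "example_clean_rx" is an assumption for the driver's RX processing.
 */
static int __maybe_unused example_poll(struct napi_struct *napi, int budget)
{
        int work_done = 0;

        /* work_done = example_clean_rx(napi, budget);
         * i.e. process up to "budget" packets, handing each one to
         * napi_gro_receive() or napi_gro_frags().
         */

        if (work_done < budget) {
                napi_complete_done(napi, work_done);
                /* re-enable the device's RX interrupt here */
        }

        return work_done;
}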
4784
4785 /* must be called under rcu_read_lock(), as we don't take a reference */
4786 struct napi_struct *napi_by_id(unsigned int napi_id)
4787 {
4788         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4789         struct napi_struct *napi;
4790
4791         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4792                 if (napi->napi_id == napi_id)
4793                         return napi;
4794
4795         return NULL;
4796 }
4797 EXPORT_SYMBOL_GPL(napi_by_id);
4798
4799 void napi_hash_add(struct napi_struct *napi)
4800 {
4801         if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
4802                 return;
4803
4804         spin_lock(&napi_hash_lock);
4805
4806         /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
4807         do {
4808                 if (unlikely(++napi_gen_id < NR_CPUS + 1))
4809                         napi_gen_id = NR_CPUS + 1;
4810         } while (napi_by_id(napi_gen_id));
4811         napi->napi_id = napi_gen_id;
4812
4813         hlist_add_head_rcu(&napi->napi_hash_node,
4814                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4815
4816         spin_unlock(&napi_hash_lock);
4817 }
4818 EXPORT_SYMBOL_GPL(napi_hash_add);
4819
4820 /* Warning: the caller is responsible for making sure an rcu grace period
4821  * has elapsed before freeing the memory containing @napi
4822  */
4823 void napi_hash_del(struct napi_struct *napi)
4824 {
4825         spin_lock(&napi_hash_lock);
4826
4827         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4828                 hlist_del_rcu(&napi->napi_hash_node);
4829
4830         spin_unlock(&napi_hash_lock);
4831 }
4832 EXPORT_SYMBOL_GPL(napi_hash_del);
4833
4834 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4835 {
4836         struct napi_struct *napi;
4837
4838         napi = container_of(timer, struct napi_struct, timer);
4839         if (napi->gro_list)
4840                 napi_schedule(napi);
4841
4842         return HRTIMER_NORESTART;
4843 }
4844
4845 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4846                     int (*poll)(struct napi_struct *, int), int weight)
4847 {
4848         INIT_LIST_HEAD(&napi->poll_list);
4849         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4850         napi->timer.function = napi_watchdog;
4851         napi->gro_count = 0;
4852         napi->gro_list = NULL;
4853         napi->skb = NULL;
4854         napi->poll = poll;
4855         if (weight > NAPI_POLL_WEIGHT)
4856                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4857                             weight, dev->name);
4858         napi->weight = weight;
4859         napi->dev = dev;
4860 #ifdef CONFIG_NETPOLL
4861         spin_lock_init(&napi->poll_lock);
4862         napi->poll_owner = -1;
4863 #endif
4864         set_bit(NAPI_STATE_SCHED, &napi->state);
4865         set_bit(NAPI_STATE_NPSVC, &napi->state);
4866         list_add_rcu(&napi->dev_list, &dev->napi_list);
4867 }
4868 EXPORT_SYMBOL(netif_napi_add);
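/* Example (hypothetical driver sketch): registering the NAPI context at probe
 * time, reusing the example_poll() sketch above.  NAPI_POLL_WEIGHT (64) is
 * the conventional weight; larger values only trigger the pr_err_once()
 * above.  netif_napi_add() leaves the context effectively disabled
 * (SCHED and NPSVC set), so the driver calls napi_enable() from its
 * ndo_open handler before traffic can be polled.
 */
struct example_netdev_priv {
        struct napi_struct napi;
};

static void __maybe_unused example_probe_napi(struct net_device *dev)
{
        struct example_netdev_priv *priv = netdev_priv(dev);

        netif_napi_add(dev, &priv->napi, example_poll, NAPI_POLL_WEIGHT);
}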
4869
4870 void napi_disable(struct napi_struct *n)
4871 {
4872         might_sleep();
4873         set_bit(NAPI_STATE_DISABLE, &n->state);
4874
4875         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4876                 msleep(1);
4877         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
4878                 msleep(1);
4879
4880         hrtimer_cancel(&n->timer);
4881
4882         clear_bit(NAPI_STATE_DISABLE, &n->state);
4883 }
4884 EXPORT_SYMBOL(napi_disable);
4885
4886 void netif_napi_del(struct napi_struct *napi)
4887 {
4888         list_del_init(&napi->dev_list);
4889         napi_free_frags(napi);
4890
4891         kfree_skb_list(napi->gro_list);
4892         napi->gro_list = NULL;
4893         napi->gro_count = 0;
4894 }
4895 EXPORT_SYMBOL(netif_napi_del);
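/* Example (hypothetical driver sketch): tear-down order for the same setup.
 * napi_disable() may sleep while it waits for an in-flight poll and cancels
 * the GRO flush timer; only then is it safe to unregister the context, which
 * also drops any pending gro_list skbs and napi->skb.
 */
static void __maybe_unused example_remove_napi(struct net_device *dev)
{
        struct example_netdev_priv *priv = netdev_priv(dev);

        napi_disable(&priv->napi);
        netif_napi_del(&priv->napi);
}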
4896
4897 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4898 {
4899         void *have;
4900         int work, weight;
4901
4902         list_del_init(&n->poll_list);
4903
4904         have = netpoll_poll_lock(n);
4905
4906         weight = n->weight;
4907
4908         /* This NAPI_STATE_SCHED test is for avoiding a race
4909          * with netpoll's poll_napi().  Only the entity which
4910          * obtains the lock and sees NAPI_STATE_SCHED set will
4911          * actually make the ->poll() call.  Therefore we avoid
4912          * accidentally calling ->poll() when NAPI is not scheduled.
4913          */
4914         work = 0;
4915         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4916                 work = n->poll(n, weight);
4917                 trace_napi_poll(n);
4918         }
4919
4920         WARN_ON_ONCE(work > weight);
4921
4922         if (likely(work < weight))
4923                 goto out_unlock;
4924
4925         /* Drivers must not modify the NAPI state if they
4926          * consume the entire weight.  In such cases this code
4927          * still "owns" the NAPI instance and therefore can
4928          * move the instance around on the list at-will.
4929          */
4930         if (unlikely(napi_disable_pending(n))) {
4931                 napi_complete(n);
4932                 goto out_unlock;
4933         }
4934
4935         if (n->gro_list) {
4936                 /* Flush packets that are too old.
4937                  * If HZ < 1000, flush all packets.
4938                  */
4939                 napi_gro_flush(n, HZ >= 1000);
4940         }
4941
4942         /* Some drivers may have called napi_schedule
4943          * prior to exhausting their budget.
4944          */
4945         if (unlikely(!list_empty(&n->poll_list))) {
4946                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4947                              n->dev ? n->dev->name : "backlog");
4948                 goto out_unlock;
4949         }
4950
4951         list_add_tail(&n->poll_list, repoll);
4952
4953 out_unlock:
4954         netpoll_poll_unlock(have);
4955
4956         return work;
4957 }
4958
4959 static void net_rx_action(struct softirq_action *h)
4960 {
4961         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4962         unsigned long time_limit = jiffies + 2;
4963         int budget = netdev_budget;
4964         LIST_HEAD(list);
4965         LIST_HEAD(repoll);
4966
4967         local_irq_disable();
4968         list_splice_init(&sd->poll_list, &list);
4969         local_irq_enable();
4970
4971         for (;;) {
4972                 struct napi_struct *n;
4973
4974                 if (list_empty(&list)) {
4975                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4976                                 return;
4977                         break;
4978                 }
4979
4980                 n = list_first_entry(&list, struct napi_struct, poll_list);
4981                 budget -= napi_poll(n, &repoll);
4982
4983                 /* If softirq window is exhausted then punt.
4984                  * Allow this to run for 2 jiffies, which will allow
4985                  * an average latency of 1.5/HZ.
4986                  */
4987                 if (unlikely(budget <= 0 ||
4988                              time_after_eq(jiffies, time_limit))) {
4989                         sd->time_squeeze++;
4990                         break;
4991                 }
4992         }
4993
4994         local_irq_disable();
4995
4996         list_splice_tail_init(&sd->poll_list, &list);
4997         list_splice_tail(&repoll, &list);
4998         list_splice(&list, &sd->poll_list);
4999         if (!list_empty(&sd->poll_list))
5000                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5001
5002         net_rps_action_and_irq_enable(sd);
5003 }
5004
5005 struct netdev_adjacent {
5006         struct net_device *dev;
5007
5008         /* upper master flag; there can only be one master device per list */
5009         bool master;
5010
5011         /* counter for the number of times this device was added to us */
5012         u16 ref_nr;
5013
5014         /* private field for the users */
5015         void *private;
5016
5017         struct list_head list;
5018         struct rcu_head rcu;
5019 };
5020
5021 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5022                                                  struct list_head *adj_list)
5023 {
5024         struct netdev_adjacent *adj;
5025
5026         list_for_each_entry(adj, adj_list, list) {
5027                 if (adj->dev == adj_dev)
5028                         return adj;
5029         }
5030         return NULL;
5031 }
5032
5033 /**
5034  * netdev_has_upper_dev - Check if device is linked to an upper device
5035  * @dev: device
5036  * @upper_dev: upper device to check
5037  *
5038  * Find out if a device is linked to the specified upper device and return
5039  * true if it is. Note that this checks only the immediate upper device,
5040  * not the complete stack of devices. The caller must hold the RTNL lock.
5041  */
5042 bool netdev_has_upper_dev(struct net_device *dev,
5043                           struct net_device *upper_dev)
5044 {
5045         ASSERT_RTNL();
5046
5047         return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5048 }
5049 EXPORT_SYMBOL(netdev_has_upper_dev);
5050
5051 /**
5052  * netdev_has_any_upper_dev - Check if device is linked to some device
5053  * @dev: device
5054  *
5055  * Find out if a device is linked to an upper device and return true in case
5056  * it is. The caller must hold the RTNL lock.
5057  */
5058 static bool netdev_has_any_upper_dev(struct net_device *dev)
5059 {
5060         ASSERT_RTNL();
5061
5062         return !list_empty(&dev->all_adj_list.upper);
5063 }
5064
5065 /**
5066  * netdev_master_upper_dev_get - Get master upper device
5067  * @dev: device
5068  *
5069  * Find a master upper device and return pointer to it or NULL in case
5070  * it's not there. The caller must hold the RTNL lock.
5071  */
5072 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5073 {
5074         struct netdev_adjacent *upper;
5075
5076         ASSERT_RTNL();
5077
5078         if (list_empty(&dev->adj_list.upper))
5079                 return NULL;
5080
5081         upper = list_first_entry(&dev->adj_list.upper,
5082                                  struct netdev_adjacent, list);
5083         if (likely(upper->master))
5084                 return upper->dev;
5085         return NULL;
5086 }
5087 EXPORT_SYMBOL(netdev_master_upper_dev_get);
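/* Example (hypothetical sketch): querying the adjacency lists under RTNL,
 * e.g. from a notifier or ndo callback.  Returns true when @dev is directly
 * linked to @candidate and @candidate is its master.
 */
static bool __maybe_unused example_is_enslaved_to(struct net_device *dev,
                                                  struct net_device *candidate)
{
        ASSERT_RTNL();

        return netdev_has_upper_dev(dev, candidate) &&
               netdev_master_upper_dev_get(dev) == candidate;
}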
5088
5089 void *netdev_adjacent_get_private(struct list_head *adj_list)
5090 {
5091         struct netdev_adjacent *adj;
5092
5093         adj = list_entry(adj_list, struct netdev_adjacent, list);
5094
5095         return adj->private;
5096 }
5097 EXPORT_SYMBOL(netdev_adjacent_get_private);
5098
5099 /**
5100  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5101  * @dev: device
5102  * @iter: list_head ** of the current position
5103  *
5104  * Gets the next device from the dev's upper list, starting from iter
5105  * position. The caller must hold RCU read lock.
5106  */
5107 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5108                                                  struct list_head **iter)
5109 {
5110         struct netdev_adjacent *upper;
5111
5112         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5113
5114         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5115
5116         if (&upper->list == &dev->adj_list.upper)
5117                 return NULL;
5118
5119         *iter = &upper->list;
5120
5121         return upper->dev;
5122 }
5123 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
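/* Example (hypothetical sketch): walking the immediate upper devices under
 * the RCU read lock with the iterator above; the iterator starts at the list
 * head and the walk ends when the list wraps back around to it.
 */
static unsigned int __maybe_unused example_count_uppers(struct net_device *dev)
{
        struct list_head *iter = &dev->adj_list.upper;
        struct net_device *upper;
        unsigned int n = 0;

        rcu_read_lock();
        while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
                n++;
        rcu_read_unlock();

        return n;
}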
5124
5125 /**
5126  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5127  * @dev: device
5128  * @iter: list_head ** of the current position
5129  *
5130  * Gets the next device from the dev's upper list, starting from iter
5131  * position. The caller must hold RCU read lock.
5132  */
5133 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5134                                                      struct list_head **iter)
5135 {
5136         struct netdev_adjacent *upper;
5137
5138         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5139
5140         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5141
5142         if (&upper->list == &dev->all_adj_list.upper)
5143                 return NULL;
5144
5145         *iter = &upper->list;
5146
5147         return upper->dev;
5148 }
5149 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5150
5151 /**
5152  * netdev_lower_get_next_private - Get the next ->private from the
5153  *                                 lower neighbour list
5154  * @dev: device
5155  * @iter: list_head ** of the current position
5156  *
5157  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5158  * list, starting from iter position. The caller must hold either the
5159  * RTNL lock or its own locking that guarantees that the neighbour lower
5160  * list will remain unchanged.
5161  */
5162 void *netdev_lower_get_next_private(struct net_device *dev,
5163                                     struct list_head **iter)
5164 {
5165         struct netdev_adjacent *lower;
5166
5167         lower = list_entry(*iter, struct netdev_adjacent, list);
5168
5169         if (&lower->list == &dev->adj_list.lower)
5170                 return NULL;
5171
5172         *iter = lower->list.next;
5173
5174         return lower->private;
5175 }
5176 EXPORT_SYMBOL(netdev_lower_get_next_private);
5177
5178 /**
5179  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5180  *                                     lower neighbour list, RCU
5181  *                                     variant
5182  * @dev: device
5183  * @iter: list_head ** of the current position
5184  *
5185  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5186  * list, starting from iter position. The caller must hold RCU read lock.
5187  */
5188 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5189                                         struct list_head **iter)
5190 {
5191         struct netdev_adjacent *lower;
5192
5193         WARN_ON_ONCE(!rcu_read_lock_held());
5194
5195         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5196
5197         if (&lower->list == &dev->adj_list.lower)
5198                 return NULL;
5199
5200         *iter = &lower->list;
5201
5202         return lower->private;
5203 }
5204 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5205
5206 /**
5207  * netdev_lower_get_next - Get the next device from the lower neighbour
5208  *                         list
5209  * @dev: device
5210  * @iter: list_head ** of the current position
5211  *
5212  * Gets the next netdev_adjacent from the dev's lower neighbour
5213  * list, starting from iter position. The caller must hold RTNL lock or
5214  * its own locking that guarantees that the neighbour lower
5215  * list will remain unchanged.
5216  */
5217 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5218 {
5219         struct netdev_adjacent *lower;
5220
5221         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5222
5223         if (&lower->list == &dev->adj_list.lower)
5224                 return NULL;
5225
5226         *iter = &lower->list;
5227
5228         return lower->dev;
5229 }
5230 EXPORT_SYMBOL(netdev_lower_get_next);
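/* Example (hypothetical sketch): walking the immediate lower devices (e.g.
 * the slaves of a bond or the ports of a bridge) with the iterator above,
 * under RTNL.  The netdev_for_each_lower_dev() helper used by
 * dev_get_nest_level() below expands to the same loop.
 */
static void __maybe_unused example_walk_lowers(struct net_device *dev)
{
        struct list_head *iter = &dev->adj_list.lower;
        struct net_device *lower;

        ASSERT_RTNL();

        while ((lower = netdev_lower_get_next(dev, &iter)) != NULL)
                netdev_dbg(dev, "lower device %s\n", lower->name);
}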
5231
5232 /**
5233  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5234  *                                     lower neighbour list, RCU
5235  *                                     variant
5236  * @dev: device
5237  *
5238  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5239  * list. The caller must hold RCU read lock.
5240  */
5241 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5242 {
5243         struct netdev_adjacent *lower;
5244
5245         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5246                         struct netdev_adjacent, list);
5247         if (lower)
5248                 return lower->private;
5249         return NULL;
5250 }
5251 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5252
5253 /**
5254  * netdev_master_upper_dev_get_rcu - Get master upper device
5255  * @dev: device
5256  *
5257  * Find a master upper device and return pointer to it or NULL in case
5258  * it's not there. The caller must hold the RCU read lock.
5259  */
5260 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5261 {
5262         struct netdev_adjacent *upper;
5263
5264         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5265                                        struct netdev_adjacent, list);
5266         if (upper && likely(upper->master))
5267                 return upper->dev;
5268         return NULL;
5269 }
5270 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5271
5272 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5273                               struct net_device *adj_dev,
5274                               struct list_head *dev_list)
5275 {
5276         char linkname[IFNAMSIZ+7];
5277         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5278                 "upper_%s" : "lower_%s", adj_dev->name);
5279         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5280                                  linkname);
5281 }
5282 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5283                                char *name,
5284                                struct list_head *dev_list)
5285 {
5286         char linkname[IFNAMSIZ+7];
5287         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5288                 "upper_%s" : "lower_%s", name);
5289         sysfs_remove_link(&(dev->dev.kobj), linkname);
5290 }
5291
5292 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5293                                                  struct net_device *adj_dev,
5294                                                  struct list_head *dev_list)
5295 {
5296         return (dev_list == &dev->adj_list.upper ||
5297                 dev_list == &dev->adj_list.lower) &&
5298                 net_eq(dev_net(dev), dev_net(adj_dev));
5299 }
5300
5301 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5302                                         struct net_device *adj_dev,
5303                                         u16 ref_nr,
5304                                         struct list_head *dev_list,
5305                                         void *private, bool master)
5306 {
5307         struct netdev_adjacent *adj;
5308         int ret;
5309
5310         adj = __netdev_find_adj(adj_dev, dev_list);
5311
5312         if (adj) {
5313                 adj->ref_nr += ref_nr;
5314                 return 0;
5315         }
5316
5317         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5318         if (!adj)
5319                 return -ENOMEM;
5320
5321         adj->dev = adj_dev;
5322         adj->master = master;
5323         adj->ref_nr = ref_nr;
5324         adj->private = private;
5325         dev_hold(adj_dev);
5326
5327         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5328                  adj_dev->name, dev->name, adj_dev->name);
5329
5330         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5331                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5332                 if (ret)
5333                         goto free_adj;
5334         }
5335
5336         /* Ensure that master link is always the first item in list. */
5337         if (master) {
5338                 ret = sysfs_create_link(&(dev->dev.kobj),
5339                                         &(adj_dev->dev.kobj), "master");
5340                 if (ret)
5341                         goto remove_symlinks;
5342
5343                 list_add_rcu(&adj->list, dev_list);
5344         } else {
5345                 list_add_tail_rcu(&adj->list, dev_list);
5346         }
5347
5348         return 0;
5349
5350 remove_symlinks:
5351         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5352                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5353 free_adj:
5354         kfree(adj);
5355         dev_put(adj_dev);
5356
5357         return ret;
5358 }
5359
5360 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5361                                          struct net_device *adj_dev,
5362                                          u16 ref_nr,
5363                                          struct list_head *dev_list)
5364 {
5365         struct netdev_adjacent *adj;
5366
5367         adj = __netdev_find_adj(adj_dev, dev_list);
5368
5369         if (!adj) {
5370                 pr_err("tried to remove device %s from %s\n",
5371                        dev->name, adj_dev->name);
5372                 BUG();
5373         }
5374
5375         if (adj->ref_nr > ref_nr) {
5376                 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
5377                          ref_nr, adj->ref_nr-ref_nr);
5378                 adj->ref_nr -= ref_nr;
5379                 return;
5380         }
5381
5382         if (adj->master)
5383                 sysfs_remove_link(&(dev->dev.kobj), "master");
5384
5385         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5386                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5387
5388         list_del_rcu(&adj->list);
5389         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5390                  adj_dev->name, dev->name, adj_dev->name);
5391         dev_put(adj_dev);
5392         kfree_rcu(adj, rcu);
5393 }
5394
5395 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5396                                             struct net_device *upper_dev,
5397                                             u16 ref_nr,
5398                                             struct list_head *up_list,
5399                                             struct list_head *down_list,
5400                                             void *private, bool master)
5401 {
5402         int ret;
5403
5404         ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5405                                            private, master);
5406         if (ret)
5407                 return ret;
5408
5409         ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5410                                            private, false);
5411         if (ret) {
5412                 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5413                 return ret;
5414         }
5415
5416         return 0;
5417 }
5418
5419 static int __netdev_adjacent_dev_link(struct net_device *dev,
5420                                       struct net_device *upper_dev,
5421                                       u16 ref_nr)
5422 {
5423         return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5424                                                 &dev->all_adj_list.upper,
5425                                                 &upper_dev->all_adj_list.lower,
5426                                                 NULL, false);
5427 }
5428
5429 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5430                                                struct net_device *upper_dev,
5431                                                u16 ref_nr,
5432                                                struct list_head *up_list,
5433                                                struct list_head *down_list)
5434 {
5435         __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5436         __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5437 }
5438
5439 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5440                                          struct net_device *upper_dev,
5441                                          u16 ref_nr)
5442 {
5443         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5444                                            &dev->all_adj_list.upper,
5445                                            &upper_dev->all_adj_list.lower);
5446 }
5447
5448 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5449                                                 struct net_device *upper_dev,
5450                                                 void *private, bool master)
5451 {
5452         int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5453
5454         if (ret)
5455                 return ret;
5456
5457         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5458                                                &dev->adj_list.upper,
5459                                                &upper_dev->adj_list.lower,
5460                                                private, master);
5461         if (ret) {
5462                 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5463                 return ret;
5464         }
5465
5466         return 0;
5467 }
5468
5469 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5470                                                    struct net_device *upper_dev)
5471 {
5472         __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5473         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5474                                            &dev->adj_list.upper,
5475                                            &upper_dev->adj_list.lower);
5476 }
5477
5478 static int __netdev_upper_dev_link(struct net_device *dev,
5479                                    struct net_device *upper_dev, bool master,
5480                                    void *private)
5481 {
5482         struct netdev_notifier_changeupper_info changeupper_info;
5483         struct netdev_adjacent *i, *j, *to_i, *to_j;
5484         int ret = 0;
5485
5486         ASSERT_RTNL();
5487
5488         if (dev == upper_dev)
5489                 return -EBUSY;
5490
5491         /* To prevent loops, check if dev is not upper device to upper_dev. */
5492         if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5493                 return -EBUSY;
5494
5495         if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5496                 return -EEXIST;
5497
5498         if (master && netdev_master_upper_dev_get(dev))
5499                 return -EBUSY;
5500
5501         changeupper_info.upper_dev = upper_dev;
5502         changeupper_info.master = master;
5503         changeupper_info.linking = true;
5504
5505         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5506                                             &changeupper_info.info);
5507         ret = notifier_to_errno(ret);
5508         if (ret)
5509                 return ret;
5510
5511         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5512                                                    master);
5513         if (ret)
5514                 return ret;
5515
5516         /* Now that we linked these devs, make all the upper_dev's
5517          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5518          * vice versa, and don't forget the devices themselves. All of these
5519          * links are non-neighbours.
5520          */
5521         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5522                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5523                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5524                                  i->dev->name, j->dev->name);
5525                         ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5526                         if (ret)
5527                                 goto rollback_mesh;
5528                 }
5529         }
5530
5531         /* add dev to every upper_dev's upper device */
5532         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5533                 pr_debug("linking %s's upper device %s with %s\n",
5534                          upper_dev->name, i->dev->name, dev->name);
5535                 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5536                 if (ret)
5537                         goto rollback_upper_mesh;
5538         }
5539
5540         /* add upper_dev to every dev's lower device */
5541         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5542                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5543                          i->dev->name, upper_dev->name);
5544                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5545                 if (ret)
5546                         goto rollback_lower_mesh;
5547         }
5548
5549         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5550                                       &changeupper_info.info);
5551         return 0;
5552
5553 rollback_lower_mesh:
5554         to_i = i;
5555         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5556                 if (i == to_i)
5557                         break;
5558                 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5559         }
5560
5561         i = NULL;
5562
5563 rollback_upper_mesh:
5564         to_i = i;
5565         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5566                 if (i == to_i)
5567                         break;
5568                 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5569         }
5570
5571         i = j = NULL;
5572
5573 rollback_mesh:
5574         to_i = i;
5575         to_j = j;
5576         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5577                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5578                         if (i == to_i && j == to_j)
5579                                 break;
5580                         __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5581                 }
5582                 if (i == to_i)
5583                         break;
5584         }
5585
5586         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5587
5588         return ret;
5589 }
5590
5591 /**
5592  * netdev_upper_dev_link - Add a link to the upper device
5593  * @dev: device
5594  * @upper_dev: new upper device
5595  *
5596  * Adds a link to device which is upper to this one. The caller must hold
5597  * the RTNL lock. On a failure a negative errno code is returned.
5598  * On success the reference counts are adjusted and the function
5599  * returns zero.
5600  */
5601 int netdev_upper_dev_link(struct net_device *dev,
5602                           struct net_device *upper_dev)
5603 {
5604         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5605 }
5606 EXPORT_SYMBOL(netdev_upper_dev_link);
5607
5608 /**
5609  * netdev_master_upper_dev_link - Add a master link to the upper device
5610  * @dev: device
5611  * @upper_dev: new upper device
5612  *
5613  * Adds a link to device which is upper to this one. In this case, only
5614  * one master upper device can be linked, although other non-master devices
5615  * might be linked as well. The caller must hold the RTNL lock.
5616  * On a failure a negative errno code is returned. On success the reference
5617  * counts are adjusted and the function returns zero.
5618  */
5619 int netdev_master_upper_dev_link(struct net_device *dev,
5620                                  struct net_device *upper_dev)
5621 {
5622         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5623 }
5624 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5625
5626 int netdev_master_upper_dev_link_private(struct net_device *dev,
5627                                          struct net_device *upper_dev,
5628                                          void *private)
5629 {
5630         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5631 }
5632 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
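/* Example (hypothetical sketch): how a bonding/bridge-like master takes and
 * releases a slave.  Both operations require RTNL; the _private variant
 * stores a per-slave pointer that can later be fetched with
 * netdev_lower_dev_get_private().  "example_slave_state" is an assumption.
 */
struct example_slave_state {
        int id;
};

static int __maybe_unused example_enslave(struct net_device *master,
                                          struct net_device *slave,
                                          struct example_slave_state *state)
{
        /* May fail with -EBUSY (loop or existing master), -EEXIST or a
         * notifier veto; the caller must be prepared to back out.
         */
        return netdev_master_upper_dev_link_private(slave, master, state);
}

static void __maybe_unused example_release(struct net_device *master,
                                           struct net_device *slave)
{
        netdev_upper_dev_unlink(slave, master);
}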
5633
5634 /**
5635  * netdev_upper_dev_unlink - Removes a link to upper device
5636  * @dev: device
5637  * @upper_dev: upper device to unlink
5638  *
5639  * Removes a link to device which is upper to this one. The caller must hold
5640  * the RTNL lock.
5641  */
5642 void netdev_upper_dev_unlink(struct net_device *dev,
5643                              struct net_device *upper_dev)
5644 {
5645         struct netdev_notifier_changeupper_info changeupper_info;
5646         struct netdev_adjacent *i, *j;
5647         ASSERT_RTNL();
5648
5649         changeupper_info.upper_dev = upper_dev;
5650         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5651         changeupper_info.linking = false;
5652
5653         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5654                                       &changeupper_info.info);
5655
5656         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5657
5658         /* Here is the tricky part. We must remove all dev's lower
5659          * devices from all upper_dev's upper devices and vice
5660          * versa, to maintain the graph relationship.
5661          */
5662         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5663                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5664                         __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5665
5666         /* also remove the devices themselves from the lower/upper
5667          * device lists
5668          */
5669         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5670                 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5671
5672         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5673                 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5674
5675         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5676                                       &changeupper_info.info);
5677 }
5678 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5679
5680 /**
5681  * netdev_bonding_info_change - Dispatch event about slave change
5682  * @dev: device
5683  * @bonding_info: info to dispatch
5684  *
5685  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5686  * The caller must hold the RTNL lock.
5687  */
5688 void netdev_bonding_info_change(struct net_device *dev,
5689                                 struct netdev_bonding_info *bonding_info)
5690 {
5691         struct netdev_notifier_bonding_info     info;
5692
5693         memcpy(&info.bonding_info, bonding_info,
5694                sizeof(struct netdev_bonding_info));
5695         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5696                                       &info.info);
5697 }
5698 EXPORT_SYMBOL(netdev_bonding_info_change);
5699
5700 static void netdev_adjacent_add_links(struct net_device *dev)
5701 {
5702         struct netdev_adjacent *iter;
5703
5704         struct net *net = dev_net(dev);
5705
5706         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5707                 if (!net_eq(net,dev_net(iter->dev)))
5708                         continue;
5709                 netdev_adjacent_sysfs_add(iter->dev, dev,
5710                                           &iter->dev->adj_list.lower);
5711                 netdev_adjacent_sysfs_add(dev, iter->dev,
5712                                           &dev->adj_list.upper);
5713         }
5714
5715         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5716                 if (!net_eq(net,dev_net(iter->dev)))
5717                         continue;
5718                 netdev_adjacent_sysfs_add(iter->dev, dev,
5719                                           &iter->dev->adj_list.upper);
5720                 netdev_adjacent_sysfs_add(dev, iter->dev,
5721                                           &dev->adj_list.lower);
5722         }
5723 }
5724
5725 static void netdev_adjacent_del_links(struct net_device *dev)
5726 {
5727         struct netdev_adjacent *iter;
5728
5729         struct net *net = dev_net(dev);
5730
5731         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5732                 if (!net_eq(net,dev_net(iter->dev)))
5733                         continue;
5734                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5735                                           &iter->dev->adj_list.lower);
5736                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5737                                           &dev->adj_list.upper);
5738         }
5739
5740         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5741                 if (!net_eq(net,dev_net(iter->dev)))
5742                         continue;
5743                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5744                                           &iter->dev->adj_list.upper);
5745                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5746                                           &dev->adj_list.lower);
5747         }
5748 }
5749
5750 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5751 {
5752         struct netdev_adjacent *iter;
5753
5754         struct net *net = dev_net(dev);
5755
5756         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5757                 if (!net_eq(net,dev_net(iter->dev)))
5758                         continue;
5759                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5760                                           &iter->dev->adj_list.lower);
5761                 netdev_adjacent_sysfs_add(iter->dev, dev,
5762                                           &iter->dev->adj_list.lower);
5763         }
5764
5765         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5766                 if (!net_eq(net,dev_net(iter->dev)))
5767                         continue;
5768                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5769                                           &iter->dev->adj_list.upper);
5770                 netdev_adjacent_sysfs_add(iter->dev, dev,
5771                                           &iter->dev->adj_list.upper);
5772         }
5773 }
5774
5775 void *netdev_lower_dev_get_private(struct net_device *dev,
5776                                    struct net_device *lower_dev)
5777 {
5778         struct netdev_adjacent *lower;
5779
5780         if (!lower_dev)
5781                 return NULL;
5782         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5783         if (!lower)
5784                 return NULL;
5785
5786         return lower->private;
5787 }
5788 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5789
5790
5791 int dev_get_nest_level(struct net_device *dev,
5792                        bool (*type_check)(struct net_device *dev))
5793 {
5794         struct net_device *lower = NULL;
5795         struct list_head *iter;
5796         int max_nest = -1;
5797         int nest;
5798
5799         ASSERT_RTNL();
5800
5801         netdev_for_each_lower_dev(dev, lower, iter) {
5802                 nest = dev_get_nest_level(lower, type_check);
5803                 if (max_nest < nest)
5804                         max_nest = nest;
5805         }
5806
5807         if (type_check(dev))
5808                 max_nest++;
5809
5810         return max_nest;
5811 }
5812 EXPORT_SYMBOL(dev_get_nest_level);
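/* Example (hypothetical sketch): dev_get_nest_level() is typically used to
 * pick a lockdep subclass for stackable devices.  The predicate below is an
 * assumption standing in for a real check such as is_vlan_dev(); it counts
 * how deeply VLAN-like devices are nested under @dev.  RTNL must be held.
 */
static bool __maybe_unused example_is_vlan_like(struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_802_1Q_VLAN);
}

static int __maybe_unused example_nest_level(struct net_device *dev)
{
        return dev_get_nest_level(dev, example_is_vlan_like);
}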
5813
5814 static void dev_change_rx_flags(struct net_device *dev, int flags)
5815 {
5816         const struct net_device_ops *ops = dev->netdev_ops;
5817
5818         if (ops->ndo_change_rx_flags)
5819                 ops->ndo_change_rx_flags(dev, flags);
5820 }
5821
5822 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5823 {
5824         unsigned int old_flags = dev->flags;
5825         kuid_t uid;
5826         kgid_t gid;
5827
5828         ASSERT_RTNL();
5829
5830         dev->flags |= IFF_PROMISC;
5831         dev->promiscuity += inc;
5832         if (dev->promiscuity == 0) {
5833                 /*
5834                  * Avoid overflow.
5835                  * If inc causes overflow, leave promiscuity untouched and return an error.
5836                  */
5837                 if (inc < 0)
5838                         dev->flags &= ~IFF_PROMISC;
5839                 else {
5840                         dev->promiscuity -= inc;
5841                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5842                                 dev->name);
5843                         return -EOVERFLOW;
5844                 }
5845         }
5846         if (dev->flags != old_flags) {
5847                 pr_info("device %s %s promiscuous mode\n",
5848                         dev->name,
5849                         dev->flags & IFF_PROMISC ? "entered" : "left");
5850                 if (audit_enabled) {
5851                         current_uid_gid(&uid, &gid);
5852                         audit_log(current->audit_context, GFP_ATOMIC,
5853                                 AUDIT_ANOM_PROMISCUOUS,
5854                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5855                                 dev->name, (dev->flags & IFF_PROMISC),
5856                                 (old_flags & IFF_PROMISC),
5857                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5858                                 from_kuid(&init_user_ns, uid),
5859                                 from_kgid(&init_user_ns, gid),
5860                                 audit_get_sessionid(current));
5861                 }
5862
5863                 dev_change_rx_flags(dev, IFF_PROMISC);
5864         }
5865         if (notify)
5866                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5867         return 0;
5868 }
5869
5870 /**
5871  *      dev_set_promiscuity     - update promiscuity count on a device
5872  *      @dev: device
5873  *      @inc: modifier
5874  *
5875  *      Add or remove promiscuity from a device. While the count in the device
5876  *      remains above zero the interface remains promiscuous. Once it hits zero
5877  *      the device reverts back to normal filtering operation. A negative inc
5878  *      value is used to drop promiscuity on the device.
5879  *      Return 0 if successful or a negative errno code on error.
5880  */
5881 int dev_set_promiscuity(struct net_device *dev, int inc)
5882 {
5883         unsigned int old_flags = dev->flags;
5884         int err;
5885
5886         err = __dev_set_promiscuity(dev, inc, true);
5887         if (err < 0)
5888                 return err;
5889         if (dev->flags != old_flags)
5890                 dev_set_rx_mode(dev);
5891         return err;
5892 }
5893 EXPORT_SYMBOL(dev_set_promiscuity);
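/* Example (hypothetical sketch): a packet-capture style user bumping the
 * promiscuity count while capturing.  The count behaves like a reference
 * count, so every +1 must eventually be paired with a -1, and both calls
 * need RTNL.
 */
static int __maybe_unused example_capture_start(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);
        rtnl_unlock();

        return err;
}

static void __maybe_unused example_capture_stop(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);
        rtnl_unlock();
}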
5894
5895 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5896 {
5897         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5898
5899         ASSERT_RTNL();
5900
5901         dev->flags |= IFF_ALLMULTI;
5902         dev->allmulti += inc;
5903         if (dev->allmulti == 0) {
5904                 /*
5905                  * Avoid overflow.
5906                  * If inc causes overflow, leave allmulti untouched and return an error.
5907                  */
5908                 if (inc < 0)
5909                         dev->flags &= ~IFF_ALLMULTI;
5910                 else {
5911                         dev->allmulti -= inc;
5912                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5913                                 dev->name);
5914                         return -EOVERFLOW;
5915                 }
5916         }
5917         if (dev->flags ^ old_flags) {
5918                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5919                 dev_set_rx_mode(dev);
5920                 if (notify)
5921                         __dev_notify_flags(dev, old_flags,
5922                                            dev->gflags ^ old_gflags);
5923         }
5924         return 0;
5925 }
5926
5927 /**
5928  *      dev_set_allmulti        - update allmulti count on a device
5929  *      @dev: device
5930  *      @inc: modifier
5931  *
5932  *      Add or remove reception of all multicast frames to a device. While the
5933  *      count in the device remains above zero the interface keeps listening
5934  *      to all multicast frames. Once it hits zero the device reverts to normal
5935  *      filtering operation. A negative @inc value is used to drop the counter
5936  *      when releasing a resource needing all multicasts.
5937  *      Return 0 if successful or a negative errno code on error.
5938  */
5939
5940 int dev_set_allmulti(struct net_device *dev, int inc)
5941 {
5942         return __dev_set_allmulti(dev, inc, true);
5943 }
5944 EXPORT_SYMBOL(dev_set_allmulti);
5945
5946 /*
5947  *      Upload unicast and multicast address lists to device and
5948  *      configure RX filtering. When the device doesn't support unicast
5949  *      filtering it is put in promiscuous mode while unicast addresses
5950  *      are present.
5951  */
5952 void __dev_set_rx_mode(struct net_device *dev)
5953 {
5954         const struct net_device_ops *ops = dev->netdev_ops;
5955
5956         /* dev_open will call this function so the list will stay sane. */
5957         if (!(dev->flags&IFF_UP))
5958                 return;
5959
5960         if (!netif_device_present(dev))
5961                 return;
5962
5963         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5964                 /* Unicast address changes may only happen under the rtnl,
5965                  * therefore calling __dev_set_promiscuity here is safe.
5966                  */
5967                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5968                         __dev_set_promiscuity(dev, 1, false);
5969                         dev->uc_promisc = true;
5970                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5971                         __dev_set_promiscuity(dev, -1, false);
5972                         dev->uc_promisc = false;
5973                 }
5974         }
5975
5976         if (ops->ndo_set_rx_mode)
5977                 ops->ndo_set_rx_mode(dev);
5978 }
5979
5980 void dev_set_rx_mode(struct net_device *dev)
5981 {
5982         netif_addr_lock_bh(dev);
5983         __dev_set_rx_mode(dev);
5984         netif_addr_unlock_bh(dev);
5985 }
5986
5987 /**
5988  *      dev_get_flags - get flags reported to userspace
5989  *      @dev: device
5990  *
5991  *      Get the combination of flag bits exported through APIs to userspace.
5992  */
5993 unsigned int dev_get_flags(const struct net_device *dev)
5994 {
5995         unsigned int flags;
5996
5997         flags = (dev->flags & ~(IFF_PROMISC |
5998                                 IFF_ALLMULTI |
5999                                 IFF_RUNNING |
6000                                 IFF_LOWER_UP |
6001                                 IFF_DORMANT)) |
6002                 (dev->gflags & (IFF_PROMISC |
6003                                 IFF_ALLMULTI));
6004
6005         if (netif_running(dev)) {
6006                 if (netif_oper_up(dev))
6007                         flags |= IFF_RUNNING;
6008                 if (netif_carrier_ok(dev))
6009                         flags |= IFF_LOWER_UP;
6010                 if (netif_dormant(dev))
6011                         flags |= IFF_DORMANT;
6012         }
6013
6014         return flags;
6015 }
6016 EXPORT_SYMBOL(dev_get_flags);
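/* Example (hypothetical sketch): dev_get_flags() returns the userspace view
 * of the flags (what SIOCGIFFLAGS and rtnetlink report), with IFF_RUNNING,
 * IFF_LOWER_UP and IFF_DORMANT synthesized from operstate and carrier.
 */
static bool __maybe_unused example_link_is_usable(const struct net_device *dev)
{
        unsigned int flags = dev_get_flags(dev);

        return (flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING);
}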
6017
6018 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6019 {
6020         unsigned int old_flags = dev->flags;
6021         int ret;
6022
6023         ASSERT_RTNL();
6024
6025         /*
6026          *      Set the flags on our device.
6027          */
6028
6029         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6030                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6031                                IFF_AUTOMEDIA)) |
6032                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6033                                     IFF_ALLMULTI));
6034
6035         /*
6036          *      Load in the correct multicast list now that the flags have changed.
6037          */
6038
6039         if ((old_flags ^ flags) & IFF_MULTICAST)
6040                 dev_change_rx_flags(dev, IFF_MULTICAST);
6041
6042         dev_set_rx_mode(dev);
6043
6044         /*
6045          *      Have we downed the interface? We handle IFF_UP ourselves
6046          *      according to user attempts to set it, rather than blindly
6047          *      setting it.
6048          */
6049
6050         ret = 0;
6051         if ((old_flags ^ flags) & IFF_UP)
6052                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6053
6054         if ((flags ^ dev->gflags) & IFF_PROMISC) {
6055                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6056                 unsigned int old_flags = dev->flags;
6057
6058                 dev->gflags ^= IFF_PROMISC;
6059
6060                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6061                         if (dev->flags != old_flags)
6062                                 dev_set_rx_mode(dev);
6063         }
6064
6065         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6066            is important. Some (broken) drivers set IFF_PROMISC when
6067            IFF_ALLMULTI is requested, without asking us and without reporting it.
6068          */
6069         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6070                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6071
6072                 dev->gflags ^= IFF_ALLMULTI;
6073                 __dev_set_allmulti(dev, inc, false);
6074         }
6075
6076         return ret;
6077 }
6078
6079 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6080                         unsigned int gchanges)
6081 {
6082         unsigned int changes = dev->flags ^ old_flags;
6083
6084         if (gchanges)
6085                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6086
6087         if (changes & IFF_UP) {
6088                 if (dev->flags & IFF_UP)
6089                         call_netdevice_notifiers(NETDEV_UP, dev);
6090                 else
6091                         call_netdevice_notifiers(NETDEV_DOWN, dev);
6092         }
6093
6094         if (dev->flags & IFF_UP &&
6095             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6096                 struct netdev_notifier_change_info change_info;
6097
6098                 change_info.flags_changed = changes;
6099                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6100                                               &change_info.info);
6101         }
6102 }
6103
6104 /**
6105  *      dev_change_flags - change device settings
6106  *      @dev: device
6107  *      @flags: device state flags
6108  *
6109  *      Change settings on a device based on the given state flags. The flags are
6110  *      in the userspace exported format.
6111  */
6112 int dev_change_flags(struct net_device *dev, unsigned int flags)
6113 {
6114         int ret;
6115         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6116
6117         ret = __dev_change_flags(dev, flags);
6118         if (ret < 0)
6119                 return ret;
6120
6121         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6122         __dev_notify_flags(dev, old_flags, changes);
6123         return ret;
6124 }
6125 EXPORT_SYMBOL(dev_change_flags);
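
/*
 * Illustrative sketch (not part of this file): bringing an interface up by
 * name with dev_change_flags().  RTNL must be held, which also makes the
 * unreferenced __dev_get_by_name() lookup safe.  The helper name
 * example_set_iface_up() is an assumption made for the example only.
 */
#if 0
static int example_set_iface_up(struct net *net, const char *ifname)
{
        struct net_device *dev;
        int err;

        rtnl_lock();
        dev = __dev_get_by_name(net, ifname);
        if (!dev) {
                rtnl_unlock();
                return -ENODEV;
        }
        /* Preserve the other flags, only add IFF_UP. */
        err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
        rtnl_unlock();
        return err;
}
#endif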
6126
6127 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6128 {
6129         const struct net_device_ops *ops = dev->netdev_ops;
6130
6131         if (ops->ndo_change_mtu)
6132                 return ops->ndo_change_mtu(dev, new_mtu);
6133
6134         /* Pairs with all the lockless reads of dev->mtu in the stack */
6135         WRITE_ONCE(dev->mtu, new_mtu);
6136         return 0;
6137 }
6138
6139 /**
6140  *      dev_set_mtu - Change maximum transfer unit
6141  *      @dev: device
6142  *      @new_mtu: new transfer unit
6143  *
6144  *      Change the maximum transfer size of the network device.
6145  */
6146 int dev_set_mtu(struct net_device *dev, int new_mtu)
6147 {
6148         int err, orig_mtu;
6149
6150         if (new_mtu == dev->mtu)
6151                 return 0;
6152
6153         /*      MTU must not be negative.       */
6154         if (new_mtu < 0)
6155                 return -EINVAL;
6156
6157         if (!netif_device_present(dev))
6158                 return -ENODEV;
6159
6160         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6161         err = notifier_to_errno(err);
6162         if (err)
6163                 return err;
6164
6165         orig_mtu = dev->mtu;
6166         err = __dev_set_mtu(dev, new_mtu);
6167
6168         if (!err) {
6169                 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
6170                                                    orig_mtu);
6171                 err = notifier_to_errno(err);
6172                 if (err) {
6173                         /* setting mtu back and notifying everyone again,
6174                          * so that they have a chance to revert changes.
6175                          */
6176                         __dev_set_mtu(dev, orig_mtu);
6177                         call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
6178                                                      new_mtu);
6179                 }
6180         }
6181         return err;
6182 }
6183 EXPORT_SYMBOL(dev_set_mtu);
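
/*
 * Illustrative sketch (not part of this file): dev_set_mtu() is the entry
 * point used when an MTU change originates inside the kernel, e.g. a tunnel
 * or bonding driver reacting to its lower device.  The helper name and the
 * 1280-byte IPv6 minimum used as the example condition are assumptions, not
 * something this file mandates.
 */
#if 0
static int example_raise_mtu_for_ipv6(struct net_device *dev)
{
        ASSERT_RTNL();

        /* Raise the MTU to the IPv6 minimum if it is currently smaller. */
        if (dev->mtu >= 1280)
                return 0;
        return dev_set_mtu(dev, 1280);
}
#endif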
6184
6185 /**
6186  *      dev_set_group - Change group this device belongs to
6187  *      @dev: device
6188  *      @new_group: group this device should belong to
6189  */
6190 void dev_set_group(struct net_device *dev, int new_group)
6191 {
6192         dev->group = new_group;
6193 }
6194 EXPORT_SYMBOL(dev_set_group);
6195
6196 /**
6197  *      dev_set_mac_address - Change Media Access Control Address
6198  *      @dev: device
6199  *      @sa: new address
6200  *
6201  *      Change the hardware (MAC) address of the device
6202  */
6203 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6204 {
6205         const struct net_device_ops *ops = dev->netdev_ops;
6206         int err;
6207
6208         if (!ops->ndo_set_mac_address)
6209                 return -EOPNOTSUPP;
6210         if (sa->sa_family != dev->type)
6211                 return -EINVAL;
6212         if (!netif_device_present(dev))
6213                 return -ENODEV;
6214         err = ops->ndo_set_mac_address(dev, sa);
6215         if (err)
6216                 return err;
6217         dev->addr_assign_type = NET_ADDR_SET;
6218         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6219         add_device_randomness(dev->dev_addr, dev->addr_len);
6220         return 0;
6221 }
6222 EXPORT_SYMBOL(dev_set_mac_address);
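
/*
 * Illustrative sketch (not part of this file): programming a new hardware
 * address through dev_set_mac_address().  The sockaddr must carry the
 * device's own address family (dev->type, ARPHRD_ETHER for Ethernet) and at
 * least dev->addr_len valid bytes.  The helper name and its caller-supplied
 * address are assumptions for the example.
 */
#if 0
static int example_set_hw_addr(struct net_device *dev, const u8 *addr)
{
        struct sockaddr sa;

        if (dev->addr_len > sizeof(sa.sa_data))
                return -EINVAL;

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, addr, dev->addr_len);

        ASSERT_RTNL();
        return dev_set_mac_address(dev, &sa);
}
#endif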
6223
6224 /**
6225  *      dev_change_carrier - Change device carrier
6226  *      @dev: device
6227  *      @new_carrier: new value
6228  *
6229  *      Change device carrier
6230  */
6231 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6232 {
6233         const struct net_device_ops *ops = dev->netdev_ops;
6234
6235         if (!ops->ndo_change_carrier)
6236                 return -EOPNOTSUPP;
6237         if (!netif_device_present(dev))
6238                 return -ENODEV;
6239         return ops->ndo_change_carrier(dev, new_carrier);
6240 }
6241 EXPORT_SYMBOL(dev_change_carrier);
6242
6243 /**
6244  *      dev_get_phys_port_id - Get device physical port ID
6245  *      @dev: device
6246  *      @ppid: port ID
6247  *
6248  *      Get device physical port ID
6249  */
6250 int dev_get_phys_port_id(struct net_device *dev,
6251                          struct netdev_phys_item_id *ppid)
6252 {
6253         const struct net_device_ops *ops = dev->netdev_ops;
6254
6255         if (!ops->ndo_get_phys_port_id)
6256                 return -EOPNOTSUPP;
6257         return ops->ndo_get_phys_port_id(dev, ppid);
6258 }
6259 EXPORT_SYMBOL(dev_get_phys_port_id);
6260
6261 /**
6262  *      dev_get_phys_port_name - Get device physical port name
6263  *      @dev: device
6264  *      @name: port name
      *      @len: limit of bytes to copy to @name
6265  *
6266  *      Get device physical port name
6267  */
6268 int dev_get_phys_port_name(struct net_device *dev,
6269                            char *name, size_t len)
6270 {
6271         const struct net_device_ops *ops = dev->netdev_ops;
6272
6273         if (!ops->ndo_get_phys_port_name)
6274                 return -EOPNOTSUPP;
6275         return ops->ndo_get_phys_port_name(dev, name, len);
6276 }
6277 EXPORT_SYMBOL(dev_get_phys_port_name);
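
/*
 * Illustrative sketch (not part of this file): a minimal driver-side
 * ndo_get_phys_port_name() implementation, as invoked by the wrapper above.
 * The private structure and its port_id field are assumptions; real drivers
 * derive the name from their own port numbering.
 */
#if 0
struct example_port_priv {
        u32 port_id;
};

static int example_get_phys_port_name(struct net_device *dev,
                                      char *name, size_t len)
{
        struct example_port_priv *priv = netdev_priv(dev);
        int n;

        n = snprintf(name, len, "p%u", priv->port_id);
        if (n < 0 || (size_t)n >= len)
                return -EINVAL;
        return 0;
}
#endif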
6278
6279 /**
6280  *      dev_change_proto_down - update protocol port state information
6281  *      @dev: device
6282  *      @proto_down: new value
6283  *
6284  *      This info can be used by switch drivers to set the phys state of the
6285  *      port.
6286  */
6287 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6288 {
6289         const struct net_device_ops *ops = dev->netdev_ops;
6290
6291         if (!ops->ndo_change_proto_down)
6292                 return -EOPNOTSUPP;
6293         if (!netif_device_present(dev))
6294                 return -ENODEV;
6295         return ops->ndo_change_proto_down(dev, proto_down);
6296 }
6297 EXPORT_SYMBOL(dev_change_proto_down);
6298
6299 /**
6300  *      dev_new_index   -       allocate an ifindex
6301  *      @net: the applicable net namespace
6302  *
6303  *      Returns a suitable unique value for a new device interface
6304  *      number.  The caller must hold the rtnl semaphore or the
6305  *      dev_base_lock to be sure it remains unique.
6306  */
6307 static int dev_new_index(struct net *net)
6308 {
6309         int ifindex = net->ifindex;
6310         for (;;) {
6311                 if (++ifindex <= 0)
6312                         ifindex = 1;
6313                 if (!__dev_get_by_index(net, ifindex))
6314                         return net->ifindex = ifindex;
6315         }
6316 }
6317
6318 /* Delayed registration/unregisteration */
6319 static LIST_HEAD(net_todo_list);
6320 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6321
6322 static void net_set_todo(struct net_device *dev)
6323 {
6324         list_add_tail(&dev->todo_list, &net_todo_list);
6325         dev_net(dev)->dev_unreg_count++;
6326 }
6327
6328 static void rollback_registered_many(struct list_head *head)
6329 {
6330         struct net_device *dev, *tmp;
6331         LIST_HEAD(close_head);
6332
6333         BUG_ON(dev_boot_phase);
6334         ASSERT_RTNL();
6335
6336         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6337                 /* Some devices call this without ever having been
6338                  * registered, to unwind a failed initialization. Remove
6339                  * those devices and proceed with the remaining ones.
6340                  */
6341                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6342                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6343                                  dev->name, dev);
6344
6345                         WARN_ON(1);
6346                         list_del(&dev->unreg_list);
6347                         continue;
6348                 }
6349                 dev->dismantle = true;
6350                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6351         }
6352
6353         /* If device is running, close it first. */
6354         list_for_each_entry(dev, head, unreg_list)
6355                 list_add_tail(&dev->close_list, &close_head);
6356         dev_close_many(&close_head, true);
6357
6358         list_for_each_entry(dev, head, unreg_list) {
6359                 /* And unlink it from device chain. */
6360                 unlist_netdevice(dev);
6361
6362                 dev->reg_state = NETREG_UNREGISTERING;
6363                 on_each_cpu(flush_backlog, dev, 1);
6364         }
6365
6366         synchronize_net();
6367
6368         list_for_each_entry(dev, head, unreg_list) {
6369                 struct sk_buff *skb = NULL;
6370
6371                 /* Shutdown queueing discipline. */
6372                 dev_shutdown(dev);
6373
6374
6375                 /* Notify protocols that we are about to destroy
6376                    this device; they should clean up all of their state.
6377                 */
6378                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6379
6380                 if (!dev->rtnl_link_ops ||
6381                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6382                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6383                                                      GFP_KERNEL);
6384
6385                 /*
6386                  *      Flush the unicast and multicast chains
6387                  */
6388                 dev_uc_flush(dev);
6389                 dev_mc_flush(dev);
6390
6391                 if (dev->netdev_ops->ndo_uninit)
6392                         dev->netdev_ops->ndo_uninit(dev);
6393
6394                 if (skb)
6395                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6396
6397                 /* Notifier chain MUST detach us all upper devices. */
6398                 WARN_ON(netdev_has_any_upper_dev(dev));
6399
6400                 /* Remove entries from kobject tree */
6401                 netdev_unregister_kobject(dev);
6402 #ifdef CONFIG_XPS
6403                 /* Remove XPS queueing entries */
6404                 netif_reset_xps_queues_gt(dev, 0);
6405 #endif
6406         }
6407
6408         synchronize_net();
6409
6410         list_for_each_entry(dev, head, unreg_list)
6411                 dev_put(dev);
6412 }
6413
6414 static void rollback_registered(struct net_device *dev)
6415 {
6416         LIST_HEAD(single);
6417
6418         list_add(&dev->unreg_list, &single);
6419         rollback_registered_many(&single);
6420         list_del(&single);
6421 }
6422
6423 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6424         struct net_device *upper, netdev_features_t features)
6425 {
6426         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6427         netdev_features_t feature;
6428         int feature_bit;
6429
6430         for_each_netdev_feature(upper_disables, feature_bit) {
6431                 feature = __NETIF_F_BIT(feature_bit);
6432                 if (!(upper->wanted_features & feature)
6433                     && (features & feature)) {
6434                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6435                                    &feature, upper->name);
6436                         features &= ~feature;
6437                 }
6438         }
6439
6440         return features;
6441 }
6442
6443 static void netdev_sync_lower_features(struct net_device *upper,
6444         struct net_device *lower, netdev_features_t features)
6445 {
6446         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6447         netdev_features_t feature;
6448         int feature_bit;
6449
6450         for_each_netdev_feature(upper_disables, feature_bit) {
6451                 feature = __NETIF_F_BIT(feature_bit);
6452                 if (!(features & feature) && (lower->features & feature)) {
6453                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6454                                    &feature, lower->name);
6455                         lower->wanted_features &= ~feature;
6456                         __netdev_update_features(lower);
6457
6458                         if (unlikely(lower->features & feature))
6459                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6460                                             &feature, lower->name);
6461                         else
6462                                 netdev_features_change(lower);
6463                 }
6464         }
6465 }
6466
6467 static netdev_features_t netdev_fix_features(struct net_device *dev,
6468         netdev_features_t features)
6469 {
6470         /* Fix illegal checksum combinations */
6471         if ((features & NETIF_F_HW_CSUM) &&
6472             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6473                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6474                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6475         }
6476
6477         /* TSO requires that SG is present as well. */
6478         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6479                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6480                 features &= ~NETIF_F_ALL_TSO;
6481         }
6482
6483         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6484                                         !(features & NETIF_F_IP_CSUM)) {
6485                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6486                 features &= ~NETIF_F_TSO;
6487                 features &= ~NETIF_F_TSO_ECN;
6488         }
6489
6490         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6491                                          !(features & NETIF_F_IPV6_CSUM)) {
6492                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6493                 features &= ~NETIF_F_TSO6;
6494         }
6495
6496         /* TSO ECN requires that TSO is present as well. */
6497         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6498                 features &= ~NETIF_F_TSO_ECN;
6499
6500         /* Software GSO depends on SG. */
6501         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6502                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6503                 features &= ~NETIF_F_GSO;
6504         }
6505
6506         /* UFO needs SG and checksumming */
6507         if (features & NETIF_F_UFO) {
6508                 /* maybe split UFO into V4 and V6? */
6509                 if (!((features & NETIF_F_GEN_CSUM) ||
6510                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6511                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6512                         netdev_dbg(dev,
6513                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6514                         features &= ~NETIF_F_UFO;
6515                 }
6516
6517                 if (!(features & NETIF_F_SG)) {
6518                         netdev_dbg(dev,
6519                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6520                         features &= ~NETIF_F_UFO;
6521                 }
6522         }
6523
6524 #ifdef CONFIG_NET_RX_BUSY_POLL
6525         if (dev->netdev_ops->ndo_busy_poll)
6526                 features |= NETIF_F_BUSY_POLL;
6527         else
6528 #endif
6529                 features &= ~NETIF_F_BUSY_POLL;
6530
6531         return features;
6532 }
6533
6534 int __netdev_update_features(struct net_device *dev)
6535 {
6536         struct net_device *upper, *lower;
6537         netdev_features_t features;
6538         struct list_head *iter;
6539         int err = -1;
6540
6541         ASSERT_RTNL();
6542
6543         features = netdev_get_wanted_features(dev);
6544
6545         if (dev->netdev_ops->ndo_fix_features)
6546                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6547
6548         /* driver might be less strict about feature dependencies */
6549         features = netdev_fix_features(dev, features);
6550
6551         /* some features can't be enabled if they're off on an upper device */
6552         netdev_for_each_upper_dev_rcu(dev, upper, iter)
6553                 features = netdev_sync_upper_features(dev, upper, features);
6554
6555         if (dev->features == features)
6556                 goto sync_lower;
6557
6558         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6559                 &dev->features, &features);
6560
6561         if (dev->netdev_ops->ndo_set_features)
6562                 err = dev->netdev_ops->ndo_set_features(dev, features);
6563         else
6564                 err = 0;
6565
6566         if (unlikely(err < 0)) {
6567                 netdev_err(dev,
6568                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6569                         err, &features, &dev->features);
6570                 /* return non-0 since some features might have changed and
6571                  * it's better to fire a spurious notification than miss it
6572                  */
6573                 return -1;
6574         }
6575
6576 sync_lower:
6577         /* some features must be disabled on lower devices when disabled
6578          * on an upper device (think: bonding master or bridge)
6579          */
6580         netdev_for_each_lower_dev(dev, lower, iter)
6581                 netdev_sync_lower_features(dev, lower, features);
6582
6583         if (!err)
6584                 dev->features = features;
6585
6586         return err < 0 ? 0 : 1;
6587 }
6588
6589 /**
6590  *      netdev_update_features - recalculate device features
6591  *      @dev: the device to check
6592  *
6593  *      Recalculate dev->features set and send notifications if it
6594  *      has changed. Should be called after driver or hardware dependent
6595  *      conditions might have changed that influence the features.
6596  */
6597 void netdev_update_features(struct net_device *dev)
6598 {
6599         if (__netdev_update_features(dev))
6600                 netdev_features_change(dev);
6601 }
6602 EXPORT_SYMBOL(netdev_update_features);
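
/*
 * Illustrative sketch (not part of this file): the usual driver pattern
 * around the feature machinery above.  A driver masks out what its hardware
 * cannot currently do in ndo_fix_features() (called from
 * __netdev_update_features()) and calls netdev_update_features() whenever
 * that constraint changes.  The private structure, the vlan_table_full
 * condition and the helper names are invented for the example.
 */
#if 0
struct example_priv {
        bool vlan_table_full;           /* hypothetical hardware state */
};

static netdev_features_t example_fix_features(struct net_device *dev,
                                              netdev_features_t features)
{
        struct example_priv *priv = netdev_priv(dev);

        /* Hypothetical constraint: no VLAN filtering once the table is full. */
        if (priv->vlan_table_full)
                features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;

        return features;
}

/* Called by the driver whenever priv->vlan_table_full changes. */
static void example_hw_state_changed(struct net_device *dev)
{
        rtnl_lock();
        netdev_update_features(dev);
        rtnl_unlock();
}
#endif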
6603
6604 /**
6605  *      netdev_change_features - recalculate device features
6606  *      @dev: the device to check
6607  *
6608  *      Recalculate dev->features set and send notifications even
6609  *      if they have not changed. Should be called instead of
6610  *      netdev_update_features() if also dev->vlan_features might
6611  *      have changed to allow the changes to be propagated to stacked
6612  *      VLAN devices.
6613  */
6614 void netdev_change_features(struct net_device *dev)
6615 {
6616         __netdev_update_features(dev);
6617         netdev_features_change(dev);
6618 }
6619 EXPORT_SYMBOL(netdev_change_features);
6620
6621 /**
6622  *      netif_stacked_transfer_operstate -      transfer operstate
6623  *      @rootdev: the root or lower level device to transfer state from
6624  *      @dev: the device to transfer operstate to
6625  *
6626  *      Transfer operational state from root to device. This is normally
6627  *      called when a stacking relationship exists between the root
6628  *      device and the device (a leaf device).
6629  */
6630 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6631                                         struct net_device *dev)
6632 {
6633         if (rootdev->operstate == IF_OPER_DORMANT)
6634                 netif_dormant_on(dev);
6635         else
6636                 netif_dormant_off(dev);
6637
6638         if (netif_carrier_ok(rootdev)) {
6639                 if (!netif_carrier_ok(dev))
6640                         netif_carrier_on(dev);
6641         } else {
6642                 if (netif_carrier_ok(dev))
6643                         netif_carrier_off(dev);
6644         }
6645 }
6646 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6647
6648 #ifdef CONFIG_SYSFS
6649 static int netif_alloc_rx_queues(struct net_device *dev)
6650 {
6651         unsigned int i, count = dev->num_rx_queues;
6652         struct netdev_rx_queue *rx;
6653         size_t sz = count * sizeof(*rx);
6654
6655         BUG_ON(count < 1);
6656
6657         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6658         if (!rx) {
6659                 rx = vzalloc(sz);
6660                 if (!rx)
6661                         return -ENOMEM;
6662         }
6663         dev->_rx = rx;
6664
6665         for (i = 0; i < count; i++)
6666                 rx[i].dev = dev;
6667         return 0;
6668 }
6669 #endif
6670
6671 static void netdev_init_one_queue(struct net_device *dev,
6672                                   struct netdev_queue *queue, void *_unused)
6673 {
6674         /* Initialize queue lock */
6675         spin_lock_init(&queue->_xmit_lock);
6676         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6677         queue->xmit_lock_owner = -1;
6678         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6679         queue->dev = dev;
6680 #ifdef CONFIG_BQL
6681         dql_init(&queue->dql, HZ);
6682 #endif
6683 }
6684
6685 static void netif_free_tx_queues(struct net_device *dev)
6686 {
6687         kvfree(dev->_tx);
6688 }
6689
6690 static int netif_alloc_netdev_queues(struct net_device *dev)
6691 {
6692         unsigned int count = dev->num_tx_queues;
6693         struct netdev_queue *tx;
6694         size_t sz = count * sizeof(*tx);
6695
6696         if (count < 1 || count > 0xffff)
6697                 return -EINVAL;
6698
6699         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6700         if (!tx) {
6701                 tx = vzalloc(sz);
6702                 if (!tx)
6703                         return -ENOMEM;
6704         }
6705         dev->_tx = tx;
6706
6707         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6708         spin_lock_init(&dev->tx_global_lock);
6709
6710         return 0;
6711 }
6712
6713 void netif_tx_stop_all_queues(struct net_device *dev)
6714 {
6715         unsigned int i;
6716
6717         for (i = 0; i < dev->num_tx_queues; i++) {
6718                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6719                 netif_tx_stop_queue(txq);
6720         }
6721 }
6722 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6723
6724 /**
6725  *      register_netdevice      - register a network device
6726  *      @dev: device to register
6727  *
6728  *      Take a completed network device structure and add it to the kernel
6729  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6730  *      chain. 0 is returned on success. A negative errno code is returned
6731  *      on a failure to set up the device, or if the name is a duplicate.
6732  *
6733  *      Callers must hold the rtnl semaphore. You may want
6734  *      register_netdev() instead of this.
6735  *
6736  *      BUGS:
6737  *      The locking appears insufficient to guarantee two parallel registers
6738  *      will not get the same name.
6739  */
6740
6741 int register_netdevice(struct net_device *dev)
6742 {
6743         int ret;
6744         struct net *net = dev_net(dev);
6745
6746         BUG_ON(dev_boot_phase);
6747         ASSERT_RTNL();
6748
6749         might_sleep();
6750
6751         /* When net_devices are persistent, this will be fatal. */
6752         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6753         BUG_ON(!net);
6754
6755         spin_lock_init(&dev->addr_list_lock);
6756         netdev_set_addr_lockdep_class(dev);
6757
6758         ret = dev_get_valid_name(net, dev, dev->name);
6759         if (ret < 0)
6760                 goto out;
6761
6762         /* Init, if this function is available */
6763         if (dev->netdev_ops->ndo_init) {
6764                 ret = dev->netdev_ops->ndo_init(dev);
6765                 if (ret) {
6766                         if (ret > 0)
6767                                 ret = -EIO;
6768                         goto out;
6769                 }
6770         }
6771
6772         if (((dev->hw_features | dev->features) &
6773              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6774             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6775              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6776                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6777                 ret = -EINVAL;
6778                 goto err_uninit;
6779         }
6780
6781         ret = -EBUSY;
6782         if (!dev->ifindex)
6783                 dev->ifindex = dev_new_index(net);
6784         else if (__dev_get_by_index(net, dev->ifindex))
6785                 goto err_uninit;
6786
6787         /* Transfer changeable features to wanted_features and enable
6788          * software offloads (GSO and GRO).
6789          */
6790         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6791         dev->features |= NETIF_F_SOFT_FEATURES;
6792         dev->wanted_features = dev->features & dev->hw_features;
6793
6794         if (!(dev->flags & IFF_LOOPBACK)) {
6795                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6796         }
6797
6798         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6799          */
6800         dev->vlan_features |= NETIF_F_HIGHDMA;
6801
6802         /* Make NETIF_F_SG inheritable to tunnel devices.
6803          */
6804         dev->hw_enc_features |= NETIF_F_SG;
6805
6806         /* Make NETIF_F_SG inheritable to MPLS.
6807          */
6808         dev->mpls_features |= NETIF_F_SG;
6809
6810         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6811         ret = notifier_to_errno(ret);
6812         if (ret)
6813                 goto err_uninit;
6814
6815         ret = netdev_register_kobject(dev);
6816         if (ret)
6817                 goto err_uninit;
6818         dev->reg_state = NETREG_REGISTERED;
6819
6820         __netdev_update_features(dev);
6821
6822         /*
6823          *      Default initial state at registration is that the
6824          *      device is present.
6825          */
6826
6827         set_bit(__LINK_STATE_PRESENT, &dev->state);
6828
6829         linkwatch_init_dev(dev);
6830
6831         dev_init_scheduler(dev);
6832         dev_hold(dev);
6833         list_netdevice(dev);
6834         add_device_randomness(dev->dev_addr, dev->addr_len);
6835
6836         /* If the device has a permanent hardware address, the driver should
6837          * set dev_addr and also addr_assign_type should be set to
6838          * NET_ADDR_PERM (default value).
6839          */
6840         if (dev->addr_assign_type == NET_ADDR_PERM)
6841                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6842
6843         /* Notify protocols, that a new device appeared. */
6844         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6845         ret = notifier_to_errno(ret);
6846         if (ret) {
6847                 rollback_registered(dev);
6848                 rcu_barrier();
6849
6850                 dev->reg_state = NETREG_UNREGISTERED;
6851                 /* We should put the kobject that is left held after
6852                  * netdev_unregister_kobject(); otherwise
6853                  * the net device cannot be freed when the
6854                  * driver calls free_netdev(), because the
6855                  * kobject is still being held.
6856                  */
6857                 kobject_put(&dev->dev.kobj);
6858         }
6859         /*
6860          *      Prevent userspace races by waiting until the network
6861          *      device is fully setup before sending notifications.
6862          */
6863         if (!dev->rtnl_link_ops ||
6864             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6865                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6866
6867 out:
6868         return ret;
6869
6870 err_uninit:
6871         if (dev->netdev_ops->ndo_uninit)
6872                 dev->netdev_ops->ndo_uninit(dev);
6873         goto out;
6874 }
6875 EXPORT_SYMBOL(register_netdevice);
6876
6877 /**
6878  *      init_dummy_netdev       - init a dummy network device for NAPI
6879  *      @dev: device to init
6880  *
6881  *      This takes a network device structure and initializes the minimum
6882  *      number of fields so it can be used to schedule NAPI polls without
6883  *      registering a full blown interface. This is to be used by drivers
6884  *      that need to tie several hardware interfaces to a single NAPI
6885  *      poll scheduler due to HW limitations.
6886  */
6887 int init_dummy_netdev(struct net_device *dev)
6888 {
6889         /* Clear everything. Note we don't initialize spinlocks
6890          * as they aren't supposed to be taken by any of the
6891          * NAPI code and this dummy netdev is supposed to be
6892          * only ever used for NAPI polls.
6893          */
6894         memset(dev, 0, sizeof(struct net_device));
6895
6896         /* make sure we BUG if trying to hit standard
6897          * register/unregister code path
6898          */
6899         dev->reg_state = NETREG_DUMMY;
6900
6901         /* NAPI wants this */
6902         INIT_LIST_HEAD(&dev->napi_list);
6903
6904         /* a dummy interface is started by default */
6905         set_bit(__LINK_STATE_PRESENT, &dev->state);
6906         set_bit(__LINK_STATE_START, &dev->state);
6907
6908         /* Note: We don't allocate pcpu_refcnt for dummy devices,
6909          * because users of this 'device' don't need to change
6910          * its refcount.
6911          */
6912
6913         return 0;
6914 }
6915 EXPORT_SYMBOL_GPL(init_dummy_netdev);
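
/*
 * Illustrative sketch (not part of this file): the intended use of
 * init_dummy_netdev().  A driver whose hardware exposes several interrupt
 * queues but only one real net_device can hang its NAPI contexts off a
 * dummy device.  Structure and function names are assumptions;
 * netif_napi_add()/napi_enable() are the regular NAPI registration calls.
 */
#if 0
struct example_adapter {
        struct net_device dummy;        /* never registered, NAPI only */
        struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
        int work_done = 0;

        /* ... process up to @budget received packets here ... */

        if (work_done < budget)
                napi_complete(napi);
        return work_done;
}

static void example_setup_napi(struct example_adapter *adapter)
{
        init_dummy_netdev(&adapter->dummy);
        netif_napi_add(&adapter->dummy, &adapter->napi, example_poll,
                       NAPI_POLL_WEIGHT);
        napi_enable(&adapter->napi);
}
#endif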
6916
6917
6918 /**
6919  *      register_netdev - register a network device
6920  *      @dev: device to register
6921  *
6922  *      Take a completed network device structure and add it to the kernel
6923  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6924  *      chain. 0 is returned on success. A negative errno code is returned
6925  *      on a failure to set up the device, or if the name is a duplicate.
6926  *
6927  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6928  *      and expands the device name if you passed a format string to
6929  *      alloc_netdev.
6930  */
6931 int register_netdev(struct net_device *dev)
6932 {
6933         int err;
6934
6935         rtnl_lock();
6936         err = register_netdevice(dev);
6937         rtnl_unlock();
6938         return err;
6939 }
6940 EXPORT_SYMBOL(register_netdev);
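
/*
 * Illustrative sketch (not part of this file): the classic probe-time
 * sequence built on register_netdev(), which takes and drops RTNL itself.
 * Everything named "example_*" is invented; alloc_etherdev() and
 * eth_hw_addr_random() are the usual helpers from <linux/etherdevice.h>.
 */
#if 0
struct example_priv {
        void __iomem *regs;
};

static const struct net_device_ops example_netdev_ops = {
        /* normally filled with .ndo_open, .ndo_stop, .ndo_start_xmit, ... */
};

static int example_probe(struct device *parent)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct example_priv));
        if (!dev)
                return -ENOMEM;

        SET_NETDEV_DEV(dev, parent);
        dev->netdev_ops = &example_netdev_ops;
        eth_hw_addr_random(dev);

        err = register_netdev(dev);
        if (err)
                free_netdev(dev);
        return err;
}
#endif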
6941
6942 int netdev_refcnt_read(const struct net_device *dev)
6943 {
6944         int i, refcnt = 0;
6945
6946         for_each_possible_cpu(i)
6947                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6948         return refcnt;
6949 }
6950 EXPORT_SYMBOL(netdev_refcnt_read);
6951
6952 /**
6953  * netdev_wait_allrefs - wait until all references are gone.
6954  * @dev: target net_device
6955  *
6956  * This is called when unregistering network devices.
6957  *
6958  * Any protocol or device that holds a reference should register
6959  * for netdevice notification, and cleanup and put back the
6960  * for netdevice notification, and clean up and put back the
6961  * We can get stuck here if buggy protocols don't correctly
6962  * call dev_put.
6963  */
6964 static void netdev_wait_allrefs(struct net_device *dev)
6965 {
6966         unsigned long rebroadcast_time, warning_time;
6967         int refcnt;
6968
6969         linkwatch_forget_dev(dev);
6970
6971         rebroadcast_time = warning_time = jiffies;
6972         refcnt = netdev_refcnt_read(dev);
6973
6974         while (refcnt != 0) {
6975                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6976                         rtnl_lock();
6977
6978                         /* Rebroadcast unregister notification */
6979                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6980
6981                         __rtnl_unlock();
6982                         rcu_barrier();
6983                         rtnl_lock();
6984
6985                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6986                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6987                                      &dev->state)) {
6988                                 /* We must not have linkwatch events
6989                                  * pending on unregister. If this
6990                                  * happens, we simply run the queue
6991                                  * unscheduled, resulting in a noop
6992                                  * for this device.
6993                                  */
6994                                 linkwatch_run_queue();
6995                         }
6996
6997                         __rtnl_unlock();
6998
6999                         rebroadcast_time = jiffies;
7000                 }
7001
7002                 msleep(250);
7003
7004                 refcnt = netdev_refcnt_read(dev);
7005
7006                 if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
7007                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7008                                  dev->name, refcnt);
7009                         warning_time = jiffies;
7010                 }
7011         }
7012 }
7013
7014 /* The sequence is:
7015  *
7016  *      rtnl_lock();
7017  *      ...
7018  *      register_netdevice(x1);
7019  *      register_netdevice(x2);
7020  *      ...
7021  *      unregister_netdevice(y1);
7022  *      unregister_netdevice(y2);
7023  *      ...
7024  *      rtnl_unlock();
7025  *      free_netdev(y1);
7026  *      free_netdev(y2);
7027  *
7028  * We are invoked by rtnl_unlock().
7029  * This allows us to deal with problems:
7030  * 1) We can delete sysfs objects which invoke hotplug
7031  *    without deadlocking with linkwatch via keventd.
7032  * 2) Since we run with the RTNL semaphore not held, we can sleep
7033  *    safely in order to wait for the netdev refcnt to drop to zero.
7034  *
7035  * We must not return until all unregister events added during
7036  * the interval the lock was held have been completed.
7037  */
7038 void netdev_run_todo(void)
7039 {
7040         struct list_head list;
7041
7042         /* Snapshot list, allow later requests */
7043         list_replace_init(&net_todo_list, &list);
7044
7045         __rtnl_unlock();
7046
7047
7048         /* Wait for rcu callbacks to finish before next phase */
7049         if (!list_empty(&list))
7050                 rcu_barrier();
7051
7052         while (!list_empty(&list)) {
7053                 struct net_device *dev
7054                         = list_first_entry(&list, struct net_device, todo_list);
7055                 list_del(&dev->todo_list);
7056
7057                 rtnl_lock();
7058                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7059                 __rtnl_unlock();
7060
7061                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7062                         pr_err("network todo '%s' but state %d\n",
7063                                dev->name, dev->reg_state);
7064                         dump_stack();
7065                         continue;
7066                 }
7067
7068                 dev->reg_state = NETREG_UNREGISTERED;
7069
7070                 netdev_wait_allrefs(dev);
7071
7072                 /* paranoia */
7073                 BUG_ON(netdev_refcnt_read(dev));
7074                 BUG_ON(!list_empty(&dev->ptype_all));
7075                 BUG_ON(!list_empty(&dev->ptype_specific));
7076                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7077                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7078                 WARN_ON(dev->dn_ptr);
7079
7080                 if (dev->destructor)
7081                         dev->destructor(dev);
7082
7083                 /* Report a network device has been unregistered */
7084                 rtnl_lock();
7085                 dev_net(dev)->dev_unreg_count--;
7086                 __rtnl_unlock();
7087                 wake_up(&netdev_unregistering_wq);
7088
7089                 /* Free network device */
7090                 kobject_put(&dev->dev.kobj);
7091         }
7092 }
7093
7094 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
7095  * fields in the same order, with only the type differing.
7096  */
7097 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7098                              const struct net_device_stats *netdev_stats)
7099 {
7100 #if BITS_PER_LONG == 64
7101         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
7102         memcpy(stats64, netdev_stats, sizeof(*stats64));
7103 #else
7104         size_t i, n = sizeof(*stats64) / sizeof(u64);
7105         const unsigned long *src = (const unsigned long *)netdev_stats;
7106         u64 *dst = (u64 *)stats64;
7107
7108         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
7109                      sizeof(*stats64) / sizeof(u64));
7110         for (i = 0; i < n; i++)
7111                 dst[i] = src[i];
7112 #endif
7113 }
7114 EXPORT_SYMBOL(netdev_stats_to_stats64);
7115
7116 /**
7117  *      dev_get_stats   - get network device statistics
7118  *      @dev: device to get statistics from
7119  *      @storage: place to store stats
7120  *
7121  *      Get network statistics from device. Return @storage.
7122  *      The device driver may provide its own method by setting
7123  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7124  *      dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
7125  */
7126 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7127                                         struct rtnl_link_stats64 *storage)
7128 {
7129         const struct net_device_ops *ops = dev->netdev_ops;
7130
7131         if (ops->ndo_get_stats64) {
7132                 memset(storage, 0, sizeof(*storage));
7133                 ops->ndo_get_stats64(dev, storage);
7134         } else if (ops->ndo_get_stats) {
7135                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7136         } else {
7137                 netdev_stats_to_stats64(storage, &dev->stats);
7138         }
7139         storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
7140         storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
7141         return storage;
7142 }
7143 EXPORT_SYMBOL(dev_get_stats);
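
/*
 * Illustrative sketch (not part of this file): reading aggregate statistics
 * with dev_get_stats().  @stats lives on the caller's stack and is fully
 * written by dev_get_stats(); the caller only has to keep @dev alive
 * (e.g. by holding RTNL or a reference).  The helper name and pr_info()
 * formatting are assumptions.
 */
#if 0
static void example_dump_stats(struct net_device *dev)
{
        struct rtnl_link_stats64 stats;

        dev_get_stats(dev, &stats);
        pr_info("%s: rx %llu pkts, tx %llu pkts, dropped rx %llu tx %llu\n",
                dev->name, stats.rx_packets, stats.tx_packets,
                stats.rx_dropped, stats.tx_dropped);
}
#endif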
7144
7145 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7146 {
7147         struct netdev_queue *queue = dev_ingress_queue(dev);
7148
7149 #ifdef CONFIG_NET_CLS_ACT
7150         if (queue)
7151                 return queue;
7152         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7153         if (!queue)
7154                 return NULL;
7155         netdev_init_one_queue(dev, queue, NULL);
7156         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7157         queue->qdisc_sleeping = &noop_qdisc;
7158         rcu_assign_pointer(dev->ingress_queue, queue);
7159 #endif
7160         return queue;
7161 }
7162
7163 static const struct ethtool_ops default_ethtool_ops;
7164
7165 void netdev_set_default_ethtool_ops(struct net_device *dev,
7166                                     const struct ethtool_ops *ops)
7167 {
7168         if (dev->ethtool_ops == &default_ethtool_ops)
7169                 dev->ethtool_ops = ops;
7170 }
7171 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7172
7173 void netdev_freemem(struct net_device *dev)
7174 {
7175         char *addr = (char *)dev - dev->padded;
7176
7177         kvfree(addr);
7178 }
7179
7180 /**
7181  *      alloc_netdev_mqs - allocate network device
7182  *      @sizeof_priv:           size of private data to allocate space for
7183  *      @name:                  device name format string
7184  *      @name_assign_type:      origin of device name
7185  *      @setup:                 callback to initialize device
7186  *      @txqs:                  the number of TX subqueues to allocate
7187  *      @rxqs:                  the number of RX subqueues to allocate
7188  *
7189  *      Allocates a struct net_device with private data area for driver use
7190  *      and performs basic initialization.  Also allocates subqueue structs
7191  *      for each queue on the device.
7192  */
7193 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7194                 unsigned char name_assign_type,
7195                 void (*setup)(struct net_device *),
7196                 unsigned int txqs, unsigned int rxqs)
7197 {
7198         struct net_device *dev;
7199         size_t alloc_size;
7200         struct net_device *p;
7201
7202         BUG_ON(strlen(name) >= sizeof(dev->name));
7203
7204         if (txqs < 1) {
7205                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7206                 return NULL;
7207         }
7208
7209 #ifdef CONFIG_SYSFS
7210         if (rxqs < 1) {
7211                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7212                 return NULL;
7213         }
7214 #endif
7215
7216         alloc_size = sizeof(struct net_device);
7217         if (sizeof_priv) {
7218                 /* ensure 32-byte alignment of private area */
7219                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7220                 alloc_size += sizeof_priv;
7221         }
7222         /* ensure 32-byte alignment of whole construct */
7223         alloc_size += NETDEV_ALIGN - 1;
7224
7225         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7226         if (!p)
7227                 p = vzalloc(alloc_size);
7228         if (!p)
7229                 return NULL;
7230
7231         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7232         dev->padded = (char *)dev - (char *)p;
7233
7234         dev->pcpu_refcnt = alloc_percpu(int);
7235         if (!dev->pcpu_refcnt)
7236                 goto free_dev;
7237
7238         if (dev_addr_init(dev))
7239                 goto free_pcpu;
7240
7241         dev_mc_init(dev);
7242         dev_uc_init(dev);
7243
7244         dev_net_set(dev, &init_net);
7245
7246         dev->gso_max_size = GSO_MAX_SIZE;
7247         dev->gso_max_segs = GSO_MAX_SEGS;
7248         dev->gso_min_segs = 0;
7249
7250         INIT_LIST_HEAD(&dev->napi_list);
7251         INIT_LIST_HEAD(&dev->unreg_list);
7252         INIT_LIST_HEAD(&dev->close_list);
7253         INIT_LIST_HEAD(&dev->link_watch_list);
7254         INIT_LIST_HEAD(&dev->adj_list.upper);
7255         INIT_LIST_HEAD(&dev->adj_list.lower);
7256         INIT_LIST_HEAD(&dev->all_adj_list.upper);
7257         INIT_LIST_HEAD(&dev->all_adj_list.lower);
7258         INIT_LIST_HEAD(&dev->ptype_all);
7259         INIT_LIST_HEAD(&dev->ptype_specific);
7260         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7261         setup(dev);
7262
7263         if (!dev->tx_queue_len) {
7264                 dev->priv_flags |= IFF_NO_QUEUE;
7265                 dev->tx_queue_len = 1;
7266         }
7267
7268         dev->num_tx_queues = txqs;
7269         dev->real_num_tx_queues = txqs;
7270         if (netif_alloc_netdev_queues(dev))
7271                 goto free_all;
7272
7273 #ifdef CONFIG_SYSFS
7274         dev->num_rx_queues = rxqs;
7275         dev->real_num_rx_queues = rxqs;
7276         if (netif_alloc_rx_queues(dev))
7277                 goto free_all;
7278 #endif
7279
7280         strcpy(dev->name, name);
7281         dev->name_assign_type = name_assign_type;
7282         dev->group = INIT_NETDEV_GROUP;
7283         if (!dev->ethtool_ops)
7284                 dev->ethtool_ops = &default_ethtool_ops;
7285
7286         nf_hook_ingress_init(dev);
7287
7288         return dev;
7289
7290 free_all:
7291         free_netdev(dev);
7292         return NULL;
7293
7294 free_pcpu:
7295         free_percpu(dev->pcpu_refcnt);
7296 free_dev:
7297         netdev_freemem(dev);
7298         return NULL;
7299 }
7300 EXPORT_SYMBOL(alloc_netdev_mqs);
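
/*
 * Illustrative sketch (not part of this file): allocating a multiqueue
 * device directly with alloc_netdev_mqs().  Note that the setup callback
 * runs before the TX/RX queue arrays are allocated (see setup(dev) above),
 * so per-queue work must wait until this call has returned.  The names,
 * the "exmp%d" template and the IFF_NOARP choice are invented.
 */
#if 0
static void example_setup(struct net_device *dev)
{
        ether_setup(dev);               /* Ethernet-style defaults */
        dev->flags |= IFF_NOARP;        /* hypothetical: no ARP on this link */
}

static struct net_device *example_alloc(unsigned int ntx, unsigned int nrx)
{
        /* No private area in this sketch, hence sizeof_priv == 0. */
        return alloc_netdev_mqs(0, "exmp%d", NET_NAME_ENUM,
                                example_setup, ntx, nrx);
}
#endif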
7301
7302 /**
7303  *      free_netdev - free network device
7304  *      @dev: device
7305  *
7306  *      This function does the last stage of destroying an allocated device
7307  *      interface. The reference to the device object is released.
7308  *      If this is the last reference then it will be freed.
7309  */
7310 void free_netdev(struct net_device *dev)
7311 {
7312         struct napi_struct *p, *n;
7313
7314         netif_free_tx_queues(dev);
7315 #ifdef CONFIG_SYSFS
7316         kvfree(dev->_rx);
7317 #endif
7318
7319         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7320
7321         /* Flush device addresses */
7322         dev_addr_flush(dev);
7323
7324         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7325                 netif_napi_del(p);
7326
7327         free_percpu(dev->pcpu_refcnt);
7328         dev->pcpu_refcnt = NULL;
7329
7330         /*  Compatibility with error handling in drivers */
7331         if (dev->reg_state == NETREG_UNINITIALIZED) {
7332                 netdev_freemem(dev);
7333                 return;
7334         }
7335
7336         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7337         dev->reg_state = NETREG_RELEASED;
7338
7339         /* will free via device release */
7340         put_device(&dev->dev);
7341 }
7342 EXPORT_SYMBOL(free_netdev);
7343
7344 /**
7345  *      synchronize_net -  Synchronize with packet receive processing
7346  *
7347  *      Wait for packets currently being received to be done.
7348  *      Does not block later packets from starting.
7349  */
7350 void synchronize_net(void)
7351 {
7352         might_sleep();
7353         if (rtnl_is_locked())
7354                 synchronize_rcu_expedited();
7355         else
7356                 synchronize_rcu();
7357 }
7358 EXPORT_SYMBOL(synchronize_net);
7359
7360 /**
7361  *      unregister_netdevice_queue - remove device from the kernel
7362  *      @dev: device
7363  *      @head: list
7364  *
7365  *      This function shuts down a device interface and removes it
7366  *      from the kernel tables.
7367  *      If head not NULL, device is queued to be unregistered later.
7368  *
7369  *      Callers must hold the rtnl semaphore.  You may want
7370  *      unregister_netdev() instead of this.
7371  */
7372
7373 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7374 {
7375         ASSERT_RTNL();
7376
7377         if (head) {
7378                 list_move_tail(&dev->unreg_list, head);
7379         } else {
7380                 rollback_registered(dev);
7381                 /* Finish processing unregister after unlock */
7382                 net_set_todo(dev);
7383         }
7384 }
7385 EXPORT_SYMBOL(unregister_netdevice_queue);
7386
7387 /**
7388  *      unregister_netdevice_many - unregister many devices
7389  *      @head: list of devices
7390  *
7391  *  Note: As most callers use a stack-allocated list_head,
7392  *  we force a list_del() to make sure the stack won't be corrupted later.
7393  */
7394 void unregister_netdevice_many(struct list_head *head)
7395 {
7396         struct net_device *dev;
7397
7398         if (!list_empty(head)) {
7399                 rollback_registered_many(head);
7400                 list_for_each_entry(dev, head, unreg_list)
7401                         net_set_todo(dev);
7402                 list_del(head);
7403         }
7404 }
7405 EXPORT_SYMBOL(unregister_netdevice_many);
7406
7407 /**
7408  *      unregister_netdev - remove device from the kernel
7409  *      @dev: device
7410  *
7411  *      This function shuts down a device interface and removes it
7412  *      from the kernel tables.
7413  *
7414  *      This is just a wrapper for unregister_netdevice that takes
7415  *      the rtnl semaphore.  In general you want to use this and not
7416  *      unregister_netdevice.
7417  */
7418 void unregister_netdev(struct net_device *dev)
7419 {
7420         rtnl_lock();
7421         unregister_netdevice(dev);
7422         rtnl_unlock();
7423 }
7424 EXPORT_SYMBOL(unregister_netdev);
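
/*
 * Illustrative sketch (not part of this file): the matching teardown for a
 * device registered with register_netdev().  unregister_netdev() takes RTNL
 * itself and, via netdev_run_todo(), waits until all references are gone;
 * only then is free_netdev() safe.  The helper name is an assumption.
 */
#if 0
static void example_remove(struct net_device *dev)
{
        unregister_netdev(dev);         /* returns once the refcount is zero */
        free_netdev(dev);               /* release the net_device itself */
}
#endif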
7425
7426 /**
7427  *      dev_change_net_namespace - move device to a different network namespace
7428  *      @dev: device
7429  *      @net: network namespace
7430  *      @pat: If not NULL, name pattern to try if the current device name
7431  *            is already taken in the destination network namespace.
7432  *
7433  *      This function shuts down a device interface and moves it
7434  *      to a new network namespace. On success 0 is returned, on
7435  *      a failure a negative errno code is returned.
7436  *
7437  *      Callers must hold the rtnl semaphore.
7438  */
7439
7440 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7441 {
7442         int err;
7443
7444         ASSERT_RTNL();
7445
7446         /* Don't allow namespace local devices to be moved. */
7447         err = -EINVAL;
7448         if (dev->features & NETIF_F_NETNS_LOCAL)
7449                 goto out;
7450
7451         /* Ensure the device has been registered */
7452         if (dev->reg_state != NETREG_REGISTERED)
7453                 goto out;
7454
7455         /* Get out if there is nothing to do */
7456         err = 0;
7457         if (net_eq(dev_net(dev), net))
7458                 goto out;
7459
7460         /* Pick the destination device name, and ensure
7461          * we can use it in the destination network namespace.
7462          */
7463         err = -EEXIST;
7464         if (__dev_get_by_name(net, dev->name)) {
7465                 /* We get here if we can't use the current device name */
7466                 if (!pat)
7467                         goto out;
7468                 err = dev_get_valid_name(net, dev, pat);
7469                 if (err < 0)
7470                         goto out;
7471         }
7472
7473         /*
7474          * And now a mini version of register_netdevice() and unregister_netdevice().
7475          */
7476
7477         /* If device is running close it first. */
7478         dev_close(dev);
7479
7480         /* And unlink it from device chain */
7481         unlist_netdevice(dev);
7482
7483         synchronize_net();
7484
7485         /* Shutdown queueing discipline. */
7486         dev_shutdown(dev);
7487
7488         /* Notify protocols that we are about to destroy
7489            this device; they should clean up all of their state.
7490
7491            Note that dev->reg_state stays at NETREG_REGISTERED.
7492            This is intentional: this way 8021q and macvlan know
7493            the device is just moving and can keep their slaves up.
7494         */
7495         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7496         rcu_barrier();
7497         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7498         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7499
7500         /*
7501          *      Flush the unicast and multicast chains
7502          */
7503         dev_uc_flush(dev);
7504         dev_mc_flush(dev);
7505
7506         /* Send a netdev-removed uevent to the old namespace */
7507         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7508         netdev_adjacent_del_links(dev);
7509
7510         /* Actually switch the network namespace */
7511         dev_net_set(dev, net);
7512
7513         /* If there is an ifindex conflict assign a new one */
7514         if (__dev_get_by_index(net, dev->ifindex))
7515                 dev->ifindex = dev_new_index(net);
7516
7517         /* Send a netdev-add uevent to the new namespace */
7518         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7519         netdev_adjacent_add_links(dev);
7520
7521         /* Fixup kobjects */
7522         err = device_rename(&dev->dev, dev->name);
7523         WARN_ON(err);
7524
7525         /* Add the device back in the hashes */
7526         list_netdevice(dev);
7527
7528         /* Notify protocols, that a new device appeared. */
7529         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7530
7531         /*
7532          *      Prevent userspace races by waiting until the network
7533          *      device is fully setup before sending notifications.
7534          */
7535         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7536
7537         synchronize_net();
7538         err = 0;
7539 out:
7540         return err;
7541 }
7542 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
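
/*
 * Illustrative sketch (not part of this file): moving a device into a
 * namespace identified by an open netns file descriptor (e.g. one obtained
 * from /proc/<pid>/ns/net).  get_net_ns_by_fd() and put_net() are the usual
 * namespace helpers; the helper name and the "eth%d" fallback pattern are
 * assumptions for the example.
 */
#if 0
static int example_move_to_netns(struct net_device *dev, int netns_fd)
{
        struct net *net;
        int err;

        net = get_net_ns_by_fd(netns_fd);
        if (IS_ERR(net))
                return PTR_ERR(net);

        rtnl_lock();
        /* Fall back to an "eth%d" name if the current one clashes in @net. */
        err = dev_change_net_namespace(dev, net, "eth%d");
        rtnl_unlock();

        put_net(net);
        return err;
}
#endif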
7543
7544 static int dev_cpu_callback(struct notifier_block *nfb,
7545                             unsigned long action,
7546                             void *ocpu)
7547 {
7548         struct sk_buff **list_skb;
7549         struct sk_buff *skb;
7550         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7551         struct softnet_data *sd, *oldsd;
7552
7553         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7554                 return NOTIFY_OK;
7555
7556         local_irq_disable();
7557         cpu = smp_processor_id();
7558         sd = &per_cpu(softnet_data, cpu);
7559         oldsd = &per_cpu(softnet_data, oldcpu);
7560
7561         /* Find end of our completion_queue. */
7562         list_skb = &sd->completion_queue;
7563         while (*list_skb)
7564                 list_skb = &(*list_skb)->next;
7565         /* Append completion queue from offline CPU. */
7566         *list_skb = oldsd->completion_queue;
7567         oldsd->completion_queue = NULL;
7568
7569         /* Append output queue from offline CPU. */
7570         if (oldsd->output_queue) {
7571                 *sd->output_queue_tailp = oldsd->output_queue;
7572                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7573                 oldsd->output_queue = NULL;
7574                 oldsd->output_queue_tailp = &oldsd->output_queue;
7575         }
7576         /* Append NAPI poll list from offline CPU, with one exception:
7577          * process_backlog() must be called by the CPU owning the percpu backlog.
7578          * We properly handle process_queue & input_pkt_queue later.
7579          */
7580         while (!list_empty(&oldsd->poll_list)) {
7581                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7582                                                             struct napi_struct,
7583                                                             poll_list);
7584
7585                 list_del_init(&napi->poll_list);
7586                 if (napi->poll == process_backlog)
7587                         napi->state = 0;
7588                 else
7589                         ____napi_schedule(sd, napi);
7590         }
7591
7592         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7593         local_irq_enable();
7594
7595         /* Process offline CPU's input_pkt_queue */
7596         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7597                 netif_rx_ni(skb);
7598                 input_queue_head_incr(oldsd);
7599         }
7600         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7601                 netif_rx_ni(skb);
7602                 input_queue_head_incr(oldsd);
7603         }
7604
7605         return NOTIFY_OK;
7606 }
7607
7608
7609 /**
7610  *      netdev_increment_features - increment feature set by one
7611  *      @all: current feature set
7612  *      @one: new feature set
7613  *      @mask: mask feature set
7614  *
7615  *      Computes a new feature set after adding a device with feature set
7616  *      @one to the master device with current feature set @all.  Will not
7617  *      enable anything that is off in @mask. Returns the new feature set.
7618  */
7619 netdev_features_t netdev_increment_features(netdev_features_t all,
7620         netdev_features_t one, netdev_features_t mask)
7621 {
7622         if (mask & NETIF_F_GEN_CSUM)
7623                 mask |= NETIF_F_ALL_CSUM;
7624         mask |= NETIF_F_VLAN_CHALLENGED;
7625
7626         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7627         all &= one | ~NETIF_F_ALL_FOR_ALL;
7628
7629         /* If one device supports hw checksumming, set for all. */
7630         if (all & NETIF_F_GEN_CSUM)
7631                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7632
7633         return all;
7634 }
7635 EXPORT_SYMBOL(netdev_increment_features);
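/* Illustrative use (a sketch, not a verbatim call site): a master driver
 * such as bridging or bonding recomputes its feature set by folding in
 * each slave's features one device at a time, e.g.
 *
 *	features = mask;
 *	list_for_each_entry(slave, &master->slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 *
 * where "master", "slave" and "slave_list" stand in for the caller's own
 * bookkeeping.
 */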
7636
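/* Allocate and initialise one name/ifindex hash table with
 * NETDEV_HASHENTRIES buckets; returns NULL on allocation failure.
 */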
7637 static struct hlist_head * __net_init netdev_create_hash(void)
7638 {
7639         int i;
7640         struct hlist_head *hash;
7641
7642         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7643         if (hash != NULL)
7644                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7645                         INIT_HLIST_HEAD(&hash[i]);
7646
7647         return hash;
7648 }
7649
7650 /* Initialize per network namespace state */
7651 static int __net_init netdev_init(struct net *net)
7652 {
7653         if (net != &init_net)
7654                 INIT_LIST_HEAD(&net->dev_base_head);
7655
7656         net->dev_name_head = netdev_create_hash();
7657         if (net->dev_name_head == NULL)
7658                 goto err_name;
7659
7660         net->dev_index_head = netdev_create_hash();
7661         if (net->dev_index_head == NULL)
7662                 goto err_idx;
7663
7664         return 0;
7665
7666 err_idx:
7667         kfree(net->dev_name_head);
7668 err_name:
7669         return -ENOMEM;
7670 }
7671
7672 /**
7673  *      netdev_drivername - network driver for the device
7674  *      @dev: network device
7675  *
7676  *      Determine network driver for device.
7677  */
7678 const char *netdev_drivername(const struct net_device *dev)
7679 {
7680         const struct device_driver *driver;
7681         const struct device *parent;
7682         const char *empty = "";
7683
7684         parent = dev->dev.parent;
7685         if (!parent)
7686                 return empty;
7687
7688         driver = parent->driver;
7689         if (driver && driver->name)
7690                 return driver->name;
7691         return empty;
7692 }
7693
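/* Common helper for netdev_printk() and the netdev_<level>() wrappers:
 * if the netdev has a parent struct device, emit through dev_printk_emit()
 * so the parent driver and device names are included; otherwise fall back
 * to a plain printk().
 */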
7694 static void __netdev_printk(const char *level, const struct net_device *dev,
7695                             struct va_format *vaf)
7696 {
7697         if (dev && dev->dev.parent) {
7698                 dev_printk_emit(level[1] - '0',
7699                                 dev->dev.parent,
7700                                 "%s %s %s%s: %pV",
7701                                 dev_driver_string(dev->dev.parent),
7702                                 dev_name(dev->dev.parent),
7703                                 netdev_name(dev), netdev_reg_state(dev),
7704                                 vaf);
7705         } else if (dev) {
7706                 printk("%s%s%s: %pV",
7707                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7708         } else {
7709                 printk("%s(NULL net_device): %pV", level, vaf);
7710         }
7711 }
7712
7713 void netdev_printk(const char *level, const struct net_device *dev,
7714                    const char *format, ...)
7715 {
7716         struct va_format vaf;
7717         va_list args;
7718
7719         va_start(args, format);
7720
7721         vaf.fmt = format;
7722         vaf.va = &args;
7723
7724         __netdev_printk(level, dev, &vaf);
7725
7726         va_end(args);
7727 }
7728 EXPORT_SYMBOL(netdev_printk);
7729
7730 #define define_netdev_printk_level(func, level)                 \
7731 void func(const struct net_device *dev, const char *fmt, ...)   \
7732 {                                                               \
7733         struct va_format vaf;                                   \
7734         va_list args;                                           \
7735                                                                 \
7736         va_start(args, fmt);                                    \
7737                                                                 \
7738         vaf.fmt = fmt;                                          \
7739         vaf.va = &args;                                         \
7740                                                                 \
7741         __netdev_printk(level, dev, &vaf);                      \
7742                                                                 \
7743         va_end(args);                                           \
7744 }                                                               \
7745 EXPORT_SYMBOL(func);
7746
7747 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7748 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7749 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7750 define_netdev_printk_level(netdev_err, KERN_ERR);
7751 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7752 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7753 define_netdev_printk_level(netdev_info, KERN_INFO);
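/* Typical use from a driver, assuming "dev" is its net_device and "queue"
 * is just an illustrative argument:
 *
 *	netdev_err(dev, "tx timeout on queue %d\n", queue);
 *
 * When a parent device is present, the message is prefixed with the driver
 * name, the parent device name and the interface name.
 */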
7754
7755 static void __net_exit netdev_exit(struct net *net)
7756 {
7757         kfree(net->dev_name_head);
7758         kfree(net->dev_index_head);
7759 }
7760
7761 static struct pernet_operations __net_initdata netdev_net_ops = {
7762         .init = netdev_init,
7763         .exit = netdev_exit,
7764 };
7765
7766 static void __net_exit default_device_exit(struct net *net)
7767 {
7768         struct net_device *dev, *aux;
7769         /*
7770          * Push all migratable network devices back to the
7771          * initial network namespace
7772          */
7773         rtnl_lock();
7774         for_each_netdev_safe(net, dev, aux) {
7775                 int err;
7776                 char fb_name[IFNAMSIZ];
7777
7778                 /* Ignore unmovable devices (e.g. the loopback device) */
7779                 if (dev->features & NETIF_F_NETNS_LOCAL)
7780                         continue;
7781
7782                 /* Leave virtual devices for the generic cleanup */
7783                 if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
7784                         continue;
7785
7786                 /* Push remaining network devices to init_net */
7787                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7788                 if (__dev_get_by_name(&init_net, fb_name))
7789                         snprintf(fb_name, IFNAMSIZ, "dev%%d");
7790                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7791                 if (err) {
7792                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7793                                  __func__, dev->name, err);
7794                         BUG();
7795                 }
7796         }
7797         rtnl_unlock();
7798 }
7799
7800 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7801 {
7802         /* Return with the rtnl_lock held when there are no network
7803          * devices unregistering in any network namespace in net_list.
7804          */
7805         struct net *net;
7806         bool unregistering;
7807         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7808
7809         add_wait_queue(&netdev_unregistering_wq, &wait);
7810         for (;;) {
7811                 unregistering = false;
7812                 rtnl_lock();
7813                 list_for_each_entry(net, net_list, exit_list) {
7814                         if (net->dev_unreg_count > 0) {
7815                                 unregistering = true;
7816                                 break;
7817                         }
7818                 }
7819                 if (!unregistering)
7820                         break;
7821                 __rtnl_unlock();
7822
7823                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7824         }
7825         remove_wait_queue(&netdev_unregistering_wq, &wait);
7826 }
7827
7828 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7829 {
7830         /* At exit, all network devices must be removed from a network
7831          * namespace.  Do this in the reverse order of registration.
7832          * Do this across as many network namespaces as possible to
7833          * improve batching efficiency.
7834          */
7835         struct net_device *dev;
7836         struct net *net;
7837         LIST_HEAD(dev_kill_list);
7838
7839         /* To prevent network device cleanup code from dereferencing
7840          * loopback devices or network devices that have been freed,
7841          * wait here for all pending unregistrations to complete
7842          * before unregistering the loopback device and allowing the
7843          * network namespace to be freed.
7844          *
7845          * The netdev todo list containing all network device
7846          * unregistrations that happen in default_device_exit_batch
7847          * will run in the rtnl_unlock() at the end of
7848          * default_device_exit_batch.
7849          */
7850         rtnl_lock_unregistering(net_list);
7851         list_for_each_entry(net, net_list, exit_list) {
7852                 for_each_netdev_reverse(net, dev) {
7853                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7854                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7855                         else
7856                                 unregister_netdevice_queue(dev, &dev_kill_list);
7857                 }
7858         }
7859         unregister_netdevice_many(&dev_kill_list);
7860         rtnl_unlock();
7861 }
7862
7863 static struct pernet_operations __net_initdata default_device_ops = {
7864         .exit = default_device_exit,
7865         .exit_batch = default_device_exit_batch,
7866 };
7867
7868 /*
7869  *      Initialize the DEV module. At boot time this walks the device list and
7870  *      unhooks any devices that fail to initialise (normally hardware not
7871  *      present) and leaves us with a valid list of present and active devices.
7872  *
7873  */
7874
7875 /*
7876  *       This is called single-threaded during boot, so no need
7877  *       to take the rtnl semaphore.
7878  */
7879 static int __init net_dev_init(void)
7880 {
7881         int i, rc = -ENOMEM;
7882
7883         BUG_ON(!dev_boot_phase);
7884
7885         if (dev_proc_init())
7886                 goto out;
7887
7888         if (netdev_kobject_init())
7889                 goto out;
7890
7891         INIT_LIST_HEAD(&ptype_all);
7892         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7893                 INIT_LIST_HEAD(&ptype_base[i]);
7894
7895         INIT_LIST_HEAD(&offload_base);
7896
7897         if (register_pernet_subsys(&netdev_net_ops))
7898                 goto out;
7899
7900         /*
7901          *      Initialise the packet receive queues.
7902          */
7903
7904         for_each_possible_cpu(i) {
7905                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7906
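                /* input_pkt_queue feeds the per-CPU backlog (netif_rx/RPS),
                 * process_queue is the backlog NAPI's working queue, and
                 * output_queue chains qdiscs waiting for net_tx_action().
                 */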
7907                 skb_queue_head_init(&sd->input_pkt_queue);
7908                 skb_queue_head_init(&sd->process_queue);
7909                 INIT_LIST_HEAD(&sd->poll_list);
7910                 sd->output_queue_tailp = &sd->output_queue;
7911 #ifdef CONFIG_RPS
7912                 sd->csd.func = rps_trigger_softirq;
7913                 sd->csd.info = sd;
7914                 sd->cpu = i;
7915 #endif
7916
7917                 sd->backlog.poll = process_backlog;
7918                 sd->backlog.weight = weight_p;
7919         }
7920
7921         dev_boot_phase = 0;
7922
7923         /* The loopback device is special: if any other network device
7924          * is present in a network namespace, the loopback device must
7925          * be present too. Since we now dynamically allocate and free the
7926          * loopback device, ensure this invariant is maintained by
7927          * keeping the loopback device as the first device on the
7928          * list of network devices, so that the loopback device
7929          * is the first device that appears and the last network device
7930          * that disappears.
7931          */
7932         if (register_pernet_device(&loopback_net_ops))
7933                 goto out;
7934
7935         if (register_pernet_device(&default_device_ops))
7936                 goto out;
7937
7938         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7939         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7940
7941         hotcpu_notifier(dev_cpu_callback, 0);
7942         dst_subsys_init();
7943         rc = 0;
7944 out:
7945         return rc;
7946 }
7947
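/* net_dev_init() runs once at subsys initcall time, i.e. before ordinary
 * device/module initcalls, so the core lists and softirqs above are ready
 * before network drivers start registering devices.
 */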
7948 subsys_initcall(net_dev_init);