net/core/dev.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      NET3    Protocol independent device support routines.
   4  *
   5  *      Derived from the non IP parts of dev.c 1.0.19
   6  *              Authors:        Ross Biro
   7  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
   8  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
   9  *
  10  *      Additional Authors:
  11  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  12  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  13  *              David Hinds <dahinds@users.sourceforge.net>
  14  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  15  *              Adam Sulmicki <adam@cfar.umd.edu>
  16  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  17  *
  18  *      Changes:
  19  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  20  *                                      to 2 if register_netdev gets called
  21  *                                      before net_dev_init & also removed a
  22  *                                      few lines of code in the process.
  23  *              Alan Cox        :       device private ioctl copies fields back.
  24  *              Alan Cox        :       Transmit queue code does relevant
  25  *                                      stunts to keep the queue safe.
  26  *              Alan Cox        :       Fixed double lock.
  27  *              Alan Cox        :       Fixed promisc NULL pointer trap
  28  *              ????????        :       Support the full private ioctl range
  29  *              Alan Cox        :       Moved ioctl permission check into
  30  *                                      drivers
  31  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  32  *              Alan Cox        :       100 backlog just doesn't cut it when
  33  *                                      you start doing multicast video 8)
  34  *              Alan Cox        :       Rewrote net_bh and list manager.
  35  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  36  *              Alan Cox        :       Took out transmit every packet pass
  37  *                                      Saved a few bytes in the ioctl handler
  38  *              Alan Cox        :       Network driver sets packet type before
  39  *                                      calling netif_rx. Saves a function
  40  *                                      call a packet.
  41  *              Alan Cox        :       Hashed net_bh()
  42  *              Richard Kooijman:       Timestamp fixes.
  43  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  44  *              Alan Cox        :       Device lock protection.
  45  *              Alan Cox        :       Fixed nasty side effect of device close
  46  *                                      changes.
  47  *              Rudi Cilibrasi  :       Pass the right thing to
  48  *                                      set_mac_address()
  49  *              Dave Miller     :       32bit quantity for the device lock to
  50  *                                      make it work out on a Sparc.
  51  *              Bjorn Ekwall    :       Added KERNELD hack.
  52  *              Alan Cox        :       Cleaned up the backlog initialise.
  53  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  54  *                                      1 device.
  55  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  56  *                                      is no device open function.
  57  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  58  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  59  *              Cyrus Durgin    :       Cleaned for KMOD
  60  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  61  *                                      A network device unload needs to purge
  62  *                                      the backlog queue.
  63  *      Paul Rusty Russell      :       SIOCSIFNAME
  64  *              Pekka Riikonen  :       Netdev boot-time settings code
  65  *              Andrew Morton   :       Make unregister_netdevice wait
  66  *                                      indefinitely on dev->refcnt
  67  *              J Hadi Salim    :       - Backlog queue sampling
  68  *                                      - netif_rx() feedback
  69  */
  70
  71 #include <linux/uaccess.h>
  72 #include <linux/bitops.h>
  73 #include <linux/capability.h>
  74 #include <linux/cpu.h>
  75 #include <linux/types.h>
  76 #include <linux/kernel.h>
  77 #include <linux/hash.h>
  78 #include <linux/slab.h>
  79 #include <linux/sched.h>
  80 #include <linux/sched/mm.h>
  81 #include <linux/mutex.h>
  82 #include <linux/rwsem.h>
  83 #include <linux/string.h>
  84 #include <linux/mm.h>
  85 #include <linux/socket.h>
  86 #include <linux/sockios.h>
  87 #include <linux/errno.h>
  88 #include <linux/interrupt.h>
  89 #include <linux/if_ether.h>
  90 #include <linux/netdevice.h>
  91 #include <linux/etherdevice.h>
  92 #include <linux/ethtool.h>
  93 #include <linux/skbuff.h>
  94 #include <linux/bpf.h>
  95 #include <linux/bpf_trace.h>
  96 #include <net/net_namespace.h>
  97 #include <net/sock.h>
  98 #include <net/busy_poll.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dsa.h>
 102 #include <net/dst.h>
 103 #include <net/dst_metadata.h>
 104 #include <net/pkt_sched.h>
 105 #include <net/pkt_cls.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/iw_handler.h>
 115 #include <asm/current.h>
 116 #include <linux/audit.h>
 117 #include <linux/dmaengine.h>
 118 #include <linux/err.h>
 119 #include <linux/ctype.h>
 120 #include <linux/if_arp.h>
 121 #include <linux/if_vlan.h>
 122 #include <linux/ip.h>
 123 #include <net/ip.h>
 124 #include <net/mpls.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130 #include <trace/events/net.h>
 131 #include <trace/events/skb.h>
 132 #include <linux/inetdevice.h>
 133 #include <linux/cpu_rmap.h>
 134 #include <linux/static_key.h>
 135 #include <linux/hashtable.h>
 136 #include <linux/vmalloc.h>
 137 #include <linux/if_macvlan.h>
 138 #include <linux/errqueue.h>
 139 #include <linux/hrtimer.h>
 140 #include <linux/netfilter_ingress.h>
 141 #include <linux/crash_dump.h>
 142 #include <linux/sctp.h>
 143 #include <net/udp_tunnel.h>
 144 #include <linux/net_namespace.h>
 145 #include <linux/indirect_call_wrapper.h>
 146 #include <net/devlink.h>
 147 #include <linux/pm_runtime.h>
 148 #include <linux/prandom.h>
 149
 150 #include "net-sysfs.h"
 151
 152 #define MAX_GRO_SKBS 8
 153
 154 /* This should be increased if a protocol with a bigger head is added. */
 155 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 156
 157 static DEFINE_SPINLOCK(ptype_lock);
 158 static DEFINE_SPINLOCK(offload_lock);
 159 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 160 struct list_head ptype_all __read_mostly;       /* Taps */
 161 static struct list_head offload_base __read_mostly;
 162
 163 static int netif_rx_internal(struct sk_buff *skb);
 164 static int call_netdevice_notifiers_info(unsigned long val,
 165                                          struct netdev_notifier_info *info);
 166 static int call_netdevice_notifiers_extack(unsigned long val,
 167                                            struct net_device *dev,
 168                                            struct netlink_ext_ack *extack);
 169 static struct napi_struct *napi_by_id(unsigned int napi_id);
 170
 171 /*
 172  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 173  * semaphore.
 174  *
 175  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 176  *
 177  * Writers must hold the rtnl semaphore while they loop through the
 178  * dev_base_head list, and hold dev_base_lock for writing when they do the
 179  * actual updates.  This allows pure readers to access the list even
 180  * while a writer is preparing to update it.
 181  *
 182  * To put it another way, dev_base_lock is held for writing only to
 183  * protect against pure readers; the rtnl semaphore provides the
 184  * protection against other writers.
 185  *
 186  * See, for example usages, register_netdevice() and
 187  * unregister_netdevice(), which must be called with the rtnl
 188  * semaphore held.
 189  */
 190 DEFINE_RWLOCK(dev_base_lock);
 191 EXPORT_SYMBOL(dev_base_lock);
 192
 193 static DEFINE_MUTEX(ifalias_mutex);
 194
 195 /* protects napi_hash addition/deletion and napi_gen_id */
 196 static DEFINE_SPINLOCK(napi_hash_lock);
 197
 198 static unsigned int napi_gen_id = NR_CPUS;
 199 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 200
 201 static DECLARE_RWSEM(devnet_rename_sem);
 202
 203 static inline void dev_base_seq_inc(struct net *net)
 204 {
 205         while (++net->dev_base_seq == 0)
 206                 ;
 207 }
 208
 209 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 210 {
 211         unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 212
 213         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 214 }
 215
 216 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 217 {
 218         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 219 }
 220
 221 static inline void rps_lock(struct softnet_data *sd)
 222 {
 223 #ifdef CONFIG_RPS
 224         spin_lock(&sd->input_pkt_queue.lock);
 225 #endif
 226 }
 227
 228 static inline void rps_unlock(struct softnet_data *sd)
 229 {
 230 #ifdef CONFIG_RPS
 231         spin_unlock(&sd->input_pkt_queue.lock);
 232 #endif
 233 }
 234
 235 static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
 236                                                        const char *name)
 237 {
 238         struct netdev_name_node *name_node;
 239
 240         name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
 241         if (!name_node)
 242                 return NULL;
 243         INIT_HLIST_NODE(&name_node->hlist);
 244         name_node->dev = dev;
 245         name_node->name = name;
 246         return name_node;
 247 }
 248
 249 static struct netdev_name_node *
 250 netdev_name_node_head_alloc(struct net_device *dev)
 251 {
 252         struct netdev_name_node *name_node;
 253
 254         name_node = netdev_name_node_alloc(dev, dev->name);
 255         if (!name_node)
 256                 return NULL;
 257         INIT_LIST_HEAD(&name_node->list);
 258         return name_node;
 259 }
 260
/*
 * Free the memory of @name_node itself.  The string that ->name points to
 * is not freed here.
 */
static void netdev_name_node_free(struct netdev_name_node *name_node)
{
	kfree(name_node);
}
 265
 266 static void netdev_name_node_add(struct net *net,
 267                                  struct netdev_name_node *name_node)
 268 {
 269         hlist_add_head_rcu(&name_node->hlist,
 270                            dev_name_hash(net, name_node->name));
 271 }
 272
/*
 * Unlink @name_node from the name hash using the RCU-aware removal
 * primitive.  The node is not freed here.
 */
static void netdev_name_node_del(struct netdev_name_node *name_node)
{
	hlist_del_rcu(&name_node->hlist);
}
 277
 278 static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
 279                                                         const char *name)
 280 {
 281         struct hlist_head *head = dev_name_hash(net, name);
 282         struct netdev_name_node *name_node;
 283
 284         hlist_for_each_entry(name_node, head, hlist)
 285                 if (!strcmp(name_node->name, name))
 286                         return name_node;
 287         return NULL;
 288 }
 289
 290 static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
 291                                                             const char *name)
 292 {
 293         struct hlist_head *head = dev_name_hash(net, name);
 294         struct netdev_name_node *name_node;
 295
 296         hlist_for_each_entry_rcu(name_node, head, hlist)
 297                 if (!strcmp(name_node->name, name))
 298                         return name_node;
 299         return NULL;
 300 }
 301
 302 int netdev_name_node_alt_create(struct net_device *dev, const char *name)
 303 {
 304         struct netdev_name_node *name_node;
 305         struct net *net = dev_net(dev);
 306
 307         name_node = netdev_name_node_lookup(net, name);
 308         if (name_node)
 309                 return -EEXIST;
 310         name_node = netdev_name_node_alloc(dev, name);
 311         if (!name_node)
 312                 return -ENOMEM;
 313         netdev_name_node_add(net, name_node);
 314         /* The node that holds dev->name acts as a head of per-device list. */
 315         list_add_tail(&name_node->list, &dev->name_node->list);
 316
 317         return 0;
 318 }
 319 EXPORT_SYMBOL(netdev_name_node_alt_create);
 320
/*
 * Tear down one alternative name node: unlink it from its device's list and
 * from the name hash, then free both the name string and the node.
 *
 * NOTE(review): the node and its name are freed immediately after the
 * hlist_del_rcu() removal, with no grace period in this function, while
 * netdev_name_node_lookup_rcu() walks the same hash with the RCU iterator.
 * Confirm that callers guarantee no RCU reader can still hold the node.
 */
static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
	list_del(&name_node->list);
	netdev_name_node_del(name_node);
	/* Unlike the node for dev->name, an alternative node's name is freed. */
	kfree(name_node->name);
	netdev_name_node_free(name_node);
}
 328
 329 int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
 330 {
 331         struct netdev_name_node *name_node;
 332         struct net *net = dev_net(dev);
 333
 334         name_node = netdev_name_node_lookup(net, name);
 335         if (!name_node)
 336                 return -ENOENT;
 337         /* lookup might have found our primary name or a name belonging
 338          * to another device.
 339          */
 340         if (name_node == dev->name_node || name_node->dev != dev)
 341                 return -EINVAL;
 342
 343         __netdev_name_node_alt_destroy(name_node);
 344
 345         return 0;
 346 }
 347 EXPORT_SYMBOL(netdev_name_node_alt_destroy);
 348
 349 static void netdev_name_node_alt_flush(struct net_device *dev)
 350 {
 351         struct netdev_name_node *name_node, *tmp;
 352
 353         list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
 354                 __netdev_name_node_alt_destroy(name_node);
 355 }
 356
 357 /* Device list insertion */
 358 static void list_netdevice(struct net_device *dev)
 359 {
 360         struct net *net = dev_net(dev);
 361
 362         ASSERT_RTNL();
 363
 364         write_lock_bh(&dev_base_lock);
 365         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 366         netdev_name_node_add(net, dev->name_node);
 367         hlist_add_head_rcu(&dev->index_hlist,
 368                            dev_index_hash(net, dev->ifindex));
 369         write_unlock_bh(&dev_base_lock);
 370
 371         dev_base_seq_inc(net);
 372 }
 373
 374 /* Device list removal
 375  * caller must respect a RCU grace period before freeing/reusing dev
 376  */
 377 static void unlist_netdevice(struct net_device *dev)
 378 {
 379         ASSERT_RTNL();
 380
 381         /* Unlink dev from the device chain */
 382         write_lock_bh(&dev_base_lock);
 383         list_del_rcu(&dev->dev_list);
 384         netdev_name_node_del(dev->name_node);
 385         hlist_del_rcu(&dev->index_hlist);
 386         write_unlock_bh(&dev_base_lock);
 387
 388         dev_base_seq_inc(dev_net(dev));
 389 }
 390
 391 /*
 392  *      Our notifier list
 393  */
 394
 395 static RAW_NOTIFIER_HEAD(netdev_chain);
 396
 397 /*
 398  *      Device drivers call our routines to queue packets here. We empty the
 399  *      queue in the local softnet handler.
 400  */
 401
 402 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 403 EXPORT_PER_CPU_SYMBOL(softnet_data);
 404
 405 #ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the same
 * index selects the device type, its lockdep class name and its slot in the
 * two key arrays below, so entries must be kept in the same order.  The last
 * entry doubles as the default for device types that are not listed.
 */
static const unsigned short netdev_lock_type[] = {
	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, indexed like netdev_lock_type[]. */
static const char *const netdev_lock_name[] = {
	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lock class key per table entry, for the xmit and address-list locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 446
 447 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 448 {
 449         int i;
 450
 451         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 452                 if (netdev_lock_type[i] == dev_type)
 453                         return i;
 454         /* the last key is used by default */
 455         return ARRAY_SIZE(netdev_lock_type) - 1;
 456 }
 457
 458 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 459                                                  unsigned short dev_type)
 460 {
 461         int i;
 462
 463         i = netdev_lock_pos(dev_type);
 464         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 465                                    netdev_lock_name[i]);
 466 }
 467
 468 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 469 {
 470         int i;
 471
 472         i = netdev_lock_pos(dev->type);
 473         lockdep_set_class_and_name(&dev->addr_list_lock,
 474                                    &netdev_addr_lock_key[i],
 475                                    netdev_lock_name[i]);
 476 }
 477 #else
/* Without CONFIG_LOCKDEP there is no lock class to assign: empty stub. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
 482
/* Without CONFIG_LOCKDEP there is no lock class to assign: empty stub. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 486 #endif
 487
 488 /*******************************************************************************
 489  *
 490  *              Protocol management and registration routines
 491  *
 492  *******************************************************************************/
 493
 494
 495 /*
 496  *      Add a protocol ID to the list. Now that the input handler is
 497  *      smarter we can dispense with all the messy stuff that used to be
 498  *      here.
 499  *
 500  *      BEWARE!!! Protocol handlers, mangling input packets,
 501  *      MUST BE last in hash buckets and checking protocol handlers
 502  *      MUST start from promiscuous ptype_all chain in net_bh.
 503  *      It is true now, do not change it.
 504  *      Explanation follows: if protocol handler, mangling packet, will
 505  *      be the first on list, it is not able to sense, that packet
 506  *      is cloned and should be copied-on-write, so that it will
 507  *      change it and subsequent readers will get broken packet.
 508  *                                                      --ANK (980803)
 509  */
 510
 511 static inline struct list_head *ptype_head(const struct packet_type *pt)
 512 {
 513         if (pt->type == htons(ETH_P_ALL))
 514                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 515         else
 516                 return pt->dev ? &pt->dev->ptype_specific :
 517                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 518 }
 519
 520 /**
 521  *      dev_add_pack - add packet handler
 522  *      @pt: packet type declaration
 523  *
 524  *      Add a protocol handler to the networking stack. The passed &packet_type
 525  *      is linked into kernel lists and may not be freed until it has been
 526  *      removed from the kernel lists.
 527  *
 528  *      This call does not sleep therefore it can not
 529  *      guarantee all CPU's that are in middle of receiving packets
 530  *      will see the new packet type (until the next received packet).
 531  */
 532
 533 void dev_add_pack(struct packet_type *pt)
 534 {
 535         struct list_head *head = ptype_head(pt);
 536
 537         spin_lock(&ptype_lock);
 538         list_add_rcu(&pt->list, head);
 539         spin_unlock(&ptype_lock);
 540 }
 541 EXPORT_SYMBOL(dev_add_pack);
 542
 543 /**
 544  *      __dev_remove_pack        - remove packet handler
 545  *      @pt: packet type declaration
 546  *
 547  *      Remove a protocol handler that was previously added to the kernel
 548  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 549  *      from the kernel lists and can be freed or reused once this function
 550  *      returns.
 551  *
 552  *      The packet type might still be in use by receivers
 553  *      and must not be freed until after all the CPU's have gone
 554  *      through a quiescent state.
 555  */
 556 void __dev_remove_pack(struct packet_type *pt)
 557 {
 558         struct list_head *head = ptype_head(pt);
 559         struct packet_type *pt1;
 560
 561         spin_lock(&ptype_lock);
 562
 563         list_for_each_entry(pt1, head, list) {
 564                 if (pt == pt1) {
 565                         list_del_rcu(&pt->list);
 566                         goto out;
 567                 }
 568         }
 569
 570         pr_warn("dev_remove_pack: %p not found\n", pt);
 571 out:
 572         spin_unlock(&ptype_lock);
 573 }
 574 EXPORT_SYMBOL(__dev_remove_pack);
 575
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* Unlink first ... */
	__dev_remove_pack(pt);

	/* ... then wait, so that no receiver is still using @pt on return. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
 595
 596
 597 /**
 598  *      dev_add_offload - register offload handlers
 599  *      @po: protocol offload declaration
 600  *
 601  *      Add protocol offload handlers to the networking stack. The passed
 602  *      &proto_offload is linked into kernel lists and may not be freed until
 603  *      it has been removed from the kernel lists.
 604  *
 605  *      This call does not sleep therefore it can not
 606  *      guarantee all CPU's that are in middle of receiving packets
 607  *      will see the new offload handlers (until the next received packet).
 608  */
 609 void dev_add_offload(struct packet_offload *po)
 610 {
 611         struct packet_offload *elem;
 612
 613         spin_lock(&offload_lock);
 614         list_for_each_entry(elem, &offload_base, list) {
 615                 if (po->priority < elem->priority)
 616                         break;
 617         }
 618         list_add_rcu(&po->list, elem->list.prev);
 619         spin_unlock(&offload_lock);
 620 }
 621 EXPORT_SYMBOL(dev_add_offload);
 622
 623 /**
 624  *      __dev_remove_offload     - remove offload handler
 625  *      @po: packet offload declaration
 626  *
 627  *      Remove a protocol offload handler that was previously added to the
 628  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 629  *      is removed from the kernel lists and can be freed or reused once this
 630  *      function returns.
 631  *
 632  *      The packet type might still be in use by receivers
 633  *      and must not be freed until after all the CPU's have gone
 634  *      through a quiescent state.
 635  */
 636 static void __dev_remove_offload(struct packet_offload *po)
 637 {
 638         struct list_head *head = &offload_base;
 639         struct packet_offload *po1;
 640
 641         spin_lock(&offload_lock);
 642
 643         list_for_each_entry(po1, head, list) {
 644                 if (po == po1) {
 645                         list_del_rcu(&po->list);
 646                         goto out;
 647                 }
 648         }
 649
 650         pr_warn("dev_remove_offload: %p not found\n", po);
 651 out:
 652         spin_unlock(&offload_lock);
 653 }
 654
/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	/* Unlink first ... */
	__dev_remove_offload(po);

	/* ... then wait, so that no receiver is still using @po on return. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
 674
 675 /******************************************************************************
 676  *
 677  *                    Device Boot-time Settings Routines
 678  *
 679  ******************************************************************************/
 680
 681 /* Boot time configuration table */
 682 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 683
 684 /**
 685  *      netdev_boot_setup_add   - add new setup entry
 686  *      @name: name of the device
 687  *      @map: configured settings for the device
 688  *
 689  *      Adds new setup entry to the dev_boot_setup list.  The function
 690  *      returns 0 on error and 1 on success.  This is a generic routine to
 691  *      all netdevices.
 692  */
 693 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 694 {
 695         struct netdev_boot_setup *s;
 696         int i;
 697
 698         s = dev_boot_setup;
 699         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 700                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 701                         memset(s[i].name, 0, sizeof(s[i].name));
 702                         strlcpy(s[i].name, name, IFNAMSIZ);
 703                         memcpy(&s[i].map, map, sizeof(s[i].map));
 704                         break;
 705                 }
 706         }
 707
 708         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 709 }
 710
 711 /**
 712  * netdev_boot_setup_check      - check boot time settings
 713  * @dev: the netdevice
 714  *
 715  * Check boot time settings for the device.
 716  * The found settings are set for the device to be used
 717  * later in the device probing.
 718  * Returns 0 if no settings found, 1 if they are.
 719  */
 720 int netdev_boot_setup_check(struct net_device *dev)
 721 {
 722         struct netdev_boot_setup *s = dev_boot_setup;
 723         int i;
 724
 725         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 726                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 727                     !strcmp(dev->name, s[i].name)) {
 728                         dev->irq = s[i].map.irq;
 729                         dev->base_addr = s[i].map.base_addr;
 730                         dev->mem_start = s[i].map.mem_start;
 731                         dev->mem_end = s[i].map.mem_end;
 732                         return 1;
 733                 }
 734         }
 735         return 0;
 736 }
 737 EXPORT_SYMBOL(netdev_boot_setup_check);
 738
 739
 740 /**
 741  * netdev_boot_base     - get address from boot time settings
 742  * @prefix: prefix for network device
 743  * @unit: id for network device
 744  *
 745  * Check boot time settings for the base address of device.
 746  * The found settings are set for the device to be used
 747  * later in the device probing.
 748  * Returns 0 if no settings found.
 749  */
 750 unsigned long netdev_boot_base(const char *prefix, int unit)
 751 {
 752         const struct netdev_boot_setup *s = dev_boot_setup;
 753         char name[IFNAMSIZ];
 754         int i;
 755
 756         sprintf(name, "%s%d", prefix, unit);
 757
 758         /*
 759          * If device already registered then return base of 1
 760          * to indicate not to probe for this interface
 761          */
 762         if (__dev_get_by_name(&init_net, name))
 763                 return 1;
 764
 765         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 766                 if (!strcmp(name, s[i].name))
 767                         return s[i].map.base_addr;
 768         return 0;
 769 }
 770
 771 /*
 772  * Saves at boot time configured settings for any netdevice.
 773  */
 774 int __init netdev_boot_setup(char *str)
 775 {
 776         int ints[5];
 777         struct ifmap map;
 778
 779         str = get_options(str, ARRAY_SIZE(ints), ints);
 780         if (!str || !*str)
 781                 return 0;
 782
 783         /* Save settings */
 784         memset(&map, 0, sizeof(map));
 785         if (ints[0] > 0)
 786                 map.irq = ints[1];
 787         if (ints[0] > 1)
 788                 map.base_addr = ints[2];
 789         if (ints[0] > 2)
 790                 map.mem_start = ints[3];
 791         if (ints[0] > 3)
 792                 map.mem_end = ints[4];
 793
 794         /* Add new entry to the list */
 795         return netdev_boot_setup_add(str, &map);
 796 }
 797
 798 __setup("netdev=", netdev_boot_setup);
 799
 800 /*******************************************************************************
 801  *
 802  *                          Device Interface Subroutines
 803  *
 804  *******************************************************************************/
 805
 806 /**
 807  *      dev_get_iflink  - get 'iflink' value of a interface
 808  *      @dev: targeted interface
 809  *
 810  *      Indicates the ifindex the interface is linked to.
 811  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 812  */
 813
 814 int dev_get_iflink(const struct net_device *dev)
 815 {
 816         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 817                 return dev->netdev_ops->ndo_get_iflink(dev);
 818
 819         return dev->ifindex;
 820 }
 821 EXPORT_SYMBOL(dev_get_iflink);
 822
 823 /**
 824  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 825  *      @dev: targeted interface
 826  *      @skb: The packet.
 827  *
 828  *      For better visibility of tunnel traffic OVS needs to retrieve
 829  *      egress tunnel information for a packet. Following API allows
 830  *      user to get this info.
 831  */
 832 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 833 {
 834         struct ip_tunnel_info *info;
 835
 836         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 837                 return -EINVAL;
 838
 839         info = skb_tunnel_info_unclone(skb);
 840         if (!info)
 841                 return -ENOMEM;
 842         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 843                 return -EINVAL;
 844
 845         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 846 }
 847 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 848
 849 /**
 850  *      __dev_get_by_name       - find a device by its name
 851  *      @net: the applicable net namespace
 852  *      @name: name to find
 853  *
 854  *      Find an interface by name. Must be called under RTNL semaphore
 855  *      or @dev_base_lock. If the name is found a pointer to the device
 856  *      is returned. If the name is not found then %NULL is returned. The
 857  *      reference counters are not incremented so the caller must be
 858  *      careful with locks.
 859  */
 860
 861 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 862 {
 863         struct netdev_name_node *node_name;
 864
 865         node_name = netdev_name_node_lookup(net, name);
 866         return node_name ? node_name->dev : NULL;
 867 }
 868 EXPORT_SYMBOL(__dev_get_by_name);
 869
 870 /**
 871  * dev_get_by_name_rcu  - find a device by its name
 872  * @net: the applicable net namespace
 873  * @name: name to find
 874  *
 875  * Find an interface by name.
 876  * If the name is found a pointer to the device is returned.
 877  * If the name is not found then %NULL is returned.
 878  * The reference counters are not incremented so the caller must be
 879  * careful with locks. The caller must hold RCU lock.
 880  */
 881
 882 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 883 {
 884         struct netdev_name_node *node_name;
 885
 886         node_name = netdev_name_node_lookup_rcu(net, name);
 887         return node_name ? node_name->dev : NULL;
 888 }
 889 EXPORT_SYMBOL(dev_get_by_name_rcu);
 890
 891 /**
 892  *      dev_get_by_name         - find a device by its name
 893  *      @net: the applicable net namespace
 894  *      @name: name to find
 895  *
 896  *      Find an interface by name. This can be called from any
 897  *      context and does its own locking. The returned handle has
 898  *      the usage count incremented and the caller must use dev_put() to
 899  *      release it when it is no longer needed. %NULL is returned if no
 900  *      matching device is found.
 901  */
 902
 903 struct net_device *dev_get_by_name(struct net *net, const char *name)
 904 {
 905         struct net_device *dev;
 906
 907         rcu_read_lock();
 908         dev = dev_get_by_name_rcu(net, name);
 909         if (dev)
 910                 dev_hold(dev);
 911         rcu_read_unlock();
 912         return dev;
 913 }
 914 EXPORT_SYMBOL(dev_get_by_name);
 915
 916 /**
 917  *      __dev_get_by_index - find a device by its ifindex
 918  *      @net: the applicable net namespace
 919  *      @ifindex: index of device
 920  *
 921  *      Search for an interface by index. Returns %NULL if the device
 922  *      is not found or a pointer to the device. The device has not
 923  *      had its reference counter increased so the caller must be careful
 924  *      about locking. The caller must hold either the RTNL semaphore
 925  *      or @dev_base_lock.
 926  */
 927
 928 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 929 {
 930         struct net_device *dev;
 931         struct hlist_head *head = dev_index_hash(net, ifindex);
 932
 933         hlist_for_each_entry(dev, head, index_hlist)
 934                 if (dev->ifindex == ifindex)
 935                         return dev;
 936
 937         return NULL;
 938 }
 939 EXPORT_SYMBOL(__dev_get_by_index);
 940
 941 /**
 942  *      dev_get_by_index_rcu - find a device by its ifindex
 943  *      @net: the applicable net namespace
 944  *      @ifindex: index of device
 945  *
 946  *      Search for an interface by index. Returns %NULL if the device
 947  *      is not found or a pointer to the device. The device has not
 948  *      had its reference counter increased so the caller must be careful
 949  *      about locking. The caller must hold RCU lock.
 950  */
 951
 952 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 953 {
 954         struct net_device *dev;
 955         struct hlist_head *head = dev_index_hash(net, ifindex);
 956
 957         hlist_for_each_entry_rcu(dev, head, index_hlist)
 958                 if (dev->ifindex == ifindex)
 959                         return dev;
 960
 961         return NULL;
 962 }
 963 EXPORT_SYMBOL(dev_get_by_index_rcu);
 964
 965
 966 /**
 967  *      dev_get_by_index - find a device by its ifindex
 968  *      @net: the applicable net namespace
 969  *      @ifindex: index of device
 970  *
 971  *      Search for an interface by index. Returns NULL if the device
 972  *      is not found or a pointer to the device. The device returned has
 973  *      had a reference added and the pointer is safe until the user calls
 974  *      dev_put to indicate they have finished with it.
 975  */
 976
 977 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 978 {
 979         struct net_device *dev;
 980
 981         rcu_read_lock();
 982         dev = dev_get_by_index_rcu(net, ifindex);
 983         if (dev)
 984                 dev_hold(dev);
 985         rcu_read_unlock();
 986         return dev;
 987 }
 988 EXPORT_SYMBOL(dev_get_by_index);
 989
 990 /**
 991  *      dev_get_by_napi_id - find a device by napi_id
 992  *      @napi_id: ID of the NAPI struct
 993  *
 994  *      Search for an interface by NAPI ID. Returns %NULL if the device
 995  *      is not found or a pointer to the device. The device has not had
 996  *      its reference counter increased so the caller must be careful
 997  *      about locking. The caller must hold RCU lock.
 998  */
 999
1000 struct net_device *dev_get_by_napi_id(unsigned int napi_id)
1001 {
1002         struct napi_struct *napi;
1003
1004         WARN_ON_ONCE(!rcu_read_lock_held());
1005
1006         if (napi_id < MIN_NAPI_ID)
1007                 return NULL;
1008
1009         napi = napi_by_id(napi_id);
1010
1011         return napi ? napi->dev : NULL;
1012 }
1013 EXPORT_SYMBOL(dev_get_by_napi_id);
1014
1015 /**
1016  *      netdev_get_name - get a netdevice name, knowing its ifindex.
1017  *      @net: network namespace
1018  *      @name: a pointer to the buffer where the name will be stored.
1019  *      @ifindex: the ifindex of the interface to get the name from.
1020  */
1021 int netdev_get_name(struct net *net, char *name, int ifindex)
1022 {
1023         struct net_device *dev;
1024         int ret;
1025
1026         down_read(&devnet_rename_sem);
1027         rcu_read_lock();
1028
1029         dev = dev_get_by_index_rcu(net, ifindex);
1030         if (!dev) {
1031                 ret = -ENODEV;
1032                 goto out;
1033         }
1034
1035         strcpy(name, dev->name);
1036
1037         ret = 0;
1038 out:
1039         rcu_read_unlock();
1040         up_read(&devnet_rename_sem);
1041         return ret;
1042 }
1043
1044 /**
1045  *      dev_getbyhwaddr_rcu - find a device by its hardware address
1046  *      @net: the applicable net namespace
1047  *      @type: media type of device
1048  *      @ha: hardware address
1049  *
1050  *      Search for an interface by MAC address. Returns NULL if the device
1051  *      is not found or a pointer to the device.
1052  *      The caller must hold RCU or RTNL.
1053  *      The returned device has not had its ref count increased
1054  *      and the caller must therefore be careful about locking
1055  *
1056  */
1057
1058 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
1059                                        const char *ha)
1060 {
1061         struct net_device *dev;
1062
1063         for_each_netdev_rcu(net, dev)
1064                 if (dev->type == type &&
1065                     !memcmp(dev->dev_addr, ha, dev->addr_len))
1066                         return dev;
1067
1068         return NULL;
1069 }
1070 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
1071
1072 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1073 {
1074         struct net_device *dev;
1075
1076         ASSERT_RTNL();
1077         for_each_netdev(net, dev)
1078                 if (dev->type == type)
1079                         return dev;
1080
1081         return NULL;
1082 }
1083 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
1084
1085 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1086 {
1087         struct net_device *dev, *ret = NULL;
1088
1089         rcu_read_lock();
1090         for_each_netdev_rcu(net, dev)
1091                 if (dev->type == type) {
1092                         dev_hold(dev);
1093                         ret = dev;
1094                         break;
1095                 }
1096         rcu_read_unlock();
1097         return ret;
1098 }
1099 EXPORT_SYMBOL(dev_getfirstbyhwtype);
1100
1101 /**
1102  *      __dev_get_by_flags - find any device with given flags
1103  *      @net: the applicable net namespace
1104  *      @if_flags: IFF_* values
1105  *      @mask: bitmask of bits in if_flags to check
1106  *
1107  *      Search for any interface with the given flags. Returns NULL if a device
1108  *      is not found or a pointer to the device. Must be called inside
1109  *      rtnl_lock(), and result refcount is unchanged.
1110  */
1111
1112 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1113                                       unsigned short mask)
1114 {
1115         struct net_device *dev, *ret;
1116
1117         ASSERT_RTNL();
1118
1119         ret = NULL;
1120         for_each_netdev(net, dev) {
1121                 if (((dev->flags ^ if_flags) & mask) == 0) {
1122                         ret = dev;
1123                         break;
1124                 }
1125         }
1126         return ret;
1127 }
1128 EXPORT_SYMBOL(__dev_get_by_flags);
1129
/**
 *      dev_valid_name - check if name is okay for network device
 *      @name: name string
 *
 *      Network device names need to be valid file names to
 *      allow sysfs to work.  A name is rejected when it is empty, does
 *      not terminate within IFNAMSIZ bytes, is "." or "..", or contains
 *      '/', ':' or any kind of whitespace.
 *
 *      Returns true if the name is acceptable, false otherwise.
 */
bool dev_valid_name(const char *name)
{
        const char *c;

        /* Empty, or no terminator within the first IFNAMSIZ bytes */
        if (name[0] == '\0' || strnlen(name, IFNAMSIZ) == IFNAMSIZ)
                return false;

        if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
                return false;

        for (c = name; *c != '\0'; c++) {
                if (*c == '/' || *c == ':' || isspace(*c))
                        return false;
        }

        return true;
}
1154 EXPORT_SYMBOL(dev_valid_name);
1155
/**
 *      __dev_alloc_name - allocate a name for a device
 *      @net: network namespace to allocate the device name in
 *      @name: name format string
 *      @buf:  scratch buffer and result name string; written with a
 *             limit of IFNAMSIZ bytes
 *
 *      Passed a format string - eg "lt%d" it will try and find a suitable
 *      id. It scans list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *
 *      If @name contains no '%' no scan is done: unit 0 is used and the
 *      name is taken literally.
 *
 *      Returns the number of the unit assigned or a negative errno code:
 *      -EINVAL if @name is not a valid device name or is not a format with
 *      exactly one "%d", -ENOMEM if the bitmap page cannot be allocated,
 *      -ENFILE if the resulting name is already in use.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        if (!dev_valid_name(name))
                return -EINVAL;

        p = strchr(name, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user.  There must be exactly one "%d" and no other
                 * "%" characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                for_each_netdev(net, d) {
                        struct netdev_name_node *name_node;
                        /* Names on the device's name_node list also occupy units */
                        list_for_each_entry(name_node, &d->name_node->list, list) {
                                if (!sscanf(name_node->name, name, &i))
                                        continue;
                                if (i < 0 || i >= max_netdevices)
                                        continue;

                                /*  avoid cases where sscanf is not exact inverse of printf */
                                snprintf(buf, IFNAMSIZ, name, i);
                                if (!strncmp(buf, name_node->name, IFNAMSIZ))
                                        set_bit(i, inuse);
                        }
                        /* Same check for the device's own name */
                        if (!sscanf(d->name, name, &i))
                                continue;
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /*  avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                /* Lowest unit number not marked as taken */
                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        /* Build the candidate name and make sure nothing else uses it */
        snprintf(buf, IFNAMSIZ, name, i);
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}
1235
1236 static int dev_alloc_name_ns(struct net *net,
1237                              struct net_device *dev,
1238                              const char *name)
1239 {
1240         char buf[IFNAMSIZ];
1241         int ret;
1242
1243         BUG_ON(!net);
1244         ret = __dev_alloc_name(net, name, buf);
1245         if (ret >= 0)
1246                 strlcpy(dev->name, buf, IFNAMSIZ);
1247         return ret;
1248 }
1249
/**
 *      dev_alloc_name - allocate a name for a device
 *      @dev: device
 *      @name: name format string
 *
 *      Given a format string such as "lt%d", find a suitable unit number
 *      in the device's own network namespace and store the resulting
 *      name in the device. The list of devices is scanned to build up a
 *      free map and the first empty slot is chosen. The caller must hold
 *      the dev_base or rtnl lock while allocating the name and adding the
 *      device in order to avoid duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
        struct net *net = dev_net(dev);

        return dev_alloc_name_ns(net, dev, name);
}
EXPORT_SYMBOL(dev_alloc_name);
1269
1270 static int dev_get_valid_name(struct net *net, struct net_device *dev,
1271                               const char *name)
1272 {
1273         BUG_ON(!net);
1274
1275         if (!dev_valid_name(name))
1276                 return -EINVAL;
1277
1278         if (strchr(name, '%'))
1279                 return dev_alloc_name_ns(net, dev, name);
1280         else if (__dev_get_by_name(net, name))
1281                 return -EEXIST;
1282         else if (dev->name != name)
1283                 strlcpy(dev->name, name, IFNAMSIZ);
1284
1285         return 0;
1286 }
1287
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string); the buffer must be at least
 *                IFNAMSIZ bytes long, since IFNAMSIZ bytes are copied
 *                from it on the rollback path
 *
 *      Change name of a device, can pass format strings "eth%d"
 *      for wildcarding.
 *
 *      Must be called with RTNL held.
 *
 *      Returns a non-negative value on success (0 if the name is
 *      unchanged), -EBUSY if the device is up and lacks
 *      IFF_LIVE_RENAME_OK, or a negative errno from name validation,
 *      device_rename() or the NETDEV_CHANGENAME notifiers. When a
 *      notifier rejects the change the old name is restored first.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);

        /* Some auto-enslaved devices e.g. failover slaves are
         * special, as userspace might rename the device after
         * the interface had been brought up and running since
         * the point kernel initiated auto-enslavement. Allow
         * live name change even when these slave devices are
         * up and running.
         *
         * Typically, users of these auto-enslaving devices
         * don't actually care about slave name change, as
         * they are supposed to operate on master interface
         * directly.
         */
        if (dev->flags & IFF_UP &&
            likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
                return -EBUSY;

        down_write(&devnet_rename_sem);

        /* Renaming to the current name is a successful no-op */
        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                up_write(&devnet_rename_sem);
                return 0;
        }

        /* Keep the old name so it can be restored on failure */
        memcpy(oldname, dev->name, IFNAMSIZ);

        /* On success this has already written the new name into dev->name */
        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                up_write(&devnet_rename_sem);
                return err;
        }

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s\n", oldname);

        old_assign_type = dev->name_assign_type;
        dev->name_assign_type = NET_NAME_RENAMED;

        /*
         * Everything from here down runs once for the rename itself and,
         * if a notifier rejects it, a second time with the names swapped
         * to put the old name back.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                dev->name_assign_type = old_assign_type;
                up_write(&devnet_rename_sem);
                return ret;
        }

        up_write(&devnet_rename_sem);

        netdev_adjacent_rename_links(dev, oldname);

        /* Take the name node out, wait for RCU readers, then re-add it */
        write_lock_bh(&dev_base_lock);
        netdev_name_node_del(dev->name_node);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        write_lock_bh(&dev_base_lock);
        netdev_name_node_add(net, dev->name_node);
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /*
                         * First failure: remember the errno, swap old and
                         * new names and assign types, and redo the rename
                         * in the opposite direction.
                         */
                        err = ret;
                        down_write(&devnet_rename_sem);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
                        dev->name_assign_type = old_assign_type;
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        /* The notifiers failed again while rolling back */
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}
1390
1391 /**
1392  *      dev_set_alias - change ifalias of a device
1393  *      @dev: device
1394  *      @alias: name up to IFALIASZ
1395  *      @len: limit of bytes to copy from info
1396  *
1397  *      Set ifalias for a device,
1398  */
1399 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1400 {
1401         struct dev_ifalias *new_alias = NULL;
1402
1403         if (len >= IFALIASZ)
1404                 return -EINVAL;
1405
1406         if (len) {
1407                 new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1408                 if (!new_alias)
1409                         return -ENOMEM;
1410
1411                 memcpy(new_alias->ifalias, alias, len);
1412                 new_alias->ifalias[len] = 0;
1413         }
1414
1415         mutex_lock(&ifalias_mutex);
1416         new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1417                                         mutex_is_locked(&ifalias_mutex));
1418         mutex_unlock(&ifalias_mutex);
1419
1420         if (new_alias)
1421                 kfree_rcu(new_alias, rcuhead);
1422
1423         return len;
1424 }
1425 EXPORT_SYMBOL(dev_set_alias);
1426
1427 /**
1428  *      dev_get_alias - get ifalias of a device
1429  *      @dev: device
1430  *      @name: buffer to store name of ifalias
1431  *      @len: size of buffer
1432  *
1433  *      get ifalias for a device.  Caller must make sure dev cannot go
1434  *      away,  e.g. rcu read lock or own a reference count to device.
1435  */
1436 int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1437 {
1438         const struct dev_ifalias *alias;
1439         int ret = 0;
1440
1441         rcu_read_lock();
1442         alias = rcu_dereference(dev->ifalias);
1443         if (alias)
1444                 ret = snprintf(name, len, "%s", alias->ifalias);
1445         rcu_read_unlock();
1446
1447         return ret;
1448 }
1449
1450 /**
1451  *      netdev_features_change - device changes features
1452  *      @dev: device to cause notification
1453  *
1454  *      Called to indicate a device has changed features.
1455  */
1456 void netdev_features_change(struct net_device *dev)
1457 {
1458         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1459 }
1460 EXPORT_SYMBOL(netdev_features_change);
1461
1462 /**
1463  *      netdev_state_change - device changes state
1464  *      @dev: device to cause notification
1465  *
1466  *      Called to indicate a device has changed state. This function calls
1467  *      the notifier chains for netdev_chain and sends a NEWLINK message
1468  *      to the routing socket.
1469  */
1470 void netdev_state_change(struct net_device *dev)
1471 {
1472         if (dev->flags & IFF_UP) {
1473                 struct netdev_notifier_change_info change_info = {
1474                         .info.dev = dev,
1475                 };
1476
1477                 call_netdevice_notifiers_info(NETDEV_CHANGE,
1478                                               &change_info.info);
1479                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1480         }
1481 }
1482 EXPORT_SYMBOL(netdev_state_change);
1483
1484 /**
1485  * netdev_notify_peers - notify network peers about existence of @dev
1486  * @dev: network device
1487  *
1488  * Generate traffic such that interested network peers are aware of
1489  * @dev, such as by generating a gratuitous ARP. This may be used when
1490  * a device wants to inform the rest of the network about some sort of
1491  * reconfiguration such as a failover event or virtual machine
1492  * migration.
1493  */
1494 void netdev_notify_peers(struct net_device *dev)
1495 {
1496         rtnl_lock();
1497         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1498         call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1499         rtnl_unlock();
1500 }
1501 EXPORT_SYMBOL(netdev_notify_peers);
1502
1503 static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1504 {
1505         const struct net_device_ops *ops = dev->netdev_ops;
1506         int ret;
1507
1508         ASSERT_RTNL();
1509
1510         if (!netif_device_present(dev)) {
1511                 /* may be detached because parent is runtime-suspended */
1512                 if (dev->dev.parent)
1513                         pm_runtime_resume(dev->dev.parent);
1514                 if (!netif_device_present(dev))
1515                         return -ENODEV;
1516         }
1517
1518         /* Block netpoll from trying to do any rx path servicing.
1519          * If we don't do this there is a chance ndo_poll_controller
1520          * or ndo_poll may be running while we open the device
1521          */
1522         netpoll_poll_disable(dev);
1523
1524         ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1525         ret = notifier_to_errno(ret);
1526         if (ret)
1527                 return ret;
1528
1529         set_bit(__LINK_STATE_START, &dev->state);
1530
1531         if (ops->ndo_validate_addr)
1532                 ret = ops->ndo_validate_addr(dev);
1533
1534         if (!ret && ops->ndo_open)
1535                 ret = ops->ndo_open(dev);
1536
1537         netpoll_poll_enable(dev);
1538
1539         if (ret)
1540                 clear_bit(__LINK_STATE_START, &dev->state);
1541         else {
1542                 dev->flags |= IFF_UP;
1543                 dev_set_rx_mode(dev);
1544                 dev_activate(dev);
1545                 add_device_randomness(dev->dev_addr, dev->addr_len);
1546         }
1547
1548         return ret;
1549 }
1550
1551 /**
1552  *      dev_open        - prepare an interface for use.
1553  *      @dev: device to open
1554  *      @extack: netlink extended ack
1555  *
1556  *      Takes a device from down to up state. The device's private open
1557  *      function is invoked and then the multicast lists are loaded. Finally
1558  *      the device is moved into the up state and a %NETDEV_UP message is
1559  *      sent to the netdev notifier chain.
1560  *
1561  *      Calling this function on an active interface is a nop. On a failure
1562  *      a negative errno code is returned.
1563  */
1564 int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1565 {
1566         int ret;
1567
1568         if (dev->flags & IFF_UP)
1569                 return 0;
1570
1571         ret = __dev_open(dev, extack);
1572         if (ret < 0)
1573                 return ret;
1574
1575         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1576         call_netdevice_notifiers(NETDEV_UP, dev);
1577
1578         return ret;
1579 }
1580 EXPORT_SYMBOL(dev_open);
1581
/*
 * Take every device linked on @head (through its close_list member) down.
 *
 * Runs in three passes: first each device gets NETDEV_GOING_DOWN and has
 * __LINK_STATE_START cleared, then all devices are deactivated together
 * with dev_deactivate_many(), and finally each driver's ndo_stop() is
 * called and IFF_UP is cleared.  Netpoll is disabled for a device in the
 * first pass and re-enabled in the last.
 *
 * Must be called with RTNL held; may sleep.  NETDEV_DOWN is not sent
 * here.
 */
static void __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, close_list) {
                /* Temporarily disable netpoll until the interface is down */
                netpoll_poll_disable(dev);

                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_atomic(); /* Commit netif_running(). */
        }

        dev_deactivate_many(head);

        list_for_each_entry(dev, head, close_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *      Call the device specific close. This cannot fail.
                 *      Only if device is UP
                 *
                 *      We allow it to be called even after a DETACH hot-plug
                 *      event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                /* Pairs with netpoll_poll_disable() in the first loop */
                netpoll_poll_enable(dev);
        }
}
1625
1626 static void __dev_close(struct net_device *dev)
1627 {
1628         LIST_HEAD(single);
1629
1630         list_add(&dev->close_list, &single);
1631         __dev_close_many(&single);
1632         list_del(&single);
1633 }
1634
1635 void dev_close_many(struct list_head *head, bool unlink)
1636 {
1637         struct net_device *dev, *tmp;
1638
1639         /* Remove the devices that don't need to be closed */
1640         list_for_each_entry_safe(dev, tmp, head, close_list)
1641                 if (!(dev->flags & IFF_UP))
1642                         list_del_init(&dev->close_list);
1643
1644         __dev_close_many(head);
1645
1646         list_for_each_entry_safe(dev, tmp, head, close_list) {
1647                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1648                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1649                 if (unlink)
1650                         list_del_init(&dev->close_list);
1651         }
1652 }
1653 EXPORT_SYMBOL(dev_close_many);
1654
1655 /**
1656  *      dev_close - shutdown an interface.
1657  *      @dev: device to shutdown
1658  *
1659  *      This function moves an active device into down state. A
1660  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1661  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1662  *      chain.
1663  */
1664 void dev_close(struct net_device *dev)
1665 {
1666         if (dev->flags & IFF_UP) {
1667                 LIST_HEAD(single);
1668
1669                 list_add(&dev->close_list, &single);
1670                 dev_close_many(&single, true);
1671                 list_del(&single);
1672         }
1673 }
1674 EXPORT_SYMBOL(dev_close);
1675
1676
1677 /**
1678  *      dev_disable_lro - disable Large Receive Offload on a device
1679  *      @dev: device
1680  *
1681  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1682  *      called under RTNL.  This is needed if received packets may be
1683  *      forwarded to another interface.
1684  */
1685 void dev_disable_lro(struct net_device *dev)
1686 {
1687         struct net_device *lower_dev;
1688         struct list_head *iter;
1689
1690         dev->wanted_features &= ~NETIF_F_LRO;
1691         netdev_update_features(dev);
1692
1693         if (unlikely(dev->features & NETIF_F_LRO))
1694                 netdev_WARN(dev, "failed to disable LRO!\n");
1695
1696         netdev_for_each_lower_dev(dev, lower_dev, iter)
1697                 dev_disable_lro(lower_dev);
1698 }
1699 EXPORT_SYMBOL(dev_disable_lro);
1700
1701 /**
1702  *      dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1703  *      @dev: device
1704  *
1705  *      Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1706  *      called under RTNL.  This is needed if Generic XDP is installed on
1707  *      the device.
1708  */
1709 static void dev_disable_gro_hw(struct net_device *dev)
1710 {
1711         dev->wanted_features &= ~NETIF_F_GRO_HW;
1712         netdev_update_features(dev);
1713
1714         if (unlikely(dev->features & NETIF_F_GRO_HW))
1715                 netdev_WARN(dev, "failed to disable GRO_HW!\n");
1716 }
1717
1718 const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1719 {
1720 #define N(val)                                          \
1721         case NETDEV_##val:                              \
1722                 return "NETDEV_" __stringify(val);
1723         switch (cmd) {
1724         N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1725         N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1726         N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1727         N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1728         N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1729         N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1730         N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1731         N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1732         N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1733         N(PRE_CHANGEADDR)
1734         }
1735 #undef N
1736         return "UNKNOWN_NETDEV_EVENT";
1737 }
1738 EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1739
1740 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1741                                    struct net_device *dev)
1742 {
1743         struct netdev_notifier_info info = {
1744                 .dev = dev,
1745         };
1746
1747         return nb->notifier_call(nb, val, &info);
1748 }
1749
1750 static int call_netdevice_register_notifiers(struct notifier_block *nb,
1751                                              struct net_device *dev)
1752 {
1753         int err;
1754
1755         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1756         err = notifier_to_errno(err);
1757         if (err)
1758                 return err;
1759
1760         if (!(dev->flags & IFF_UP))
1761                 return 0;
1762
1763         call_netdevice_notifier(nb, NETDEV_UP, dev);
1764         return 0;
1765 }
1766
1767 static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1768                                                 struct net_device *dev)
1769 {
1770         if (dev->flags & IFF_UP) {
1771                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1772                                         dev);
1773                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1774         }
1775         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1776 }
1777
1778 static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1779                                                  struct net *net)
1780 {
1781         struct net_device *dev;
1782         int err;
1783
1784         for_each_netdev(net, dev) {
1785                 err = call_netdevice_register_notifiers(nb, dev);
1786                 if (err)
1787                         goto rollback;
1788         }
1789         return 0;
1790
1791 rollback:
1792         for_each_netdev_continue_reverse(net, dev)
1793                 call_netdevice_unregister_notifiers(nb, dev);
1794         return err;
1795 }
1796
1797 static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1798                                                     struct net *net)
1799 {
1800         struct net_device *dev;
1801
1802         for_each_netdev(net, dev)
1803                 call_netdevice_unregister_notifiers(nb, dev);
1804 }
1805
1806 static int dev_boot_phase = 1;
1807
1808 /**
1809  * register_netdevice_notifier - register a network notifier block
1810  * @nb: notifier
1811  *
1812  * Register a notifier to be called when network device events occur.
1813  * The notifier passed is linked into the kernel structures and must
1814  * not be reused until it has been unregistered. A negative errno code
1815  * is returned on a failure.
1816  *
1817  * When registered all registration and up events are replayed
1818  * to the new notifier to allow device to have a race free
1819  * view of the network device list.
1820  */
1821
1822 int register_netdevice_notifier(struct notifier_block *nb)
1823 {
1824         struct net *net;
1825         int err;
1826
1827         /* Close race with setup_net() and cleanup_net() */
1828         down_write(&pernet_ops_rwsem);
1829         rtnl_lock();
1830         err = raw_notifier_chain_register(&netdev_chain, nb);
1831         if (err)
1832                 goto unlock;
1833         if (dev_boot_phase)
1834                 goto unlock;
1835         for_each_net(net) {
1836                 err = call_netdevice_register_net_notifiers(nb, net);
1837                 if (err)
1838                         goto rollback;
1839         }
1840
1841 unlock:
1842         rtnl_unlock();
1843         up_write(&pernet_ops_rwsem);
1844         return err;
1845
1846 rollback:
1847         for_each_net_continue_reverse(net)
1848                 call_netdevice_unregister_net_notifiers(nb, net);
1849
1850         raw_notifier_chain_unregister(&netdev_chain, nb);
1851         goto unlock;
1852 }
1853 EXPORT_SYMBOL(register_netdevice_notifier);
1854
1855 /**
1856  * unregister_netdevice_notifier - unregister a network notifier block
1857  * @nb: notifier
1858  *
1859  * Unregister a notifier previously registered by
1860  * register_netdevice_notifier(). The notifier is unlinked into the
1861  * kernel structures and may then be reused. A negative errno code
1862  * is returned on a failure.
1863  *
1864  * After unregistering unregister and down device events are synthesized
1865  * for all devices on the device list to the removed notifier to remove
1866  * the need for special case cleanup code.
1867  */
1868
1869 int unregister_netdevice_notifier(struct notifier_block *nb)
1870 {
1871         struct net *net;
1872         int err;
1873
1874         /* Close race with setup_net() and cleanup_net() */
1875         down_write(&pernet_ops_rwsem);
1876         rtnl_lock();
1877         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1878         if (err)
1879                 goto unlock;
1880
1881         for_each_net(net)
1882                 call_netdevice_unregister_net_notifiers(nb, net);
1883
1884 unlock:
1885         rtnl_unlock();
1886         up_write(&pernet_ops_rwsem);
1887         return err;
1888 }
1889 EXPORT_SYMBOL(unregister_netdevice_notifier);
1890
1891 static int __register_netdevice_notifier_net(struct net *net,
1892                                              struct notifier_block *nb,
1893                                              bool ignore_call_fail)
1894 {
1895         int err;
1896
1897         err = raw_notifier_chain_register(&net->netdev_chain, nb);
1898         if (err)
1899                 return err;
1900         if (dev_boot_phase)
1901                 return 0;
1902
1903         err = call_netdevice_register_net_notifiers(nb, net);
1904         if (err && !ignore_call_fail)
1905                 goto chain_unregister;
1906
1907         return 0;
1908
1909 chain_unregister:
1910         raw_notifier_chain_unregister(&net->netdev_chain, nb);
1911         return err;
1912 }
1913
1914 static int __unregister_netdevice_notifier_net(struct net *net,
1915                                                struct notifier_block *nb)
1916 {
1917         int err;
1918
1919         err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1920         if (err)
1921                 return err;
1922
1923         call_netdevice_unregister_net_notifiers(nb, net);
1924         return 0;
1925 }
1926
1927 /**
1928  * register_netdevice_notifier_net - register a per-netns network notifier block
1929  * @net: network namespace
1930  * @nb: notifier
1931  *
1932  * Register a notifier to be called when network device events occur.
1933  * The notifier passed is linked into the kernel structures and must
1934  * not be reused until it has been unregistered. A negative errno code
1935  * is returned on a failure.
1936  *
1937  * When registered all registration and up events are replayed
1938  * to the new notifier to allow device to have a race free
1939  * view of the network device list.
1940  */
1941
1942 int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1943 {
1944         int err;
1945
1946         rtnl_lock();
1947         err = __register_netdevice_notifier_net(net, nb, false);
1948         rtnl_unlock();
1949         return err;
1950 }
1951 EXPORT_SYMBOL(register_netdevice_notifier_net);
1952
/**
 * unregister_netdevice_notifier_net - unregister a per-netns
 *                                     network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier_net(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering unregister and down device events are synthesized
 * for all devices of @net to the removed notifier to remove
 * the need for special case cleanup code.  The RTNL lock is taken for
 * the duration of the call.
 */

int unregister_netdevice_notifier_net(struct net *net,
				      struct notifier_block *nb)
{
	int ret;

	rtnl_lock();
	ret = __unregister_netdevice_notifier_net(net, nb);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1980
1981 int register_netdevice_notifier_dev_net(struct net_device *dev,
1982                                         struct notifier_block *nb,
1983                                         struct netdev_net_notifier *nn)
1984 {
1985         int err;
1986
1987         rtnl_lock();
1988         err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1989         if (!err) {
1990                 nn->nb = nb;
1991                 list_add(&nn->list, &dev->net_notifier_list);
1992         }
1993         rtnl_unlock();
1994         return err;
1995 }
1996 EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1997
1998 int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1999                                           struct notifier_block *nb,
2000                                           struct netdev_net_notifier *nn)
2001 {
2002         int err;
2003
2004         rtnl_lock();
2005         list_del(&nn->list);
2006         err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
2007         rtnl_unlock();
2008         return err;
2009 }
2010 EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
2011
2012 static void move_netdevice_notifiers_dev_net(struct net_device *dev,
2013                                              struct net *net)
2014 {
2015         struct netdev_net_notifier *nn;
2016
2017         list_for_each_entry(nn, &dev->net_notifier_list, list) {
2018                 __unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
2019                 __register_netdevice_notifier_net(net, nn->nb, true);
2020         }
2021 }
2022
2023 /**
2024  *      call_netdevice_notifiers_info - call all network notifier blocks
2025  *      @val: value passed unmodified to notifier function
2026  *      @info: notifier information data
2027  *
2028  *      Call all network notifier blocks.  Parameters and return value
2029  *      are as for raw_notifier_call_chain().
2030  */
2031
2032 static int call_netdevice_notifiers_info(unsigned long val,
2033                                          struct netdev_notifier_info *info)
2034 {
2035         struct net *net = dev_net(info->dev);
2036         int ret;
2037
2038         ASSERT_RTNL();
2039
2040         /* Run per-netns notifier block chain first, then run the global one.
2041          * Hopefully, one day, the global one is going to be removed after
2042          * all notifier block registrators get converted to be per-netns.
2043          */
2044         ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
2045         if (ret & NOTIFY_STOP_MASK)
2046                 return ret;
2047         return raw_notifier_call_chain(&netdev_chain, val, info);
2048 }
2049
2050 static int call_netdevice_notifiers_extack(unsigned long val,
2051                                            struct net_device *dev,
2052                                            struct netlink_ext_ack *extack)
2053 {
2054         struct netdev_notifier_info info = {
2055                 .dev = dev,
2056                 .extack = extack,
2057         };
2058
2059         return call_netdevice_notifiers_info(val, &info);
2060 }
2061
2062 /**
2063  *      call_netdevice_notifiers - call all network notifier blocks
2064  *      @val: value passed unmodified to notifier function
2065  *      @dev: net_device pointer passed unmodified to notifier function
2066  *
2067  *      Call all network notifier blocks.  Parameters and return value
2068  *      are as for raw_notifier_call_chain().
2069  */
2070
2071 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2072 {
2073         return call_netdevice_notifiers_extack(val, dev, NULL);
2074 }
2075 EXPORT_SYMBOL(call_netdevice_notifiers);
2076
2077 /**
2078  *      call_netdevice_notifiers_mtu - call all network notifier blocks
2079  *      @val: value passed unmodified to notifier function
2080  *      @dev: net_device pointer passed unmodified to notifier function
2081  *      @arg: additional u32 argument passed to the notifier function
2082  *
2083  *      Call all network notifier blocks.  Parameters and return value
2084  *      are as for raw_notifier_call_chain().
2085  */
2086 static int call_netdevice_notifiers_mtu(unsigned long val,
2087                                         struct net_device *dev, u32 arg)
2088 {
2089         struct netdev_notifier_info_ext info = {
2090                 .info.dev = dev,
2091                 .ext.mtu = arg,
2092         };
2093
2094         BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2095
2096         return call_netdevice_notifiers_info(val, &info.info);
2097 }
2098
#ifdef CONFIG_NET_INGRESS
/* Static key for ingress handling; initially false. */
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);

/* Increment ingress_needed_key. */
void net_inc_ingress_queue(void)
{
	static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Decrement ingress_needed_key. */
void net_dec_ingress_queue(void)
{
	static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif /* CONFIG_NET_INGRESS */
2114
#ifdef CONFIG_NET_EGRESS
/* Static key for egress handling; initially false. */
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);

/* Increment egress_needed_key. */
void net_inc_egress_queue(void)
{
	static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Decrement egress_needed_key. */
void net_dec_egress_queue(void)
{
	static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif /* CONFIG_NET_EGRESS */
2130
/* Static key gating packet timestamping; initially false. */
static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
/* Enable/disable requests not yet folded into netstamp_wanted. */
static atomic_t netstamp_needed_deferred;
/* Net number of timestamp users; drives netstamp_needed_key. */
static atomic_t netstamp_wanted;

/*
 * Work handler: atomically collect the deferred delta, add it to
 * netstamp_wanted and switch netstamp_needed_key on when the resulting
 * count is positive, off otherwise.
 */
static void netstamp_clear(struct work_struct *work)
{
	int pending = atomic_xchg(&netstamp_needed_deferred, 0);

	if (atomic_add_return(pending, &netstamp_wanted) > 0)
		static_branch_enable(&netstamp_needed_key);
	else
		static_branch_disable(&netstamp_needed_key);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
2148
2149 void net_enable_timestamp(void)
2150 {
2151 #ifdef CONFIG_JUMP_LABEL
2152         int wanted;
2153
2154         while (1) {
2155                 wanted = atomic_read(&netstamp_wanted);
2156                 if (wanted <= 0)
2157                         break;
2158                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
2159                         return;
2160         }
2161         atomic_inc(&netstamp_needed_deferred);
2162         schedule_work(&netstamp_work);
2163 #else
2164         static_branch_inc(&netstamp_needed_key);
2165 #endif
2166 }
2167 EXPORT_SYMBOL(net_enable_timestamp);
2168
2169 void net_disable_timestamp(void)
2170 {
2171 #ifdef CONFIG_JUMP_LABEL
2172         int wanted;
2173
2174         while (1) {
2175                 wanted = atomic_read(&netstamp_wanted);
2176                 if (wanted <= 1)
2177                         break;
2178                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
2179                         return;
2180         }
2181         atomic_dec(&netstamp_needed_deferred);
2182         schedule_work(&netstamp_work);
2183 #else
2184         static_branch_dec(&netstamp_needed_key);
2185 #endif
2186 }
2187 EXPORT_SYMBOL(net_disable_timestamp);
2188
2189 static inline void net_timestamp_set(struct sk_buff *skb)
2190 {
2191         skb->tstamp = 0;
2192         if (static_branch_unlikely(&netstamp_needed_key))
2193                 __net_timestamp(skb);
2194 }
2195
/*
 * net_timestamp_check - stamp SKB if timestamping is on and it has no stamp
 * @COND: extra condition that must hold for the stamp to be taken
 * @SKB: buffer to stamp
 *
 * When netstamp_needed_key is enabled, COND is true and SKB->tstamp is
 * still zero, __net_timestamp(SKB) is called.  The body is wrapped in
 * do { } while (0) so the macro expands to exactly one statement and can
 * be used safely in an unbraced if/else; callers terminate it with ';'.
 */
#define net_timestamp_check(COND, SKB)					\
	do {								\
		if (static_branch_unlikely(&netstamp_needed_key)) {	\
			if ((COND) && !(SKB)->tstamp)			\
				__net_timestamp(SKB);			\
		}							\
	} while (0)
2201
2202 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2203 {
2204         unsigned int len;
2205
2206         if (!(dev->flags & IFF_UP))
2207                 return false;
2208
2209         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
2210         if (skb->len <= len)
2211                 return true;
2212
2213         /* if TSO is enabled, we don't care about the length as the packet
2214          * could be forwarded without being segmented before
2215          */
2216         if (skb_is_gso(skb))
2217                 return true;
2218
2219         return false;
2220 }
2221 EXPORT_SYMBOL_GPL(is_skb_forwardable);
2222
2223 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2224 {
2225         int ret = ____dev_forward_skb(dev, skb);
2226
2227         if (likely(!ret)) {
2228                 skb->protocol = eth_type_trans(skb, dev);
2229                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2230         }
2231
2232         return ret;
2233 }
2234 EXPORT_SYMBOL_GPL(__dev_forward_skb);
2235
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 *
 * A non-zero result of __dev_forward_skb() is returned as is;
 * otherwise the skb is passed on to netif_rx_internal().
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
2259
2260 static inline int deliver_skb(struct sk_buff *skb,
2261                               struct packet_type *pt_prev,
2262                               struct net_device *orig_dev)
2263 {
2264         if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2265                 return -ENOMEM;
2266         refcount_inc(&skb->users);
2267         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2268 }
2269
2270 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2271                                           struct packet_type **pt,
2272                                           struct net_device *orig_dev,
2273                                           __be16 type,
2274                                           struct list_head *ptype_list)
2275 {
2276         struct packet_type *ptype, *pt_prev = *pt;
2277
2278         list_for_each_entry_rcu(ptype, ptype_list, list) {
2279                 if (ptype->type != type)
2280                         continue;
2281                 if (pt_prev)
2282                         deliver_skb(skb, pt_prev, orig_dev);
2283                 pt_prev = ptype;
2284         }
2285         *pt = pt_prev;
2286 }
2287
2288 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2289 {
2290         if (!ptype->af_packet_priv || !skb->sk)
2291                 return false;
2292
2293         if (ptype->id_match)
2294                 return ptype->id_match(ptype, skb->sk);
2295         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2296                 return true;
2297
2298         return false;
2299 }
2300
2301 /**
2302  * dev_nit_active - return true if any network interface taps are in use
2303  *
2304  * @dev: network device to check for the presence of taps
2305  */
2306 bool dev_nit_active(struct net_device *dev)
2307 {
2308         return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2309 }
2310 EXPORT_SYMBOL_GPL(dev_nit_active);
2311
2312 /*
2313  *      Support routine. Sends outgoing frames to any network
2314  *      taps currently in use.
2315  */
2316
2317 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2318 {
2319         struct packet_type *ptype;
2320         struct sk_buff *skb2 = NULL;
2321         struct packet_type *pt_prev = NULL;
2322         struct list_head *ptype_list = &ptype_all;
2323
2324         rcu_read_lock();
2325 again:
2326         list_for_each_entry_rcu(ptype, ptype_list, list) {
2327                 if (READ_ONCE(ptype->ignore_outgoing))
2328                         continue;
2329
2330                 /* Never send packets back to the socket
2331                  * they originated from - MvS (miquels@drinkel.ow.org)
2332                  */
2333                 if (skb_loop_sk(ptype, skb))
2334                         continue;
2335
2336                 if (pt_prev) {
2337                         deliver_skb(skb2, pt_prev, skb->dev);
2338                         pt_prev = ptype;
2339                         continue;
2340                 }
2341
2342                 /* need to clone skb, done only once */
2343                 skb2 = skb_clone(skb, GFP_ATOMIC);
2344                 if (!skb2)
2345                         goto out_unlock;
2346
2347                 net_timestamp_set(skb2);
2348
2349                 /* skb->nh should be correctly
2350                  * set by sender, so that the second statement is
2351                  * just protection against buggy protocols.
2352                  */
2353                 skb_reset_mac_header(skb2);
2354
2355                 if (skb_network_header(skb2) < skb2->data ||
2356                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2357                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2358                                              ntohs(skb2->protocol),
2359                                              dev->name);
2360                         skb_reset_network_header(skb2);
2361                 }
2362
2363                 skb2->transport_header = skb2->network_header;
2364                 skb2->pkt_type = PACKET_OUTGOING;
2365                 pt_prev = ptype;
2366         }
2367
2368         if (ptype_list == &ptype_all) {
2369                 ptype_list = &dev->ptype_all;
2370                 goto again;
2371         }
2372 out_unlock:
2373         if (pt_prev) {
2374                 if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2375                         pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2376                 else
2377                         kfree_skb(skb2);
2378         }
2379         rcu_read_unlock();
2380 }
2381 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2382
2383 /**
2384  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2385  * @dev: Network device
2386  * @txq: number of queues available
2387  *
2388  * If real_num_tx_queues is changed the tc mappings may no longer be
2389  * valid. To resolve this verify the tc mapping remains valid and if
2390  * not NULL the mapping. With no priorities mapping to this
2391  * offset/count pair it will no longer be used. In the worst case TC0
2392  * is invalid nothing can be done so disable priority mappings. If is
2393  * expected that drivers will fix this mapping if they can before
2394  * calling netif_set_real_num_tx_queues.
2395  */
2396 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2397 {
2398         int i;
2399         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2400
2401         /* If TC0 is invalidated disable TC mapping */
2402         if (tc->offset + tc->count > txq) {
2403                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2404                 dev->num_tc = 0;
2405                 return;
2406         }
2407
2408         /* Invalidated prio to tc mappings set to TC0 */
2409         for (i = 1; i < TC_BITMASK + 1; i++) {
2410                 int q = netdev_get_prio_tc_map(dev, i);
2411
2412                 tc = &dev->tc_to_txq[q];
2413                 if (tc->offset + tc->count > txq) {
2414                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2415                                 i, q);
2416                         netdev_set_prio_tc_map(dev, i, 0);
2417                 }
2418         }
2419 }
2420
2421 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2422 {
2423         if (dev->num_tc) {
2424                 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2425                 int i;
2426
2427                 /* walk through the TCs and see if it falls into any of them */
2428                 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2429                         if ((txq - tc->offset) < tc->count)
2430                                 return i;
2431                 }
2432
2433                 /* didn't find it, just return -1 to indicate no match */
2434                 return -1;
2435         }
2436
2437         return 0;
2438 }
2439 EXPORT_SYMBOL(netdev_txq_to_tc);
2440
2441 #ifdef CONFIG_XPS
2442 struct static_key xps_needed __read_mostly;
2443 EXPORT_SYMBOL(xps_needed);
2444 struct static_key xps_rxqs_needed __read_mostly;
2445 EXPORT_SYMBOL(xps_rxqs_needed);
2446 static DEFINE_MUTEX(xps_map_mutex);
2447 #define xmap_dereference(P)             \
2448         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2449
2450 static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2451                              int tci, u16 index)
2452 {
2453         struct xps_map *map = NULL;
2454         int pos;
2455
2456         if (dev_maps)
2457                 map = xmap_dereference(dev_maps->attr_map[tci]);
2458         if (!map)
2459                 return false;
2460
2461         for (pos = map->len; pos--;) {
2462                 if (map->queues[pos] != index)
2463                         continue;
2464
2465                 if (map->len > 1) {
2466                         map->queues[pos] = map->queues[--map->len];
2467                         break;
2468                 }
2469
2470                 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2471                 kfree_rcu(map, rcu);
2472                 return false;
2473         }
2474
2475         return true;
2476 }
2477
2478 static bool remove_xps_queue_cpu(struct net_device *dev,
2479                                  struct xps_dev_maps *dev_maps,
2480                                  int cpu, u16 offset, u16 count)
2481 {
2482         int num_tc = dev->num_tc ? : 1;
2483         bool active = false;
2484         int tci;
2485
2486         for (tci = cpu * num_tc; num_tc--; tci++) {
2487                 int i, j;
2488
2489                 for (i = count, j = offset; i--; j++) {
2490                         if (!remove_xps_queue(dev_maps, tci, j))
2491                                 break;
2492                 }
2493
2494                 active |= i < 0;
2495         }
2496
2497         return active;
2498 }
2499
2500 static void reset_xps_maps(struct net_device *dev,
2501                            struct xps_dev_maps *dev_maps,
2502                            bool is_rxqs_map)
2503 {
2504         if (is_rxqs_map) {
2505                 static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2506                 RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2507         } else {
2508                 RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2509         }
2510         static_key_slow_dec_cpuslocked(&xps_needed);
2511         kfree_rcu(dev_maps, rcu);
2512 }
2513
2514 static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2515                            struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2516                            u16 offset, u16 count, bool is_rxqs_map)
2517 {
2518         bool active = false;
2519         int i, j;
2520
2521         for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2522              j < nr_ids;)
2523                 active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2524                                                count);
2525         if (!active)
2526                 reset_xps_maps(dev, dev_maps, is_rxqs_map);
2527
2528         if (!is_rxqs_map) {
2529                 for (i = offset + (count - 1); count--; i--) {
2530                         netdev_queue_numa_node_write(
2531                                 netdev_get_tx_queue(dev, i),
2532                                 NUMA_NO_NODE);
2533                 }
2534         }
2535 }
2536
2537 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2538                                    u16 count)
2539 {
2540         const unsigned long *possible_mask = NULL;
2541         struct xps_dev_maps *dev_maps;
2542         unsigned int nr_ids;
2543
2544         if (!static_key_false(&xps_needed))
2545                 return;
2546
2547         cpus_read_lock();
2548         mutex_lock(&xps_map_mutex);
2549
2550         if (static_key_false(&xps_rxqs_needed)) {
2551                 dev_maps = xmap_dereference(dev->xps_rxqs_map);
2552                 if (dev_maps) {
2553                         nr_ids = dev->num_rx_queues;
2554                         clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2555                                        offset, count, true);
2556                 }
2557         }
2558
2559         dev_maps = xmap_dereference(dev->xps_cpus_map);
2560         if (!dev_maps)
2561                 goto out_no_maps;
2562
2563         if (num_possible_cpus() > 1)
2564                 possible_mask = cpumask_bits(cpu_possible_mask);
2565         nr_ids = nr_cpu_ids;
2566         clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2567                        false);
2568
2569 out_no_maps:
2570         mutex_unlock(&xps_map_mutex);
2571         cpus_read_unlock();
2572 }
2573
2574 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2575 {
2576         netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2577 }
2578
2579 static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2580                                       u16 index, bool is_rxqs_map)
2581 {
2582         struct xps_map *new_map;
2583         int alloc_len = XPS_MIN_MAP_ALLOC;
2584         int i, pos;
2585
2586         for (pos = 0; map && pos < map->len; pos++) {
2587                 if (map->queues[pos] != index)
2588                         continue;
2589                 return map;
2590         }
2591
2592         /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2593         if (map) {
2594                 if (pos < map->alloc_len)
2595                         return map;
2596
2597                 alloc_len = map->alloc_len * 2;
2598         }
2599
2600         /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2601          *  map
2602          */
2603         if (is_rxqs_map)
2604                 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2605         else
2606                 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2607                                        cpu_to_node(attr_index));
2608         if (!new_map)
2609                 return NULL;
2610
2611         for (i = 0; i < pos; i++)
2612                 new_map->queues[i] = map->queues[i];
2613         new_map->alloc_len = alloc_len;
2614         new_map->len = pos;
2615
2616         return new_map;
2617 }
2618
2619 /* Must be called under cpus_read_lock */
2620 int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2621                           u16 index, bool is_rxqs_map)
2622 {
2623         const unsigned long *online_mask = NULL, *possible_mask = NULL;
2624         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2625         int i, j, tci, numa_node_id = -2;
2626         int maps_sz, num_tc = 1, tc = 0;
2627         struct xps_map *map, *new_map;
2628         bool active = false;
2629         unsigned int nr_ids;
2630
2631         WARN_ON_ONCE(index >= dev->num_tx_queues);
2632
2633         if (dev->num_tc) {
2634                 /* Do not allow XPS on subordinate device directly */
2635                 num_tc = dev->num_tc;
2636                 if (num_tc < 0)
2637                         return -EINVAL;
2638
2639                 /* If queue belongs to subordinate dev use its map */
2640                 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2641
2642                 tc = netdev_txq_to_tc(dev, index);
2643                 if (tc < 0)
2644                         return -EINVAL;
2645         }
2646
2647         mutex_lock(&xps_map_mutex);
2648         if (is_rxqs_map) {
2649                 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2650                 dev_maps = xmap_dereference(dev->xps_rxqs_map);
2651                 nr_ids = dev->num_rx_queues;
2652         } else {
2653                 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2654                 if (num_possible_cpus() > 1) {
2655                         online_mask = cpumask_bits(cpu_online_mask);
2656                         possible_mask = cpumask_bits(cpu_possible_mask);
2657                 }
2658                 dev_maps = xmap_dereference(dev->xps_cpus_map);
2659                 nr_ids = nr_cpu_ids;
2660         }
2661
2662         if (maps_sz < L1_CACHE_BYTES)
2663                 maps_sz = L1_CACHE_BYTES;
2664
2665         /* allocate memory for queue storage */
2666         for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2667              j < nr_ids;) {
2668                 if (!new_dev_maps)
2669                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2670                 if (!new_dev_maps) {
2671                         mutex_unlock(&xps_map_mutex);
2672                         return -ENOMEM;
2673                 }
2674
2675                 tci = j * num_tc + tc;
2676                 map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2677                                  NULL;
2678
2679                 map = expand_xps_map(map, j, index, is_rxqs_map);
2680                 if (!map)
2681                         goto error;
2682
2683                 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2684         }
2685
2686         if (!new_dev_maps)
2687                 goto out_no_new_maps;
2688
2689         if (!dev_maps) {
2690                 /* Increment static keys at most once per type */
2691                 static_key_slow_inc_cpuslocked(&xps_needed);
2692                 if (is_rxqs_map)
2693                         static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2694         }
2695
2696         for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2697              j < nr_ids;) {
2698                 /* copy maps belonging to foreign traffic classes */
2699                 for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2700                         /* fill in the new device map from the old device map */
2701                         map = xmap_dereference(dev_maps->attr_map[tci]);
2702                         RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2703                 }
2704
2705                 /* We need to explicitly update tci as prevous loop
2706                  * could break out early if dev_maps is NULL.
2707                  */
2708                 tci = j * num_tc + tc;
2709
2710                 if (netif_attr_test_mask(j, mask, nr_ids) &&
2711                     netif_attr_test_online(j, online_mask, nr_ids)) {
2712                         /* add tx-queue to CPU/rx-queue maps */
2713                         int pos = 0;
2714
2715                         map = xmap_dereference(new_dev_maps->attr_map[tci]);
2716                         while ((pos < map->len) && (map->queues[pos] != index))
2717                                 pos++;
2718
2719                         if (pos == map->len)
2720                                 map->queues[map->len++] = index;
2721 #ifdef CONFIG_NUMA
2722                         if (!is_rxqs_map) {
2723                                 if (numa_node_id == -2)
2724                                         numa_node_id = cpu_to_node(j);
2725                                 else if (numa_node_id != cpu_to_node(j))
2726                                         numa_node_id = -1;
2727                         }
2728 #endif
2729                 } else if (dev_maps) {
2730                         /* fill in the new device map from the old device map */
2731                         map = xmap_dereference(dev_maps->attr_map[tci]);
2732                         RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2733                 }
2734
2735                 /* copy maps belonging to foreign traffic classes */
2736                 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2737                         /* fill in the new device map from the old device map */
2738                         map = xmap_dereference(dev_maps->attr_map[tci]);
2739                         RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2740                 }
2741         }
2742
2743         if (is_rxqs_map)
2744                 rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2745         else
2746                 rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2747
2748         /* Cleanup old maps */
2749         if (!dev_maps)
2750                 goto out_no_old_maps;
2751
2752         for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2753              j < nr_ids;) {
2754                 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2755                         new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2756                         map = xmap_dereference(dev_maps->attr_map[tci]);
2757                         if (map && map != new_map)
2758                                 kfree_rcu(map, rcu);
2759                 }
2760         }
2761
2762         kfree_rcu(dev_maps, rcu);
2763
2764 out_no_old_maps:
2765         dev_maps = new_dev_maps;
2766         active = true;
2767
2768 out_no_new_maps:
2769         if (!is_rxqs_map) {
2770                 /* update Tx queue numa node */
2771                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2772                                              (numa_node_id >= 0) ?
2773                                              numa_node_id : NUMA_NO_NODE);
2774         }
2775
2776         if (!dev_maps)
2777                 goto out_no_maps;
2778
2779         /* removes tx-queue from unused CPUs/rx-queues */
2780         for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2781              j < nr_ids;) {
2782                 for (i = tc, tci = j * num_tc; i--; tci++)
2783                         active |= remove_xps_queue(dev_maps, tci, index);
2784                 if (!netif_attr_test_mask(j, mask, nr_ids) ||
2785                     !netif_attr_test_online(j, online_mask, nr_ids))
2786                         active |= remove_xps_queue(dev_maps, tci, index);
2787                 for (i = num_tc - tc, tci++; --i; tci++)
2788                         active |= remove_xps_queue(dev_maps, tci, index);
2789         }
2790
2791         /* free map if not active */
2792         if (!active)
2793                 reset_xps_maps(dev, dev_maps, is_rxqs_map);
2794
2795 out_no_maps:
2796         mutex_unlock(&xps_map_mutex);
2797
2798         return 0;
2799 error:
2800         /* remove any maps that we added */
2801         for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2802              j < nr_ids;) {
2803                 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2804                         new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2805                         map = dev_maps ?
2806                               xmap_dereference(dev_maps->attr_map[tci]) :
2807                               NULL;
2808                         if (new_map && new_map != map)
2809                                 kfree(new_map);
2810                 }
2811         }
2812
2813         mutex_unlock(&xps_map_mutex);
2814
2815         kfree(new_dev_maps);
2816         return -ENOMEM;
2817 }
2818 EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2819
2820 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2821                         u16 index)
2822 {
2823         int ret;
2824
2825         cpus_read_lock();
2826         ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2827         cpus_read_unlock();
2828
2829         return ret;
2830 }
2831 EXPORT_SYMBOL(netif_set_xps_queue);
2832
2833 #endif
2834 static void netdev_unbind_all_sb_channels(struct net_device *dev)
2835 {
2836         struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2837
2838         /* Unbind any subordinate channels */
2839         while (txq-- != &dev->_tx[0]) {
2840                 if (txq->sb_dev)
2841                         netdev_unbind_sb_channel(dev, txq->sb_dev);
2842         }
2843 }
2844
2845 void netdev_reset_tc(struct net_device *dev)
2846 {
2847 #ifdef CONFIG_XPS
2848         netif_reset_xps_queues_gt(dev, 0);
2849 #endif
2850         netdev_unbind_all_sb_channels(dev);
2851
2852         /* Reset TC configuration of device */
2853         dev->num_tc = 0;
2854         memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2855         memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2856 }
2857 EXPORT_SYMBOL(netdev_reset_tc);
2858
2859 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2860 {
2861         if (tc >= dev->num_tc)
2862                 return -EINVAL;
2863
2864 #ifdef CONFIG_XPS
2865         netif_reset_xps_queues(dev, offset, count);
2866 #endif
2867         dev->tc_to_txq[tc].count = count;
2868         dev->tc_to_txq[tc].offset = offset;
2869         return 0;
2870 }
2871 EXPORT_SYMBOL(netdev_set_tc_queue);
2872
2873 int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2874 {
2875         if (num_tc > TC_MAX_QUEUE)
2876                 return -EINVAL;
2877
2878 #ifdef CONFIG_XPS
2879         netif_reset_xps_queues_gt(dev, 0);
2880 #endif
2881         netdev_unbind_all_sb_channels(dev);
2882
2883         dev->num_tc = num_tc;
2884         return 0;
2885 }
2886 EXPORT_SYMBOL(netdev_set_num_tc);
2887
2888 void netdev_unbind_sb_channel(struct net_device *dev,
2889                               struct net_device *sb_dev)
2890 {
2891         struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2892
2893 #ifdef CONFIG_XPS
2894         netif_reset_xps_queues_gt(sb_dev, 0);
2895 #endif
2896         memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2897         memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2898
2899         while (txq-- != &dev->_tx[0]) {
2900                 if (txq->sb_dev == sb_dev)
2901                         txq->sb_dev = NULL;
2902         }
2903 }
2904 EXPORT_SYMBOL(netdev_unbind_sb_channel);
2905
2906 int netdev_bind_sb_channel_queue(struct net_device *dev,
2907                                  struct net_device *sb_dev,
2908                                  u8 tc, u16 count, u16 offset)
2909 {
2910         /* Make certain the sb_dev and dev are already configured */
2911         if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2912                 return -EINVAL;
2913
2914         /* We cannot hand out queues we don't have */
2915         if ((offset + count) > dev->real_num_tx_queues)
2916                 return -EINVAL;
2917
2918         /* Record the mapping */
2919         sb_dev->tc_to_txq[tc].count = count;
2920         sb_dev->tc_to_txq[tc].offset = offset;
2921
2922         /* Provide a way for Tx queue to find the tc_to_txq map or
2923          * XPS map for itself.
2924          */
2925         while (count--)
2926                 netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2927
2928         return 0;
2929 }
2930 EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2931
2932 int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2933 {
2934         /* Do not use a multiqueue device to represent a subordinate channel */
2935         if (netif_is_multiqueue(dev))
2936                 return -ENODEV;
2937
2938         /* We allow channels 1 - 32767 to be used for subordinate channels.
2939          * Channel 0 is meant to be "native" mode and used only to represent
2940          * the main root device. We allow writing 0 to reset the device back
2941          * to normal mode after being used as a subordinate channel.
2942          */
2943         if (channel > S16_MAX)
2944                 return -EINVAL;
2945
2946         dev->num_tc = -channel;
2947
2948         return 0;
2949 }
2950 EXPORT_SYMBOL(netdev_set_sb_channel);
2951
2952 /*
2953  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2954  * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
2955  */
2956 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2957 {
2958         bool disabling;
2959         int rc;
2960
2961         disabling = txq < dev->real_num_tx_queues;
2962
2963         if (txq < 1 || txq > dev->num_tx_queues)
2964                 return -EINVAL;
2965
2966         if (dev->reg_state == NETREG_REGISTERED ||
2967             dev->reg_state == NETREG_UNREGISTERING) {
2968                 ASSERT_RTNL();
2969
2970                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2971                                                   txq);
2972                 if (rc)
2973                         return rc;
2974
2975                 if (dev->num_tc)
2976                         netif_setup_tc(dev, txq);
2977
2978                 dev_qdisc_change_real_num_tx(dev, txq);
2979
2980                 dev->real_num_tx_queues = txq;
2981
2982                 if (disabling) {
2983                         synchronize_net();
2984                         qdisc_reset_all_tx_gt(dev, txq);
2985 #ifdef CONFIG_XPS
2986                         netif_reset_xps_queues_gt(dev, txq);
2987 #endif
2988                 }
2989         } else {
2990                 dev->real_num_tx_queues = txq;
2991         }
2992
2993         return 0;
2994 }
2995 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2996
2997 #ifdef CONFIG_SYSFS
2998 /**
2999  *      netif_set_real_num_rx_queues - set actual number of RX queues used
3000  *      @dev: Network device
3001  *      @rxq: Actual number of RX queues
3002  *
3003  *      This must be called either with the rtnl_lock held or before
3004  *      registration of the net device.  Returns 0 on success, or a
3005  *      negative error code.  If called before registration, it always
3006  *      succeeds.
3007  */
3008 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
3009 {
3010         int rc;
3011
3012         if (rxq < 1 || rxq > dev->num_rx_queues)
3013                 return -EINVAL;
3014
3015         if (dev->reg_state == NETREG_REGISTERED) {
3016                 ASSERT_RTNL();
3017
3018                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
3019                                                   rxq);
3020                 if (rc)
3021                         return rc;
3022         }
3023
3024         dev->real_num_rx_queues = rxq;
3025         return 0;
3026 }
3027 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
3028 #endif
3029
3030 /**
3031  * netif_get_num_default_rss_queues - default number of RSS queues
3032  *
3033  * This routine should set an upper limit on the number of RSS queues
3034  * used by default by multiqueue devices.
3035  */
3036 int netif_get_num_default_rss_queues(void)
3037 {
3038         return is_kdump_kernel() ?
3039                 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
3040 }
3041 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
3042
3043 static void __netif_reschedule(struct Qdisc *q)
3044 {
3045         struct softnet_data *sd;
3046         unsigned long flags;
3047
3048         local_irq_save(flags);
3049         sd = this_cpu_ptr(&softnet_data);
3050         q->next_sched = NULL;
3051         *sd->output_queue_tailp = q;
3052         sd->output_queue_tailp = &q->next_sched;
3053         raise_softirq_irqoff(NET_TX_SOFTIRQ);
3054         local_irq_restore(flags);
3055 }
3056
3057 void __netif_schedule(struct Qdisc *q)
3058 {
3059         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3060                 __netif_reschedule(q);
3061 }
3062 EXPORT_SYMBOL(__netif_schedule);
3063
3064 struct dev_kfree_skb_cb {
3065         enum skb_free_reason reason;
3066 };
3067
3068 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3069 {
3070         return (struct dev_kfree_skb_cb *)skb->cb;
3071 }
3072
3073 void netif_schedule_queue(struct netdev_queue *txq)
3074 {
3075         rcu_read_lock();
3076         if (!netif_xmit_stopped(txq)) {
3077                 struct Qdisc *q = rcu_dereference(txq->qdisc);
3078
3079                 __netif_schedule(q);
3080         }
3081         rcu_read_unlock();
3082 }
3083 EXPORT_SYMBOL(netif_schedule_queue);
3084
3085 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3086 {
3087         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3088                 struct Qdisc *q;
3089
3090                 rcu_read_lock();
3091                 q = rcu_dereference(dev_queue->qdisc);
3092                 __netif_schedule(q);
3093                 rcu_read_unlock();
3094         }
3095 }
3096 EXPORT_SYMBOL(netif_tx_wake_queue);
3097
3098 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
3099 {
3100         unsigned long flags;
3101
3102         if (unlikely(!skb))
3103                 return;
3104
3105         if (likely(refcount_read(&skb->users) == 1)) {
3106                 smp_rmb();
3107                 refcount_set(&skb->users, 0);
3108         } else if (likely(!refcount_dec_and_test(&skb->users))) {
3109                 return;
3110         }
3111         get_kfree_skb_cb(skb)->reason = reason;
3112         local_irq_save(flags);
3113         skb->next = __this_cpu_read(softnet_data.completion_queue);
3114         __this_cpu_write(softnet_data.completion_queue, skb);
3115         raise_softirq_irqoff(NET_TX_SOFTIRQ);
3116         local_irq_restore(flags);
3117 }
3118 EXPORT_SYMBOL(__dev_kfree_skb_irq);
3119
3120 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
3121 {
3122         if (in_irq() || irqs_disabled())
3123                 __dev_kfree_skb_irq(skb, reason);
3124         else if (unlikely(reason == SKB_REASON_DROPPED))
3125                 kfree_skb(skb);
3126         else
3127                 consume_skb(skb);
3128 }
3129 EXPORT_SYMBOL(__dev_kfree_skb_any);
3130
3131
3132 /**
3133  * netif_device_detach - mark device as removed
3134  * @dev: network device
3135  *
3136  * Mark device as removed from system and therefore no longer available.
3137  */
3138 void netif_device_detach(struct net_device *dev)
3139 {
3140         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3141             netif_running(dev)) {
3142                 netif_tx_stop_all_queues(dev);
3143         }
3144 }
3145 EXPORT_SYMBOL(netif_device_detach);
3146
3147 /**
3148  * netif_device_attach - mark device as attached
3149  * @dev: network device
3150  *
3151  * Mark device as attached from system and restart if needed.
3152  */
3153 void netif_device_attach(struct net_device *dev)
3154 {
3155         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3156             netif_running(dev)) {
3157                 netif_tx_wake_all_queues(dev);
3158                 __netdev_watchdog_up(dev);
3159         }
3160 }
3161 EXPORT_SYMBOL(netif_device_attach);
3162
3163 /*
3164  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
3165  * to be used as a distribution range.
3166  */
3167 static u16 skb_tx_hash(const struct net_device *dev,
3168                        const struct net_device *sb_dev,
3169                        struct sk_buff *skb)
3170 {
3171         u32 hash;
3172         u16 qoffset = 0;
3173         u16 qcount = dev->real_num_tx_queues;
3174
3175         if (dev->num_tc) {
3176                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3177
3178                 qoffset = sb_dev->tc_to_txq[tc].offset;
3179                 qcount = sb_dev->tc_to_txq[tc].count;
3180                 if (unlikely(!qcount)) {
3181                         net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3182                                              sb_dev->name, qoffset, tc);
3183                         qoffset = 0;
3184                         qcount = dev->real_num_tx_queues;
3185                 }
3186         }
3187
3188         if (skb_rx_queue_recorded(skb)) {
3189                 hash = skb_get_rx_queue(skb);
3190                 if (hash >= qoffset)
3191                         hash -= qoffset;
3192                 while (unlikely(hash >= qcount))
3193                         hash -= qcount;
3194                 return hash + qoffset;
3195         }
3196
3197         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3198 }
3199
3200 static void skb_warn_bad_offload(const struct sk_buff *skb)
3201 {
3202         static const netdev_features_t null_features;
3203         struct net_device *dev = skb->dev;
3204         const char *name = "";
3205
3206         if (!net_ratelimit())
3207                 return;
3208
3209         if (dev) {
3210                 if (dev->dev.parent)
3211                         name = dev_driver_string(dev->dev.parent);
3212                 else
3213                         name = netdev_name(dev);
3214         }
3215         skb_dump(KERN_WARNING, skb, false);
3216         WARN(1, "%s: caps=(%pNF, %pNF)\n",
3217              name, dev ? &dev->features : &null_features,
3218              skb->sk ? &skb->sk->sk_route_caps : &null_features);
3219 }
3220
3221 /*
3222  * Invalidate hardware checksum when packet is to be mangled, and
3223  * complete checksum manually on outgoing path.
3224  */
3225 int skb_checksum_help(struct sk_buff *skb)
3226 {
3227         __wsum csum;
3228         int ret = 0, offset;
3229
3230         if (skb->ip_summed == CHECKSUM_COMPLETE)
3231                 goto out_set_summed;
3232
3233         if (unlikely(skb_shinfo(skb)->gso_size)) {
3234                 skb_warn_bad_offload(skb);
3235                 return -EINVAL;
3236         }
3237
3238         /* Before computing a checksum, we should make sure no frag could
3239          * be modified by an external entity : checksum could be wrong.
3240          */
3241         if (skb_has_shared_frag(skb)) {
3242                 ret = __skb_linearize(skb);
3243                 if (ret)
3244                         goto out;
3245         }
3246
3247         offset = skb_checksum_start_offset(skb);
3248         ret = -EINVAL;
3249         if (WARN_ON_ONCE(offset >= skb_headlen(skb)))
3250                 goto out;
3251
3252         csum = skb_checksum(skb, offset, skb->len - offset, 0);
3253
3254         offset += skb->csum_offset;
3255         if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb)))
3256                 goto out;
3257
3258         ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3259         if (ret)
3260                 goto out;
3261
3262         *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3263 out_set_summed:
3264         skb->ip_summed = CHECKSUM_NONE;
3265 out:
3266         return ret;
3267 }
3268 EXPORT_SYMBOL(skb_checksum_help);
3269
3270 int skb_crc32c_csum_help(struct sk_buff *skb)
3271 {
3272         __le32 crc32c_csum;
3273         int ret = 0, offset, start;
3274
3275         if (skb->ip_summed != CHECKSUM_PARTIAL)
3276                 goto out;
3277
3278         if (unlikely(skb_is_gso(skb)))
3279                 goto out;
3280
3281         /* Before computing a checksum, we should make sure no frag could
3282          * be modified by an external entity : checksum could be wrong.
3283          */
3284         if (unlikely(skb_has_shared_frag(skb))) {
3285                 ret = __skb_linearize(skb);
3286                 if (ret)
3287                         goto out;
3288         }
3289         start = skb_checksum_start_offset(skb);
3290         offset = start + offsetof(struct sctphdr, checksum);
3291         if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3292                 ret = -EINVAL;
3293                 goto out;
3294         }
3295
3296         ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3297         if (ret)
3298                 goto out;
3299
3300         crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3301                                                   skb->len - start, ~(__u32)0,
3302                                                   crc32c_csum_stub));
3303         *(__le32 *)(skb->data + offset) = crc32c_csum;
3304         skb->ip_summed = CHECKSUM_NONE;
3305         skb->csum_not_inet = 0;
3306 out:
3307         return ret;
3308 }
3309
3310 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3311 {
3312         __be16 type = skb->protocol;
3313
3314         /* Tunnel gso handlers can set protocol to ethernet. */
3315         if (type == htons(ETH_P_TEB)) {
3316                 struct ethhdr *eth;
3317
3318                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3319                         return 0;
3320
3321                 eth = (struct ethhdr *)skb->data;
3322                 type = eth->h_proto;
3323         }
3324
3325         return vlan_get_protocol_and_depth(skb, type, depth);
3326 }
3327
3328 /**
3329  *      skb_mac_gso_segment - mac layer segmentation handler.
3330  *      @skb: buffer to segment
3331  *      @features: features for the output path (see dev->features)
3332  */
3333 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
3334                                     netdev_features_t features)
3335 {
3336         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
3337         struct packet_offload *ptype;
3338         int vlan_depth = skb->mac_len;
3339         __be16 type = skb_network_protocol(skb, &vlan_depth);
3340
3341         if (unlikely(!type))
3342                 return ERR_PTR(-EINVAL);
3343
3344         __skb_pull(skb, vlan_depth);
3345
3346         rcu_read_lock();
3347         list_for_each_entry_rcu(ptype, &offload_base, list) {
3348                 if (ptype->type == type && ptype->callbacks.gso_segment) {
3349                         segs = ptype->callbacks.gso_segment(skb, features);
3350                         break;
3351                 }
3352         }
3353         rcu_read_unlock();
3354
3355         __skb_push(skb, skb->data - skb_mac_header(skb));
3356
3357         return segs;
3358 }
3359 EXPORT_SYMBOL(skb_mac_gso_segment);
3360
3361
3362 /* openvswitch calls this on rx path, so we need a different check.
3363  */
3364 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3365 {
3366         if (tx_path)
3367                 return skb->ip_summed != CHECKSUM_PARTIAL &&
3368                        skb->ip_summed != CHECKSUM_UNNECESSARY;
3369
3370         return skb->ip_summed == CHECKSUM_NONE;
3371 }
3372
3373 /**
3374  *      __skb_gso_segment - Perform segmentation on skb.
3375  *      @skb: buffer to segment
3376  *      @features: features for the output path (see dev->features)
3377  *      @tx_path: whether it is called in TX path
3378  *
3379  *      This function segments the given skb and returns a list of segments.
3380  *
3381  *      It may return NULL if the skb requires no segmentation.  This is
3382  *      only possible when GSO is used for verifying header integrity.
3383  *
3384  *      Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
3385  */
3386 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3387                                   netdev_features_t features, bool tx_path)
3388 {
3389         struct sk_buff *segs;
3390
3391         if (unlikely(skb_needs_check(skb, tx_path))) {
3392                 int err;
3393
3394                 /* We're going to init ->check field in TCP or UDP header */
3395                 err = skb_cow_head(skb, 0);
3396                 if (err < 0)
3397                         return ERR_PTR(err);
3398         }
3399
3400         /* Only report GSO partial support if it will enable us to
3401          * support segmentation on this frame without needing additional
3402          * work.
3403          */
3404         if (features & NETIF_F_GSO_PARTIAL) {
3405                 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3406                 struct net_device *dev = skb->dev;
3407
3408                 partial_features |= dev->features & dev->gso_partial_features;
3409                 if (!skb_gso_ok(skb, features | partial_features))
3410                         features &= ~NETIF_F_GSO_PARTIAL;
3411         }
3412
3413         BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
3414                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3415
3416         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3417         SKB_GSO_CB(skb)->encap_level = 0;
3418
3419         skb_reset_mac_header(skb);
3420         skb_reset_mac_len(skb);
3421
3422         segs = skb_mac_gso_segment(skb, features);
3423
3424         if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3425                 skb_warn_bad_offload(skb);
3426
3427         return segs;
3428 }
3429 EXPORT_SYMBOL(__skb_gso_segment);
3430
3431 /* Take action when hardware reception checksum errors are detected. */
3432 #ifdef CONFIG_BUG
3433 void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3434 {
3435         if (net_ratelimit()) {
3436                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3437                 skb_dump(KERN_ERR, skb, true);
3438                 dump_stack();
3439         }
3440 }
3441 EXPORT_SYMBOL(netdev_rx_csum_fault);
3442 #endif
3443
3444 /* XXX: check that highmem exists at all on the given machine. */
/* Return 1 when @skb has a page fragment in high memory while @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise.  Without CONFIG_HIGHMEM the
 * answer is always 0.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct skb_shared_info *shinfo;
        int idx;

        if (dev->features & NETIF_F_HIGHDMA)
                return 0;

        shinfo = skb_shinfo(skb);
        for (idx = 0; idx < shinfo->nr_frags; idx++) {
                if (PageHighMem(skb_frag_page(&shinfo->frags[idx])))
                        return 1;
        }
#endif
        return 0;
}
3461
/* For an MPLS offload request, make sure we test against the device's
 * hardware MPLS features rather than its standard features.
 */
3465 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3466 static netdev_features_t net_mpls_features(struct sk_buff *skb,
3467                                            netdev_features_t features,
3468                                            __be16 type)
3469 {
3470         if (eth_p_mpls(type))
3471                 features &= skb->dev->mpls_features;
3472
3473         return features;
3474 }
3475 #else
3476 static netdev_features_t net_mpls_features(struct sk_buff *skb,
3477                                            netdev_features_t features,
3478                                            __be16 type)
3479 {
3480         return features;
3481 }
3482 #endif
3483
3484 static netdev_features_t harmonize_features(struct sk_buff *skb,
3485         netdev_features_t features)
3486 {
3487         __be16 type;
3488
3489         type = skb_network_protocol(skb, NULL);
3490         features = net_mpls_features(skb, features, type);
3491
3492         if (skb->ip_summed != CHECKSUM_NONE &&
3493             !can_checksum_protocol(features, type)) {
3494                 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3495         }
3496         if (illegal_highdma(skb->dev, skb))
3497                 features &= ~NETIF_F_SG;
3498
3499         return features;
3500 }
3501
3502 netdev_features_t passthru_features_check(struct sk_buff *skb,
3503                                           struct net_device *dev,
3504                                           netdev_features_t features)
3505 {
3506         return features;
3507 }
3508 EXPORT_SYMBOL(passthru_features_check);
3509
3510 static netdev_features_t dflt_features_check(struct sk_buff *skb,
3511                                              struct net_device *dev,
3512                                              netdev_features_t features)
3513 {
3514         return vlan_features_check(skb, features);
3515 }
3516
3517 static netdev_features_t gso_features_check(const struct sk_buff *skb,
3518                                             struct net_device *dev,
3519                                             netdev_features_t features)
3520 {
3521         u16 gso_segs = skb_shinfo(skb)->gso_segs;
3522
3523         if (gso_segs > dev->gso_max_segs)
3524                 return features & ~NETIF_F_GSO_MASK;
3525
3526         if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
3527                 return features & ~NETIF_F_GSO_MASK;
3528
3529         if (!skb_shinfo(skb)->gso_type) {
3530                 skb_warn_bad_offload(skb);
3531                 return features & ~NETIF_F_GSO_MASK;
3532         }
3533
3534         /* Support for GSO partial features requires software
3535          * intervention before we can actually process the packets
3536          * so we need to strip support for any partial features now
3537          * and we can pull them back in after we have partially
3538          * segmented the frame.
3539          */
3540         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3541                 features &= ~dev->gso_partial_features;
3542
3543         /* Make sure to clear the IPv4 ID mangling feature if the
3544          * IPv4 header has the potential to be fragmented.
3545          */
3546         if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3547                 struct iphdr *iph = skb->encapsulation ?
3548                                     inner_ip_hdr(skb) : ip_hdr(skb);
3549
3550                 if (!(iph->frag_off & htons(IP_DF)))
3551                         features &= ~NETIF_F_TSO_MANGLEID;
3552         }
3553
3554         return features;
3555 }
3556
3557 netdev_features_t netif_skb_features(struct sk_buff *skb)
3558 {
3559         struct net_device *dev = skb->dev;
3560         netdev_features_t features = dev->features;
3561
3562         if (skb_is_gso(skb))
3563                 features = gso_features_check(skb, dev, features);
3564
3565         /* If encapsulation offload request, verify we are testing
3566          * hardware encapsulation features instead of standard
3567          * features for the netdev
3568          */
3569         if (skb->encapsulation)
3570                 features &= dev->hw_enc_features;
3571
3572         if (skb_vlan_tagged(skb))
3573                 features = netdev_intersect_features(features,
3574                                                      dev->vlan_features |
3575                                                      NETIF_F_HW_VLAN_CTAG_TX |
3576                                                      NETIF_F_HW_VLAN_STAG_TX);
3577
3578         if (dev->netdev_ops->ndo_features_check)
3579                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3580                                                                 features);
3581         else
3582                 features &= dflt_features_check(skb, dev, features);
3583
3584         return harmonize_features(skb, features);
3585 }
3586 EXPORT_SYMBOL(netif_skb_features);
3587
3588 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3589                     struct netdev_queue *txq, bool more)
3590 {
3591         unsigned int len;
3592         int rc;
3593
3594         if (dev_nit_active(dev))
3595                 dev_queue_xmit_nit(skb, dev);
3596
3597         len = skb->len;
3598         PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
3599         trace_net_dev_start_xmit(skb, dev);
3600         rc = netdev_start_xmit(skb, dev, txq, more);
3601         trace_net_dev_xmit(skb, rc, dev, len);
3602
3603         return rc;
3604 }
3605
3606 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3607                                     struct netdev_queue *txq, int *ret)
3608 {
3609         struct sk_buff *skb = first;
3610         int rc = NETDEV_TX_OK;
3611
3612         while (skb) {
3613                 struct sk_buff *next = skb->next;
3614
3615                 skb_mark_not_on_list(skb);
3616                 rc = xmit_one(skb, dev, txq, next != NULL);
3617                 if (unlikely(!dev_xmit_complete(rc))) {
3618                         skb->next = next;
3619                         goto out;
3620                 }
3621
3622                 skb = next;
3623                 if (netif_tx_queue_stopped(txq) && skb) {
3624                         rc = NETDEV_TX_BUSY;
3625                         break;
3626                 }
3627         }
3628
3629 out:
3630         *ret = rc;
3631         return skb;
3632 }
3633
3634 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3635                                           netdev_features_t features)
3636 {
3637         if (skb_vlan_tag_present(skb) &&
3638             !vlan_hw_offload_capable(features, skb->vlan_proto))
3639                 skb = __vlan_hwaccel_push_inside(skb);
3640         return skb;
3641 }
3642
3643 int skb_csum_hwoffload_help(struct sk_buff *skb,
3644                             const netdev_features_t features)
3645 {
3646         if (unlikely(skb_csum_is_sctp(skb)))
3647                 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3648                         skb_crc32c_csum_help(skb);
3649
3650         return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3651 }
3652 EXPORT_SYMBOL(skb_csum_hwoffload_help);
3653
3654 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3655 {
3656         netdev_features_t features;
3657
3658         features = netif_skb_features(skb);
3659         skb = validate_xmit_vlan(skb, features);
3660         if (unlikely(!skb))
3661                 goto out_null;
3662
3663         skb = sk_validate_xmit_skb(skb, dev);
3664         if (unlikely(!skb))
3665                 goto out_null;
3666
3667         if (netif_needs_gso(skb, features)) {
3668                 struct sk_buff *segs;
3669
3670                 segs = skb_gso_segment(skb, features);
3671                 if (IS_ERR(segs)) {
3672                         goto out_kfree_skb;
3673                 } else if (segs) {
3674                         consume_skb(skb);
3675                         skb = segs;
3676                 }
3677         } else {
3678                 if (skb_needs_linearize(skb, features) &&
3679                     __skb_linearize(skb))
3680                         goto out_kfree_skb;
3681
3682                 /* If packet is not checksummed and device does not
3683                  * support checksumming for this protocol, complete
3684                  * checksumming here.
3685                  */
3686                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3687                         if (skb->encapsulation)
3688                                 skb_set_inner_transport_header(skb,
3689                                                                skb_checksum_start_offset(skb));
3690                         else
3691                                 skb_set_transport_header(skb,
3692                                                          skb_checksum_start_offset(skb));
3693                         if (skb_csum_hwoffload_help(skb, features))
3694                                 goto out_kfree_skb;
3695                 }
3696         }
3697
3698         skb = validate_xmit_xfrm(skb, features, again);
3699
3700         return skb;
3701
3702 out_kfree_skb:
3703         kfree_skb(skb);
3704 out_null:
3705         atomic_long_inc(&dev->tx_dropped);
3706         return NULL;
3707 }
3708
3709 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3710 {
3711         struct sk_buff *next, *head = NULL, *tail;
3712
3713         for (; skb != NULL; skb = next) {
3714                 next = skb->next;
3715                 skb_mark_not_on_list(skb);
3716
3717                 /* in case skb wont be segmented, point to itself */
3718                 skb->prev = skb;
3719
3720                 skb = validate_xmit_skb(skb, dev, again);
3721                 if (!skb)
3722                         continue;
3723
3724                 if (!head)
3725                         head = skb;
3726                 else
3727                         tail->next = skb;
3728                 /* If skb was segmented, skb->prev points to
3729                  * the last segment. If not, it still contains skb.
3730                  */
3731                 tail = skb->prev;
3732         }
3733         return head;
3734 }
3735 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3736
3737 static void qdisc_pkt_len_init(struct sk_buff *skb)
3738 {
3739         const struct skb_shared_info *shinfo = skb_shinfo(skb);
3740
3741         qdisc_skb_cb(skb)->pkt_len = skb->len;
3742
3743         /* To get more precise estimation of bytes sent on wire,
3744          * we add to pkt_len the headers size of all segments
3745          */
3746         if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3747                 unsigned int hdr_len;
3748                 u16 gso_segs = shinfo->gso_segs;
3749
3750                 /* mac layer + network layer */
3751                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3752
3753                 /* + transport layer */
3754                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3755                         const struct tcphdr *th;
3756                         struct tcphdr _tcphdr;
3757
3758                         th = skb_header_pointer(skb, skb_transport_offset(skb),
3759                                                 sizeof(_tcphdr), &_tcphdr);
3760                         if (likely(th))
3761                                 hdr_len += __tcp_hdrlen(th);
3762                 } else {
3763                         struct udphdr _udphdr;
3764
3765                         if (skb_header_pointer(skb, skb_transport_offset(skb),
3766                                                sizeof(_udphdr), &_udphdr))
3767                                 hdr_len += sizeof(struct udphdr);
3768                 }
3769
3770                 if (shinfo->gso_type & SKB_GSO_DODGY)
3771                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3772                                                 shinfo->gso_size);
3773
3774                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3775         }
3776 }
3777
/* Hand @skb to qdisc @q for transmission on queue @txq of @dev.
 *
 * Three paths exist:
 *  - TCQ_F_NOLOCK qdiscs: enqueue without taking the root lock and run
 *    the qdisc unless the tx queue is frozen or stopped;
 *  - TCQ_F_CAN_BYPASS qdiscs that are empty and not running: transmit the
 *    skb directly through sch_direct_xmit();
 *  - everything else: enqueue under the root lock and run the qdisc if
 *    nobody else is running it.
 *
 * skbs collected on the local to_free list are released with
 * kfree_skb_list() before returning (on the locked paths, after the root
 * lock has been dropped).
 *
 * Returns NET_XMIT_DROP when the qdisc is deactivated, NET_XMIT_SUCCESS
 * on the direct-transmit path, otherwise the q->enqueue() result masked
 * with NET_XMIT_MASK.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct net_device *dev,
                                 struct netdev_queue *txq)
{
        spinlock_t *root_lock = qdisc_lock(q);
        struct sk_buff *to_free = NULL;
        bool contended;
        int rc;

        qdisc_calculate_pkt_len(skb, q);

        /* Lockless qdisc: no root lock, no busylock. */
        if (q->flags & TCQ_F_NOLOCK) {
                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
                if (likely(!netif_xmit_frozen_or_stopped(txq)))
                        qdisc_run(q);

                if (unlikely(to_free))
                        kfree_skb_list(to_free);
                return rc;
        }

        /*
         * Heuristic to force contended enqueues to serialize on a
         * separate lock before trying to get qdisc main lock.
         * This permits qdisc->running owner to get the lock more
         * often and dequeue packets faster.
         */
        contended = qdisc_is_running(q);
        if (unlikely(contended))
                spin_lock(&q->busylock);

        spin_lock(root_lock);
        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
                __qdisc_drop(skb, &to_free);
                rc = NET_XMIT_DROP;
        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
                   qdisc_run_begin(q)) {
                /*
                 * This is a work-conserving queue; there are no old skbs
                 * waiting to be sent out; and the qdisc is not running -
                 * xmit the skb directly.
                 */

                qdisc_bstats_update(q, skb);

                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
                        /* Release busylock before running the qdisc;
                         * clearing 'contended' keeps the unlock at the end
                         * of the function from running a second time.
                         */
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                }

                qdisc_run_end(q);
                rc = NET_XMIT_SUCCESS;
        } else {
                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
                /* Run the qdisc ourselves only if nobody else is. */
                if (qdisc_run_begin(q)) {
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                        qdisc_run_end(q);
                }
        }
        spin_unlock(root_lock);
        /* Free dropped skbs only after the root lock has been released. */
        if (unlikely(to_free))
                kfree_skb_list(to_free);
        /* busylock is still held if no branch above released it. */
        if (unlikely(contended))
                spin_unlock(&q->busylock);
        return rc;
}
3851
3852 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3853 static void skb_update_prio(struct sk_buff *skb)
3854 {
3855         const struct netprio_map *map;
3856         const struct sock *sk;
3857         unsigned int prioidx;
3858
3859         if (skb->priority)
3860                 return;
3861         map = rcu_dereference_bh(skb->dev->priomap);
3862         if (!map)
3863                 return;
3864         sk = skb_to_full_sk(skb);
3865         if (!sk)
3866                 return;
3867
3868         prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3869
3870         if (prioidx < map->priomap_len)
3871                 skb->priority = map->priomap[prioidx];
3872 }
3873 #else
3874 #define skb_update_prio(skb)
3875 #endif
3876
3877 /**
3878  *      dev_loopback_xmit - loop back @skb
3879  *      @net: network namespace this loopback is happening in
3880  *      @sk:  sk needed to be a netfilter okfn
3881  *      @skb: buffer to transmit
3882  */
3883 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3884 {
3885         skb_reset_mac_header(skb);
3886         __skb_pull(skb, skb_network_offset(skb));
3887         skb->pkt_type = PACKET_LOOPBACK;
3888         if (skb->ip_summed == CHECKSUM_NONE)
3889                 skb->ip_summed = CHECKSUM_UNNECESSARY;
3890         WARN_ON(!skb_dst(skb));
3891         skb_dst_force(skb);
3892         netif_rx_ni(skb);
3893         return 0;
3894 }
3895 EXPORT_SYMBOL(dev_loopback_xmit);
3896
3897 #ifdef CONFIG_NET_EGRESS
3898 static struct sk_buff *
3899 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3900 {
3901         struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3902         struct tcf_result cl_res;
3903
3904         if (!miniq)
3905                 return skb;
3906
3907         /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3908         qdisc_skb_cb(skb)->mru = 0;
3909         mini_qdisc_bstats_cpu_update(miniq, skb);
3910
3911         switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3912         case TC_ACT_OK:
3913         case TC_ACT_RECLASSIFY:
3914                 skb->tc_index = TC_H_MIN(cl_res.classid);
3915                 break;
3916         case TC_ACT_SHOT:
3917                 mini_qdisc_qstats_cpu_drop(miniq);
3918                 *ret = NET_XMIT_DROP;
3919                 kfree_skb(skb);
3920                 return NULL;
3921         case TC_ACT_STOLEN:
3922         case TC_ACT_QUEUED:
3923         case TC_ACT_TRAP:
3924                 *ret = NET_XMIT_SUCCESS;
3925                 consume_skb(skb);
3926                 return NULL;
3927         case TC_ACT_REDIRECT:
3928                 /* No need to push/pop skb's mac_header here on egress! */
3929                 skb_do_redirect(skb);
3930                 *ret = NET_XMIT_SUCCESS;
3931                 return NULL;
3932         default:
3933                 break;
3934         }
3935
3936         return skb;
3937 }
3938 #endif /* CONFIG_NET_EGRESS */
3939
3940 #ifdef CONFIG_XPS
3941 static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3942                                struct xps_dev_maps *dev_maps, unsigned int tci)
3943 {
3944         struct xps_map *map;
3945         int queue_index = -1;
3946
3947         if (dev->num_tc) {
3948                 tci *= dev->num_tc;
3949                 tci += netdev_get_prio_tc_map(dev, skb->priority);
3950         }
3951
3952         map = rcu_dereference(dev_maps->attr_map[tci]);
3953         if (map) {
3954                 if (map->len == 1)
3955                         queue_index = map->queues[0];
3956                 else
3957                         queue_index = map->queues[reciprocal_scale(
3958                                                 skb_get_hash(skb), map->len)];
3959                 if (unlikely(queue_index >= dev->real_num_tx_queues))
3960                         queue_index = -1;
3961         }
3962         return queue_index;
3963 }
3964 #endif
3965
/* Pick a tx queue for @skb from the XPS maps of @sb_dev.
 *
 * The receive-queue based map is consulted first (when enabled), keyed by
 * the rx queue recorded in the skb's socket; if that yields nothing, the
 * CPU based map is consulted, keyed by skb->sender_cpu - 1.
 *
 * Returns a queue index, or -1 when XPS is not in use, not compiled in,
 * or no mapping applies.
 */
static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
                         struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *dev_maps;
        struct sock *sk = skb->sk;
        int queue_index = -1;

        if (!static_key_false(&xps_needed))
                return -1;

        rcu_read_lock();

        if (static_key_false(&xps_rxqs_needed)) {
                dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
                if (dev_maps) {
                        int rxq = sk_rx_queue_get(sk);

                        if (rxq >= 0 && rxq < dev->num_rx_queues)
                                queue_index = __get_xps_queue_idx(dev, skb,
                                                                  dev_maps,
                                                                  rxq);
                }
        }

        /* Fall back to the per-CPU map when no rx-queue mapping hit. */
        if (queue_index < 0) {
                dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
                if (dev_maps) {
                        unsigned int cpu_idx = skb->sender_cpu - 1;

                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
                                                          cpu_idx);
                }
        }

        rcu_read_unlock();

        return queue_index;
#else
        return -1;
#endif
}
4007
4008 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
4009                      struct net_device *sb_dev)
4010 {
4011         return 0;
4012 }
4013 EXPORT_SYMBOL(dev_pick_tx_zero);
4014
4015 u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
4016                        struct net_device *sb_dev)
4017 {
4018         return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
4019 }
4020 EXPORT_SYMBOL(dev_pick_tx_cpu_id);
4021
4022 u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4023                      struct net_device *sb_dev)
4024 {
4025         struct sock *sk = skb->sk;
4026         int queue_index = sk_tx_queue_get(sk);
4027
4028         sb_dev = sb_dev ? : dev;
4029
4030         if (queue_index < 0 || skb->ooo_okay ||
4031             queue_index >= dev->real_num_tx_queues) {
4032                 int new_index = get_xps_queue(dev, sb_dev, skb);
4033
4034                 if (new_index < 0)
4035                         new_index = skb_tx_hash(dev, sb_dev, skb);
4036
4037                 if (queue_index != new_index && sk &&
4038                     sk_fullsock(sk) &&
4039                     rcu_access_pointer(sk->sk_dst_cache))
4040                         sk_tx_queue_set(sk, new_index);
4041
4042                 queue_index = new_index;
4043         }
4044
4045         return queue_index;
4046 }
4047 EXPORT_SYMBOL(netdev_pick_tx);
4048
4049 struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4050                                          struct sk_buff *skb,
4051                                          struct net_device *sb_dev)
4052 {
4053         int queue_index = 0;
4054
4055 #ifdef CONFIG_XPS
4056         u32 sender_cpu = skb->sender_cpu - 1;
4057
4058         if (sender_cpu >= (u32)NR_CPUS)
4059                 skb->sender_cpu = raw_smp_processor_id() + 1;
4060 #endif
4061
4062         if (dev->real_num_tx_queues != 1) {
4063                 const struct net_device_ops *ops = dev->netdev_ops;
4064
4065                 if (ops->ndo_select_queue)
4066                         queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4067                 else
4068                         queue_index = netdev_pick_tx(dev, skb, sb_dev);
4069
4070                 queue_index = netdev_cap_txqueue(dev, queue_index);
4071         }
4072
4073         skb_set_queue_mapping(skb, queue_index);
4074         return netdev_get_tx_queue(dev, queue_index);
4075 }
4076
/**
 *      __dev_queue_xmit - transmit a buffer
 *      @skb: buffer to transmit
 *      @sb_dev: subordinate device used for L2 forwarding offload
 *
 *      Queue a buffer for transmission to a network device. The caller must
 *      have set the device and priority and built the buffer before calling
 *      this function. The function can be called from an interrupt.
 *
 *      A negative errno code is returned on a failure. A success does not
 *      guarantee the frame will be transmitted as it may be dropped due
 *      to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;
        /* Returned as-is if validate_xmit_skb() fails below. */
        int rc = -ENOMEM;
        bool again = false;

        skb_reset_mac_header(skb);
        skb_assert_len(skb);

        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        skb_update_prio(skb);

        qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
        skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
        if (static_branch_unlikely(&egress_needed_key)) {
                /* A NULL result means the egress hook took the skb and
                 * already stored the return code in rc.
                 */
                skb = sch_handle_egress(skb, &rc, dev);
                if (!skb)
                        goto out;
        }
# endif
#endif
        /* If device/qdisc don't need skb->dst, release it right now while
         * its hot in this cpu cache.
         */
        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                skb_dst_drop(skb);
        else
                skb_dst_force(skb);

        txq = netdev_core_pick_tx(dev, skb, sb_dev);
        q = rcu_dereference_bh(txq->qdisc);

        trace_net_dev_queue(skb);
        /* A qdisc with an enqueue handler takes over from here. */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. Common case for software devices:
         * loopback, all the sorts of tunnels...

         * Really, it is unlikely that netif_tx_lock protection is necessary
         * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
         * counters.)
         * However, it is possible, that they rely on protection
         * made by us here.

         * Check this and shot the lock. It is not prone from deadlocks.
         *Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* Other cpus might concurrently change txq->xmit_lock_owner
                 * to -1 or to their cpu id, but not to our id.
                 */
                if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
                        if (dev_xmit_recursion())
                                goto recursion_alert;

                        /* validate_xmit_skb() counts the drop itself when
                         * it returns NULL; rc is unchanged in that case.
                         */
                        skb = validate_xmit_skb(skb, dev, &again);
                        if (!skb)
                                goto out;

                        PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_xmit_stopped(txq)) {
                                dev_xmit_recursion_inc();
                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
                                dev_xmit_recursion_dec();
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        /* Queue stopped or transmit not completed. */
                        HARD_TX_UNLOCK(dev, txq);
                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
                                             dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately
                         */
recursion_alert:
                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
                                             dev->name);
                }
        }

        /* Reached when the device is not up, the tx queue was stopped,
         * the transmit did not complete, or recursion was detected:
         * count the drop, free whatever is left of the skb list and
         * report -ENETDOWN.
         */
        rc = -ENETDOWN;
        rcu_read_unlock_bh();

        atomic_long_inc(&dev->tx_dropped);
        kfree_skb_list(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
4213
4214 int dev_queue_xmit(struct sk_buff *skb)
4215 {
4216         return __dev_queue_xmit(skb, NULL);
4217 }
4218 EXPORT_SYMBOL(dev_queue_xmit);
4219
/* Queue @skb for transmission on skb->dev on behalf of the subordinate
 * device @sb_dev.  The return value of __dev_queue_xmit() is passed
 * through.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
{
        return __dev_queue_xmit(skb, sb_dev);
}
4224 EXPORT_SYMBOL(dev_queue_xmit_accel);
4225
4226 int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4227 {
4228         struct net_device *dev = skb->dev;
4229         struct sk_buff *orig_skb = skb;
4230         struct netdev_queue *txq;
4231         int ret = NETDEV_TX_BUSY;
4232         bool again = false;
4233
4234         if (unlikely(!netif_running(dev) ||
4235                      !netif_carrier_ok(dev)))
4236                 goto drop;
4237
4238         skb = validate_xmit_skb_list(skb, dev, &again);
4239         if (skb != orig_skb)
4240                 goto drop;
4241
4242         skb_set_queue_mapping(skb, queue_id);
4243         txq = skb_get_tx_queue(dev, skb);
4244         PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
4245
4246         local_bh_disable();
4247
4248         dev_xmit_recursion_inc();
4249         HARD_TX_LOCK(dev, txq, smp_processor_id());
4250         if (!netif_xmit_frozen_or_drv_stopped(txq))
4251                 ret = netdev_start_xmit(skb, dev, txq, false);
4252         HARD_TX_UNLOCK(dev, txq);
4253         dev_xmit_recursion_dec();
4254
4255         local_bh_enable();
4256         return ret;
4257 drop:
4258         atomic_long_inc(&dev->tx_dropped);
4259         kfree_skb_list(skb);
4260         return NET_XMIT_DROP;
4261 }
4262 EXPORT_SYMBOL(__dev_direct_xmit);
4263
4264 /*************************************************************************
4265  *                      Receiver routines
4266  *************************************************************************/
4267
/* Built-in defaults for packet-processing parameters.
 * NOTE(review): presumably adjustable at run time elsewhere; only the
 * initial values are visible here.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
/* Must be at least 2 jiffies to guarantee 1 jiffy timeout.
 * USEC_PER_SEC / HZ is one jiffy in microseconds, so the default is
 * exactly two jiffies.
 */
unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
int weight_p __read_mostly = 64;           /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
/* NOTE(review): both defaults equal weight_p times the matching bias
 * above; presumably they are kept in sync elsewhere - confirm.
 */
int dev_rx_weight __read_mostly = 64;
int dev_tx_weight __read_mostly = 64;
/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
int gro_normal_batch __read_mostly = 8;
4282
4283 /* Called with irq disabled */
4284 static inline void ____napi_schedule(struct softnet_data *sd,
4285                                      struct napi_struct *napi)
4286 {
4287         list_add_tail(&napi->poll_list, &sd->poll_list);
4288         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4289 }
4290
4291 #ifdef CONFIG_RPS
4292
4293 /* One global table that all flow-based protocols share. */
4294 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
4295 EXPORT_SYMBOL(rps_sock_flow_table);
4296 u32 rps_cpu_mask __read_mostly;
4297 EXPORT_SYMBOL(rps_cpu_mask);
4298
4299 struct static_key_false rps_needed __read_mostly;
4300 EXPORT_SYMBOL(rps_needed);
4301 struct static_key_false rfs_needed __read_mostly;
4302 EXPORT_SYMBOL(rfs_needed);
4303
4304 static struct rps_dev_flow *
4305 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4306             struct rps_dev_flow *rflow, u16 next_cpu)
4307 {
4308         if (next_cpu < nr_cpu_ids) {
4309 #ifdef CONFIG_RFS_ACCEL
4310                 struct netdev_rx_queue *rxqueue;
4311                 struct rps_dev_flow_table *flow_table;
4312                 struct rps_dev_flow *old_rflow;
4313                 u32 flow_id;
4314                 u16 rxq_index;
4315                 int rc;
4316
4317                 /* Should we steer this flow to a different hardware queue? */
4318                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4319                     !(dev->features & NETIF_F_NTUPLE))
4320                         goto out;
4321                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4322                 if (rxq_index == skb_get_rx_queue(skb))
4323                         goto out;
4324
4325                 rxqueue = dev->_rx + rxq_index;
4326                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4327                 if (!flow_table)
4328                         goto out;
4329                 flow_id = skb_get_hash(skb) & flow_table->mask;
4330                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4331                                                         rxq_index, flow_id);
4332                 if (rc < 0)
4333                         goto out;
4334                 old_rflow = rflow;
4335                 rflow = &flow_table->flows[flow_id];
4336                 rflow->filter = rc;
4337                 if (old_rflow->filter == rflow->filter)
4338                         old_rflow->filter = RPS_NO_FILTER;
4339         out:
4340 #endif
4341                 rflow->last_qtail =
4342                         per_cpu(softnet_data, next_cpu).input_queue_head;
4343         }
4344
4345         rflow->cpu = next_cpu;
4346         return rflow;
4347 }
4348
4349 /*
4350  * get_rps_cpu is called from netif_receive_skb and returns the target
4351  * CPU from the RPS map of the receiving queue for a given skb.
4352  * rcu_read_lock must be held on entry.
4353  */
4354 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4355                        struct rps_dev_flow **rflowp)
4356 {
4357         const struct rps_sock_flow_table *sock_flow_table;
4358         struct netdev_rx_queue *rxqueue = dev->_rx;
4359         struct rps_dev_flow_table *flow_table;
4360         struct rps_map *map;
4361         int cpu = -1;
4362         u32 tcpu;
4363         u32 hash;
4364
4365         if (skb_rx_queue_recorded(skb)) {
4366                 u16 index = skb_get_rx_queue(skb);
4367
4368                 if (unlikely(index >= dev->real_num_rx_queues)) {
4369                         WARN_ONCE(dev->real_num_rx_queues > 1,
4370                                   "%s received packet on queue %u, but number "
4371                                   "of RX queues is %u\n",
4372                                   dev->name, index, dev->real_num_rx_queues);
4373                         goto done;
4374                 }
4375                 rxqueue += index;
4376         }
4377
4378         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4379
4380         flow_table = rcu_dereference(rxqueue->rps_flow_table);
4381         map = rcu_dereference(rxqueue->rps_map);
4382         if (!flow_table && !map)
4383                 goto done;
4384
4385         skb_reset_network_header(skb);
4386         hash = skb_get_hash(skb);
4387         if (!hash)
4388                 goto done;
4389
4390         sock_flow_table = rcu_dereference(rps_sock_flow_table);
4391         if (flow_table && sock_flow_table) {
4392                 struct rps_dev_flow *rflow;
4393                 u32 next_cpu;
4394                 u32 ident;
4395
4396                 /* First check into global flow table if there is a match.
4397                  * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
4398                  */
4399                 ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
4400                 if ((ident ^ hash) & ~rps_cpu_mask)
4401                         goto try_rps;
4402
4403                 next_cpu = ident & rps_cpu_mask;
4404
4405                 /* OK, now we know there is a match,
4406                  * we can look at the local (per receive queue) flow table
4407                  */
4408                 rflow = &flow_table->flows[hash & flow_table->mask];
4409                 tcpu = rflow->cpu;
4410
4411                 /*
4412                  * If the desired CPU (where last recvmsg was done) is
4413                  * different from current CPU (one in the rx-queue flow
4414                  * table entry), switch if one of the following holds:
4415                  *   - Current CPU is unset (>= nr_cpu_ids).
4416                  *   - Current CPU is offline.
4417                  *   - The current CPU's queue tail has advanced beyond the
4418                  *     last packet that was enqueued using this table entry.
4419                  *     This guarantees that all previous packets for the flow
4420                  *     have been dequeued, thus preserving in order delivery.
4421                  */
4422                 if (unlikely(tcpu != next_cpu) &&
4423                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4424                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4425                       rflow->last_qtail)) >= 0)) {
4426                         tcpu = next_cpu;
4427                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4428                 }
4429
4430                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4431                         *rflowp = rflow;
4432                         cpu = tcpu;
4433                         goto done;
4434                 }
4435         }
4436
4437 try_rps:
4438
4439         if (map) {
4440                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4441                 if (cpu_online(tcpu)) {
4442                         cpu = tcpu;
4443                         goto done;
4444                 }
4445         }
4446
4447 done:
4448         return cpu;
4449 }
4450
4451 #ifdef CONFIG_RFS_ACCEL
4452
4453 /**
4454  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4455  * @dev: Device on which the filter was set
4456  * @rxq_index: RX queue index
4457  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4458  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4459  *
4460  * Drivers that implement ndo_rx_flow_steer() should periodically call
4461  * this function for each installed filter and remove the filters for
4462  * which it returns %true.
4463  */
4464 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4465                          u32 flow_id, u16 filter_id)
4466 {
4467         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4468         struct rps_dev_flow_table *flow_table;
4469         struct rps_dev_flow *rflow;
4470         bool expire = true;
4471         unsigned int cpu;
4472
4473         rcu_read_lock();
4474         flow_table = rcu_dereference(rxqueue->rps_flow_table);
4475         if (flow_table && flow_id <= flow_table->mask) {
4476                 rflow = &flow_table->flows[flow_id];
4477                 cpu = READ_ONCE(rflow->cpu);
4478                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4479                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4480                            rflow->last_qtail) <
4481                      (int)(10 * flow_table->mask)))
4482                         expire = false;
4483         }
4484         rcu_read_unlock();
4485         return expire;
4486 }
4487 EXPORT_SYMBOL(rps_may_expire_flow);
4488
4489 #endif /* CONFIG_RFS_ACCEL */
4490
4491 /* Called from hardirq (IPI) context */
4492 static void rps_trigger_softirq(void *data)
4493 {
4494         struct softnet_data *sd = data;
4495
4496         ____napi_schedule(sd, &sd->backlog);
4497         sd->received_rps++;
4498 }
4499
4500 #endif /* CONFIG_RPS */
4501
/*
 * Decide whether @sd belongs to another CPU.
 * If so, push it onto this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ and
 * return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = this_cpu_ptr(&softnet_data);

	if (sd == local)
		return 0;

	/* Link the remote softnet_data at the head of our IPI list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
4522
4523 #ifdef CONFIG_NET_FLOW_LIMIT
4524 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4525 #endif
4526
4527 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4528 {
4529 #ifdef CONFIG_NET_FLOW_LIMIT
4530         struct sd_flow_limit *fl;
4531         struct softnet_data *sd;
4532         unsigned int old_flow, new_flow;
4533
4534         if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
4535                 return false;
4536
4537         sd = this_cpu_ptr(&softnet_data);
4538
4539         rcu_read_lock();
4540         fl = rcu_dereference(sd->flow_limit);
4541         if (fl) {
4542                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4543                 old_flow = fl->history[fl->history_head];
4544                 fl->history[fl->history_head] = new_flow;
4545
4546                 fl->history_head++;
4547                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4548
4549                 if (likely(fl->buckets[old_flow]))
4550                         fl->buckets[old_flow]--;
4551
4552                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4553                         fl->count++;
4554                         rcu_read_unlock();
4555                         return true;
4556                 }
4557         }
4558         rcu_read_unlock();
4559 #endif
4560         return false;
4561 }
4562
4563 /*
4564  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4565  * queue (may be a remote CPU queue).
4566  */
4567 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4568                               unsigned int *qtail)
4569 {
4570         struct softnet_data *sd;
4571         unsigned long flags;
4572         unsigned int qlen;
4573
4574         sd = &per_cpu(softnet_data, cpu);
4575
4576         local_irq_save(flags);
4577
4578         rps_lock(sd);
4579         if (!netif_running(skb->dev))
4580                 goto drop;
4581         qlen = skb_queue_len(&sd->input_pkt_queue);
4582         if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
4583                 if (qlen) {
4584 enqueue:
4585                         __skb_queue_tail(&sd->input_pkt_queue, skb);
4586                         input_queue_tail_incr_save(sd, qtail);
4587                         rps_unlock(sd);
4588                         local_irq_restore(flags);
4589                         return NET_RX_SUCCESS;
4590                 }
4591
4592                 /* Schedule NAPI for backlog device
4593                  * We can use non atomic operation since we own the queue lock
4594                  */
4595                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
4596                         if (!rps_ipi_queued(sd))
4597                                 ____napi_schedule(sd, &sd->backlog);
4598                 }
4599                 goto enqueue;
4600         }
4601
4602 drop:
4603         sd->dropped++;
4604         rps_unlock(sd);
4605
4606         local_irq_restore(flags);
4607
4608         atomic_long_inc(&skb->dev->rx_dropped);
4609         kfree_skb(skb);
4610         return NET_RX_DROP;
4611 }
4612
4613 static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4614 {
4615         struct net_device *dev = skb->dev;
4616         struct netdev_rx_queue *rxqueue;
4617
4618         rxqueue = dev->_rx;
4619
4620         if (skb_rx_queue_recorded(skb)) {
4621                 u16 index = skb_get_rx_queue(skb);
4622
4623                 if (unlikely(index >= dev->real_num_rx_queues)) {
4624                         WARN_ONCE(dev->real_num_rx_queues > 1,
4625                                   "%s received packet on queue %u, but number "
4626                                   "of RX queues is %u\n",
4627                                   dev->name, index, dev->real_num_rx_queues);
4628
4629                         return rxqueue; /* Return first rxqueue */
4630                 }
4631                 rxqueue += index;
4632         }
4633         return rxqueue;
4634 }
4635
4636 static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4637                                      struct xdp_buff *xdp,
4638                                      struct bpf_prog *xdp_prog)
4639 {
4640         struct netdev_rx_queue *rxqueue;
4641         void *orig_data, *orig_data_end;
4642         u32 metalen, act = XDP_DROP;
4643         __be16 orig_eth_type;
4644         struct ethhdr *eth;
4645         bool orig_bcast;
4646         int hlen, off;
4647         u32 mac_len;
4648
4649         /* Reinjected packets coming from act_mirred or similar should
4650          * not get XDP generic processing.
4651          */
4652         if (skb_is_redirected(skb))
4653                 return XDP_PASS;
4654
4655         /* XDP packets must be linear and must have sufficient headroom
4656          * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
4657          * native XDP provides, thus we need to do it here as well.
4658          */
4659         if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4660             skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4661                 int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4662                 int troom = skb->tail + skb->data_len - skb->end;
4663
4664                 /* In case we have to go down the path and also linearize,
4665                  * then lets do the pskb_expand_head() work just once here.
4666                  */
4667                 if (pskb_expand_head(skb,
4668                                      hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4669                                      troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4670                         goto do_drop;
4671                 if (skb_linearize(skb))
4672                         goto do_drop;
4673         }
4674
4675         /* The XDP program wants to see the packet starting at the MAC
4676          * header.
4677          */
4678         mac_len = skb->data - skb_mac_header(skb);
4679         hlen = skb_headlen(skb) + mac_len;
4680         xdp->data = skb->data - mac_len;
4681         xdp->data_meta = xdp->data;
4682         xdp->data_end = xdp->data + hlen;
4683         xdp->data_hard_start = skb->data - skb_headroom(skb);
4684
4685         /* SKB "head" area always have tailroom for skb_shared_info */
4686         xdp->frame_sz  = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
4687         xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4688
4689         orig_data_end = xdp->data_end;
4690         orig_data = xdp->data;
4691         eth = (struct ethhdr *)xdp->data;
4692         orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4693         orig_eth_type = eth->h_proto;
4694
4695         rxqueue = netif_get_rxqueue(skb);
4696         xdp->rxq = &rxqueue->xdp_rxq;
4697
4698         act = bpf_prog_run_xdp(xdp_prog, xdp);
4699
4700         /* check if bpf_xdp_adjust_head was used */
4701         off = xdp->data - orig_data;
4702         if (off) {
4703                 if (off > 0)
4704                         __skb_pull(skb, off);
4705                 else if (off < 0)
4706                         __skb_push(skb, -off);
4707
4708                 skb->mac_header += off;
4709                 skb_reset_network_header(skb);
4710         }
4711
4712         /* check if bpf_xdp_adjust_tail was used */
4713         off = xdp->data_end - orig_data_end;
4714         if (off != 0) {
4715                 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4716                 skb->len += off; /* positive on grow, negative on shrink */
4717         }
4718
4719         /* check if XDP changed eth hdr such SKB needs update */
4720         eth = (struct ethhdr *)xdp->data;
4721         if ((orig_eth_type != eth->h_proto) ||
4722             (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4723                 __skb_push(skb, ETH_HLEN);
4724                 skb->protocol = eth_type_trans(skb, skb->dev);
4725         }
4726
4727         switch (act) {
4728         case XDP_REDIRECT:
4729         case XDP_TX:
4730                 __skb_push(skb, mac_len);
4731                 break;
4732         case XDP_PASS:
4733                 metalen = xdp->data - xdp->data_meta;
4734                 if (metalen)
4735                         skb_metadata_set(skb, metalen);
4736                 break;
4737         default:
4738                 bpf_warn_invalid_xdp_action(act);
4739                 fallthrough;
4740         case XDP_ABORTED:
4741                 trace_xdp_exception(skb->dev, xdp_prog, act);
4742                 fallthrough;
4743         case XDP_DROP:
4744         do_drop:
4745                 kfree_skb(skb);
4746                 break;
4747         }
4748
4749         return act;
4750 }
4751
4752 /* When doing generic XDP we have to bypass the qdisc layer and the
4753  * network taps in order to match in-driver-XDP behavior.
4754  */
4755 void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4756 {
4757         struct net_device *dev = skb->dev;
4758         struct netdev_queue *txq;
4759         bool free_skb = true;
4760         int cpu, rc;
4761
4762         txq = netdev_core_pick_tx(dev, skb, NULL);
4763         cpu = smp_processor_id();
4764         HARD_TX_LOCK(dev, txq, cpu);
4765         if (!netif_xmit_stopped(txq)) {
4766                 rc = netdev_start_xmit(skb, dev, txq, 0);
4767                 if (dev_xmit_complete(rc))
4768                         free_skb = false;
4769         }
4770         HARD_TX_UNLOCK(dev, txq);
4771         if (free_skb) {
4772                 trace_xdp_exception(dev, xdp_prog, XDP_TX);
4773                 kfree_skb(skb);
4774         }
4775 }
4776
4777 static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4778
4779 int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4780 {
4781         if (xdp_prog) {
4782                 struct xdp_buff xdp;
4783                 u32 act;
4784                 int err;
4785
4786                 act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4787                 if (act != XDP_PASS) {
4788                         switch (act) {
4789                         case XDP_REDIRECT:
4790                                 err = xdp_do_generic_redirect(skb->dev, skb,
4791                                                               &xdp, xdp_prog);
4792                                 if (err)
4793                                         goto out_redir;
4794                                 break;
4795                         case XDP_TX:
4796                                 generic_xdp_tx(skb, xdp_prog);
4797                                 break;
4798                         }
4799                         return XDP_DROP;
4800                 }
4801         }
4802         return XDP_PASS;
4803 out_redir:
4804         kfree_skb(skb);
4805         return XDP_DROP;
4806 }
4807 EXPORT_SYMBOL_GPL(do_xdp_generic);
4808
4809 static int netif_rx_internal(struct sk_buff *skb)
4810 {
4811         int ret;
4812
4813         net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
4814
4815         trace_netif_rx(skb);
4816
4817 #ifdef CONFIG_RPS
4818         if (static_branch_unlikely(&rps_needed)) {
4819                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4820                 int cpu;
4821
4822                 preempt_disable();
4823                 rcu_read_lock();
4824
4825                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
4826                 if (cpu < 0)
4827                         cpu = smp_processor_id();
4828
4829                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4830
4831                 rcu_read_unlock();
4832                 preempt_enable();
4833         } else
4834 #endif
4835         {
4836                 unsigned int qtail;
4837
4838                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4839                 put_cpu();
4840         }
4841         return ret;
4842 }
4843
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	Takes a packet handed over by a device driver and queues it for
 *	processing by the upper (protocol) levels.  The call itself always
 *	succeeds; the buffer may still be dropped later, either for
 *	congestion control or by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int verdict;

	trace_netif_rx_entry(skb);
	verdict = netif_rx_internal(skb);
	trace_netif_rx_exit(verdict);

	return verdict;
}
EXPORT_SYMBOL(netif_rx);
4871
/*
 * Variant of netif_rx() that, with preemption disabled, queues the skb and
 * then runs any pending softirqs itself via do_softirq().  Returns the
 * netif_rx_internal() result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int verdict;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	verdict = netif_rx_internal(skb);
	/* Process what was just queued instead of waiting for a softirq. */
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	trace_netif_rx_ni_exit(verdict);

	return verdict;
}
EXPORT_SYMBOL(netif_rx_ni);
4888
int netif_rx_any_context(struct sk_buff *skb)
{
	/*
	 * Contexts that do not run bottom half processing on their own -
	 * neither at return from interrupt nor when softirqs are
	 * re-enabled - need netif_rx_ni(), which invokes bottom half
	 * processing directly.  Interrupt context can use netif_rx().
	 */
	return in_interrupt() ? netif_rx(skb) : netif_rx_ni(skb);
}
EXPORT_SYMBOL(netif_rx_any_context);
4903
4904 static __latent_entropy void net_tx_action(struct softirq_action *h)
4905 {
4906         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4907
4908         if (sd->completion_queue) {
4909                 struct sk_buff *clist;
4910
4911                 local_irq_disable();
4912                 clist = sd->completion_queue;
4913                 sd->completion_queue = NULL;
4914                 local_irq_enable();
4915
4916                 while (clist) {
4917                         struct sk_buff *skb = clist;
4918
4919                         clist = clist->next;
4920
4921                         WARN_ON(refcount_read(&skb->users));
4922                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4923                                 trace_consume_skb(skb);
4924                         else
4925                                 trace_kfree_skb(skb, net_tx_action);
4926
4927                         if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4928                                 __kfree_skb(skb);
4929                         else
4930                                 __kfree_skb_defer(skb);
4931                 }
4932
4933                 __kfree_skb_flush();
4934         }
4935
4936         if (sd->output_queue) {
4937                 struct Qdisc *head;
4938
4939                 local_irq_disable();
4940                 head = sd->output_queue;
4941                 sd->output_queue = NULL;
4942                 sd->output_queue_tailp = &sd->output_queue;
4943                 local_irq_enable();
4944
4945                 rcu_read_lock();
4946
4947                 while (head) {
4948                         struct Qdisc *q = head;
4949                         spinlock_t *root_lock = NULL;
4950
4951                         head = head->next_sched;
4952
4953                         /* We need to make sure head->next_sched is read
4954                          * before clearing __QDISC_STATE_SCHED
4955                          */
4956                         smp_mb__before_atomic();
4957
4958                         if (!(q->flags & TCQ_F_NOLOCK)) {
4959                                 root_lock = qdisc_lock(q);
4960                                 spin_lock(root_lock);
4961                         } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
4962                                                      &q->state))) {
4963                                 /* There is a synchronize_net() between
4964                                  * STATE_DEACTIVATED flag being set and
4965                                  * qdisc_reset()/some_qdisc_is_busy() in
4966                                  * dev_deactivate(), so we can safely bail out
4967                                  * early here to avoid data race between
4968                                  * qdisc_deactivate() and some_qdisc_is_busy()
4969                                  * for lockless qdisc.
4970                                  */
4971                                 clear_bit(__QDISC_STATE_SCHED, &q->state);
4972                                 continue;
4973                         }
4974
4975                         clear_bit(__QDISC_STATE_SCHED, &q->state);
4976                         qdisc_run(q);
4977                         if (root_lock)
4978                                 spin_unlock(root_lock);
4979                 }
4980
4981                 rcu_read_unlock();
4982         }
4983
4984         xfrm_dev_backlog(sd);
4985 }
4986
4987 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4988 /* This hook is defined here for ATM LANE */
4989 int (*br_fdb_test_addr_hook)(struct net_device *dev,
4990                              unsigned char *addr) __read_mostly;
4991 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4992 #endif
4993
4994 static inline struct sk_buff *
4995 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4996                    struct net_device *orig_dev, bool *another)
4997 {
4998 #ifdef CONFIG_NET_CLS_ACT
4999         struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
5000         struct tcf_result cl_res;
5001
5002         /* If there's at least one ingress present somewhere (so
5003          * we get here via enabled static key), remaining devices
5004          * that are not configured with an ingress qdisc will bail
5005          * out here.
5006          */
5007         if (!miniq)
5008                 return skb;
5009
5010         if (*pt_prev) {
5011                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
5012                 *pt_prev = NULL;
5013         }
5014
5015         qdisc_skb_cb(skb)->pkt_len = skb->len;
5016         qdisc_skb_cb(skb)->mru = 0;
5017         skb->tc_at_ingress = 1;
5018         mini_qdisc_bstats_cpu_update(miniq, skb);
5019
5020         switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
5021                                      &cl_res, false)) {
5022         case TC_ACT_OK:
5023         case TC_ACT_RECLASSIFY:
5024                 skb->tc_index = TC_H_MIN(cl_res.classid);
5025                 break;
5026         case TC_ACT_SHOT:
5027                 mini_qdisc_qstats_cpu_drop(miniq);
5028                 kfree_skb(skb);
5029                 return NULL;
5030         case TC_ACT_STOLEN:
5031         case TC_ACT_QUEUED:
5032         case TC_ACT_TRAP:
5033                 consume_skb(skb);
5034                 return NULL;
5035         case TC_ACT_REDIRECT:
5036                 /* skb_mac_header check was done by cls/act_bpf, so
5037                  * we can safely push the L2 header back before
5038                  * redirecting to another netdev
5039                  */
5040                 __skb_push(skb, skb->mac_len);
5041                 if (skb_do_redirect(skb) == -EAGAIN) {
5042                         __skb_pull(skb, skb->mac_len);
5043                         *another = true;
5044                         break;
5045                 }
5046                 return NULL;
5047         case TC_ACT_CONSUMED:
5048                 return NULL;
5049         default:
5050                 break;
5051         }
5052 #endif /* CONFIG_NET_CLS_ACT */
5053         return skb;
5054 }
5055
5056 /**
5057  *      netdev_is_rx_handler_busy - check if receive handler is registered
5058  *      @dev: device to check
5059  *
5060  *      Check if a receive handler is already registered for a given device.
5061  *      Return true if there one.
5062  *
5063  *      The caller must hold the rtnl_mutex.
5064  */
5065 bool netdev_is_rx_handler_busy(struct net_device *dev)
5066 {
5067         ASSERT_RTNL();
5068         return dev && rtnl_dereference(dev->rx_handler);
5069 }
5070 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5071
5072 /**
5073  *      netdev_rx_handler_register - register receive handler
5074  *      @dev: device to register a handler for
5075  *      @rx_handler: receive handler to register
5076  *      @rx_handler_data: data pointer that is used by rx handler
5077  *
5078  *      Register a receive handler for a device. This handler will then be
5079  *      called from __netif_receive_skb. A negative errno code is returned
5080  *      on a failure.
5081  *
5082  *      The caller must hold the rtnl_mutex.
5083  *
5084  *      For a general description of rx_handler, see enum rx_handler_result.
5085  */
5086 int netdev_rx_handler_register(struct net_device *dev,
5087                                rx_handler_func_t *rx_handler,
5088                                void *rx_handler_data)
5089 {
5090         if (netdev_is_rx_handler_busy(dev))
5091                 return -EBUSY;
5092
5093         if (dev->priv_flags & IFF_NO_RX_HANDLER)
5094                 return -EINVAL;
5095
5096         /* Note: rx_handler_data must be set before rx_handler */
5097         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5098         rcu_assign_pointer(dev->rx_handler, rx_handler);
5099
5100         return 0;
5101 }
5102 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
5103
5104 /**
5105  *      netdev_rx_handler_unregister - unregister receive handler
5106  *      @dev: device to unregister a handler from
5107  *
5108  *      Unregister a receive handler from a device.
5109  *
5110  *      The caller must hold the rtnl_mutex.
5111  */
void netdev_rx_handler_unregister(struct net_device *dev)
{

        ASSERT_RTNL();
        /* Step 1: clear the handler pointer itself. */
        RCU_INIT_POINTER(dev->rx_handler, NULL);
        /* Step 2: wait before touching the data pointer, so that a reader
         * seeing a non NULL rx_handler in a rcu_read_lock() section has a
         * guarantee to see a non NULL rx_handler_data as well.
         */
        synchronize_net();
        /* Step 3: only now clear the data pointer. */
        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
5124 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5125
5126 /*
5127  * Limit the use of PFMEMALLOC reserves to those protocols that implement
5128  * the special handling of PFMEMALLOC skbs.
5129  */
5130 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5131 {
5132         switch (skb->protocol) {
5133         case htons(ETH_P_ARP):
5134         case htons(ETH_P_IP):
5135         case htons(ETH_P_IPV6):
5136         case htons(ETH_P_8021Q):
5137         case htons(ETH_P_8021AD):
5138                 return true;
5139         default:
5140                 return false;
5141         }
5142 }
5143
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
                             int *ret, struct net_device *orig_dev)
{
        int verdict;

        /* Nothing to do (and 0 returned) when the ingress hook is inactive. */
        if (!nf_hook_ingress_active(skb))
                return 0;

        /* Deliver to the pending packet_type, if any, before the hook runs,
         * and tell the caller it has been consumed.
         */
        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        rcu_read_lock();
        verdict = nf_hook_ingress(skb);
        rcu_read_unlock();

        return verdict;
}
5162
/*
 * Core receive processing for one skb.
 *
 * @pskb:       in/out pointer to the skb; on return it holds the skb as left
 *              by processing (which may differ from the one passed in)
 * @pfmemalloc: when true, the ptype_all taps are skipped and the packet is
 *              dropped unless skb_pfmemalloc_protocol() accepts its protocol
 * @ppt_prev:   out; set to the last matching packet_type, which is NOT
 *              invoked here and is left for the caller to call. It is only
 *              written on the path where such a handler exists.
 *
 * Throughout the function pt_prev holds the most recently matched handler;
 * delivery is deferred by one step (deliver the previous one, remember the
 * new one) so that the final handler can be handed back through @ppt_prev.
 *
 * Returns the value of the last deliver_skb() call, NET_RX_SUCCESS when the
 * rx_handler consumed the packet, or NET_RX_DROP.
 */
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
                                    struct packet_type **ppt_prev)
{
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct sk_buff *skb = *pskb;
        struct net_device *orig_dev;
        bool deliver_exact = false;
        int ret = NET_RX_DROP;
        __be16 type;

        net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);

        trace_netif_receive_skb(skb);

        /* Remember the device the packet arrived on; skb->dev may be
         * different by the time handlers are called.
         */
        orig_dev = skb->dev;

        skb_reset_network_header(skb);
        if (!skb_transport_header_was_set(skb))
                skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);

        pt_prev = NULL;

        /* Re-entry point: several steps below jump back here to restart
         * processing of the (possibly modified) skb.
         */
another_round:
        skb->skb_iif = skb->dev->ifindex;

        __this_cpu_inc(softnet_data.processed);

        if (static_branch_unlikely(&generic_xdp_needed_key)) {
                int ret2;

                preempt_disable();
                ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
                preempt_enable();

                /* Any verdict other than XDP_PASS ends processing here. */
                if (ret2 != XDP_PASS) {
                        ret = NET_RX_DROP;
                        goto out;
                }
                skb_reset_mac_len(skb);
        }

        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
                skb = skb_vlan_untag(skb);
                if (unlikely(!skb))
                        goto out;
        }

        if (skb_skip_tc_classify(skb))
                goto skip_classify;

        if (pfmemalloc)
                goto skip_taps;

        /* Global taps first, then the taps bound to the current device. */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }

        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }

skip_taps:
#ifdef CONFIG_NET_INGRESS
        if (static_branch_unlikely(&ingress_needed_key)) {
                bool another = false;

                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
                                         &another);
                if (another)
                        goto another_round;
                if (!skb)
                        goto out;

                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
                        goto out;
        }
#endif
        skb_reset_redirect(skb);
skip_classify:
        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
                goto drop;

        if (skb_vlan_tag_present(skb)) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                if (vlan_do_receive(&skb))
                        goto another_round;
                else if (unlikely(!skb))
                        goto out;
        }

        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
                /* Flush the pending handler before giving the skb to the
                 * rx_handler.
                 */
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
                        ret = NET_RX_SUCCESS;
                        goto out;
                case RX_HANDLER_ANOTHER:
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
                        /* intentional fall through: continue like PASS */
                case RX_HANDLER_PASS:
                        break;
                default:
                        BUG();
                }
        }

        if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id:
                if (skb_vlan_tag_get_id(skb)) {
                        /* Vlan id is non 0 and vlan_do_receive() above couldn't
                         * find vlan device.
                         */
                        skb->pkt_type = PACKET_OTHERHOST;
                } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
                           skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
                        /* Outer header is 802.1P with vlan 0, inner header is
                         * 802.1Q or 802.1AD and vlan_do_receive() above could
                         * not find vlan dev for vlan id 0.
                         */
                        __vlan_hwaccel_clear_tag(skb);
                        skb = skb_vlan_untag(skb);
                        if (unlikely(!skb))
                                goto out;
                        if (vlan_do_receive(&skb))
                                /* After stripping off 802.1P header with vlan 0
                                 * vlan dev is found for inner header.
                                 */
                                goto another_round;
                        else if (unlikely(!skb))
                                goto out;
                        else
                                /* We have stripped outer 802.1P vlan 0 header.
                                 * But could not find vlan dev.
                                 * check again for vlan id to set OTHERHOST.
                                 */
                                goto check_vlan_id;
                }
                /* Note: we might in the future use prio bits
                 * and set skb->priority like in vlan_do_receive()
                 * For the time being, just ignore Priority Code Point
                 */
                __vlan_hwaccel_clear_tag(skb);
        }

        type = skb->protocol;

        /* deliver only exact match when indicated */
        if (likely(!deliver_exact)) {
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &ptype_base[ntohs(type) &
                                                   PTYPE_HASH_MASK]);
        }

        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                               &orig_dev->ptype_specific);

        /* If the skb now belongs to a different device than it arrived on,
         * that device's specific handlers get a chance as well.
         */
        if (unlikely(skb->dev != orig_dev)) {
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &skb->dev->ptype_specific);
        }

        if (pt_prev) {
                if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
                        goto drop;
                /* Hand the final handler back instead of calling it here. */
                *ppt_prev = pt_prev;
        } else {
                /* Also reached by goto from the pfmemalloc protocol check and
                 * from the skb_orphan_frags_rx() failure just above.
                 */
drop:
                if (!deliver_exact)
                        atomic_long_inc(&skb->dev->rx_dropped);
                else
                        atomic_long_inc(&skb->dev->rx_nohandler);
                kfree_skb(skb);
                /* Jamal, now you will not be able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        /* The invariant here is that if *ppt_prev is not NULL
         * then skb should also be non-NULL.
         *
         * Apparently *ppt_prev assignment above holds this invariant due to
         * skb dereferencing near it.
         */
        *pskb = skb;
        return ret;
}
5366
5367 static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5368 {
5369         struct net_device *orig_dev = skb->dev;
5370         struct packet_type *pt_prev = NULL;
5371         int ret;
5372
5373         ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5374         if (pt_prev)
5375                 ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5376                                          skb->dev, pt_prev, orig_dev);
5377         return ret;
5378 }
5379
5380 /**
5381  *      netif_receive_skb_core - special purpose version of netif_receive_skb
5382  *      @skb: buffer to process
5383  *
5384  *      More direct receive version of netif_receive_skb().  It should
5385  *      only be used by callers that have a need to skip RPS and Generic XDP.
5386  *      Caller must also take care of handling if ``(page_is_)pfmemalloc``.
5387  *
5388  *      This function may only be called from softirq context and interrupts
5389  *      should be enabled.
5390  *
5391  *      Return values (usually ignored):
5392  *      NET_RX_SUCCESS: no congestion
5393  *      NET_RX_DROP: packet was dropped
5394  */
int netif_receive_skb_core(struct sk_buff *skb)
{
        int rc;

        /* The whole receive pass runs inside one RCU read-side section.
         * pfmemalloc is always passed as false here.
         */
        rcu_read_lock();
        rc = __netif_receive_skb_one_core(skb, false);
        rcu_read_unlock();

        return rc;
}
5405 EXPORT_SYMBOL(netif_receive_skb_core);
5406
5407 static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5408                                                   struct packet_type *pt_prev,
5409                                                   struct net_device *orig_dev)
5410 {
5411         struct sk_buff *skb, *next;
5412
5413         if (!pt_prev)
5414                 return;
5415         if (list_empty(head))
5416                 return;
5417         if (pt_prev->list_func != NULL)
5418                 INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5419                                    ip_list_rcv, head, pt_prev, orig_dev);
5420         else
5421                 list_for_each_entry_safe(skb, next, head, list) {
5422                         skb_list_del_init(skb);
5423                         pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5424                 }
5425 }
5426
5427 static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5428 {
5429         /* Fast-path assumptions:
5430          * - There is no RX handler.
5431          * - Only one packet_type matches.
5432          * If either of these fails, we will end up doing some per-packet
5433          * processing in-line, then handling the 'last ptype' for the whole
5434          * sublist.  This can't cause out-of-order delivery to any single ptype,
5435          * because the 'last ptype' must be constant across the sublist, and all
5436          * other ptypes are handled per-packet.
5437          */
5438         /* Current (common) ptype of sublist */
5439         struct packet_type *pt_curr = NULL;
5440         /* Current (common) orig_dev of sublist */
5441         struct net_device *od_curr = NULL;
5442         struct list_head sublist;
5443         struct sk_buff *skb, *next;
5444
5445         INIT_LIST_HEAD(&sublist);
5446         list_for_each_entry_safe(skb, next, head, list) {
5447                 struct net_device *orig_dev = skb->dev;
5448                 struct packet_type *pt_prev = NULL;
5449
5450                 skb_list_del_init(skb);
5451                 __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5452                 if (!pt_prev)
5453                         continue;
5454                 if (pt_curr != pt_prev || od_curr != orig_dev) {
5455                         /* dispatch old sublist */
5456                         __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5457                         /* start new sublist */
5458                         INIT_LIST_HEAD(&sublist);
5459                         pt_curr = pt_prev;
5460                         od_curr = orig_dev;
5461                 }
5462                 list_add_tail(&skb->list, &sublist);
5463         }
5464
5465         /* dispatch final sublist */
5466         __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5467 }
5468
static int __netif_receive_skb(struct sk_buff *skb)
{
        unsigned int noreclaim_flag;
        int ret;

        /* Common case: not a PFMEMALLOC skb (or no memalloc sockets). */
        if (!sk_memalloc_socks() || !skb_pfmemalloc(skb))
                return __netif_receive_skb_one_core(skb, false);

        /*
         * PFMEMALLOC skbs are special, they should
         * - be delivered to SOCK_MEMALLOC sockets only
         * - stay away from userspace
         * - have bounded memory usage
         *
         * Running the receive pass under PF_MEMALLOC avoids having to
         * propagate the allocation context to every allocation site.
         */
        noreclaim_flag = memalloc_noreclaim_save();
        ret = __netif_receive_skb_one_core(skb, true);
        memalloc_noreclaim_restore(noreclaim_flag);

        return ret;
}
5493
/*
 * Process a list of skbs, splitting it into runs that share the same
 * pfmemalloc status. Each run is handed to __netif_receive_skb_list_core();
 * runs with pfmemalloc set are processed between memalloc_noreclaim_save()
 * and memalloc_noreclaim_restore().
 */
static void __netif_receive_skb_list(struct list_head *head)
{
        unsigned long noreclaim_flag = 0;
        struct sk_buff *skb, *next;
        bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */

        list_for_each_entry_safe(skb, next, head, list) {
                /* This skb's status differs from the current run's: the run
                 * ends just before it.
                 */
                if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
                        struct list_head sublist;

                        /* Handle the previous sublist */
                        list_cut_before(&sublist, head, &skb->list);
                        if (!list_empty(&sublist))
                                __netif_receive_skb_list_core(&sublist, pfmemalloc);
                        pfmemalloc = !pfmemalloc;
                        /* See comments in __netif_receive_skb */
                        if (pfmemalloc)
                                noreclaim_flag = memalloc_noreclaim_save();
                        else
                                memalloc_noreclaim_restore(noreclaim_flag);
                }
        }
        /* Handle the remaining sublist */
        if (!list_empty(head))
                __netif_receive_skb_list_core(head, pfmemalloc);
        /* Restore pflags */
        if (pfmemalloc)
                memalloc_noreclaim_restore(noreclaim_flag);
}
5523
5524 static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5525 {
5526         struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5527         struct bpf_prog *new = xdp->prog;
5528         int ret = 0;
5529
5530         if (new) {
5531                 u32 i;
5532
5533                 mutex_lock(&new->aux->used_maps_mutex);
5534
5535                 /* generic XDP does not work with DEVMAPs that can
5536                  * have a bpf_prog installed on an entry
5537                  */
5538                 for (i = 0; i < new->aux->used_map_cnt; i++) {
5539                         if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
5540                             cpu_map_prog_allowed(new->aux->used_maps[i])) {
5541                                 mutex_unlock(&new->aux->used_maps_mutex);
5542                                 return -EINVAL;
5543                         }
5544                 }
5545
5546                 mutex_unlock(&new->aux->used_maps_mutex);
5547         }
5548
5549         switch (xdp->command) {
5550         case XDP_SETUP_PROG:
5551                 rcu_assign_pointer(dev->xdp_prog, new);
5552                 if (old)
5553                         bpf_prog_put(old);
5554
5555                 if (old && !new) {
5556                         static_branch_dec(&generic_xdp_needed_key);
5557                 } else if (new && !old) {
5558                         static_branch_inc(&generic_xdp_needed_key);
5559                         dev_disable_lro(dev);
5560                         dev_disable_gro_hw(dev);
5561                 }
5562                 break;
5563
5564         default:
5565                 ret = -EINVAL;
5566                 break;
5567         }
5568
5569         return ret;
5570 }
5571
5572 static int netif_receive_skb_internal(struct sk_buff *skb)
5573 {
5574         int ret;
5575
5576         net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
5577
5578         if (skb_defer_rx_timestamp(skb))
5579                 return NET_RX_SUCCESS;
5580
5581         rcu_read_lock();
5582 #ifdef CONFIG_RPS
5583         if (static_branch_unlikely(&rps_needed)) {
5584                 struct rps_dev_flow voidflow, *rflow = &voidflow;
5585                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5586
5587                 if (cpu >= 0) {
5588                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5589                         rcu_read_unlock();
5590                         return ret;
5591                 }
5592         }
5593 #endif
5594         ret = __netif_receive_skb(skb);
5595         rcu_read_unlock();
5596         return ret;
5597 }
5598
5599 static void netif_receive_skb_list_internal(struct list_head *head)
5600 {
5601         struct sk_buff *skb, *next;
5602         struct list_head sublist;
5603
5604         INIT_LIST_HEAD(&sublist);
5605         list_for_each_entry_safe(skb, next, head, list) {
5606                 net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
5607                 skb_list_del_init(skb);
5608                 if (!skb_defer_rx_timestamp(skb))
5609                         list_add_tail(&skb->list, &sublist);
5610         }
5611         list_splice_init(&sublist, head);
5612
5613         rcu_read_lock();
5614 #ifdef CONFIG_RPS
5615         if (static_branch_unlikely(&rps_needed)) {
5616                 list_for_each_entry_safe(skb, next, head, list) {
5617                         struct rps_dev_flow voidflow, *rflow = &voidflow;
5618                         int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5619
5620                         if (cpu >= 0) {
5621                                 /* Will be handled, remove from list */
5622                                 skb_list_del_init(skb);
5623                                 enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5624                         }
5625                 }
5626         }
5627 #endif
5628         __netif_receive_skb_list(head);
5629         rcu_read_unlock();
5630 }
5631
5632 /**
5633  *      netif_receive_skb - process receive buffer from network
5634  *      @skb: buffer to process
5635  *
5636  *      netif_receive_skb() is the main receive data processing function.
5637  *      It always succeeds. The buffer may be dropped during processing
5638  *      for congestion control or by the protocol layers.
5639  *
5640  *      This function may only be called from softirq context and interrupts
5641  *      should be enabled.
5642  *
5643  *      Return values (usually ignored):
5644  *      NET_RX_SUCCESS: no congestion
5645  *      NET_RX_DROP: packet was dropped
5646  */
int netif_receive_skb(struct sk_buff *skb)
{
        int rc;

        /* Entry/exit tracepoints bracket the actual receive work. */
        trace_netif_receive_skb_entry(skb);
        rc = netif_receive_skb_internal(skb);
        trace_netif_receive_skb_exit(rc);

        return rc;
}
5658 EXPORT_SYMBOL(netif_receive_skb);
5659
5660 /**
5661  *      netif_receive_skb_list - process many receive buffers from network
5662  *      @head: list of skbs to process.
5663  *
5664  *      Since return value of netif_receive_skb() is normally ignored, and
5665  *      wouldn't be meaningful for a list, this function returns void.
5666  *
5667  *      This function may only be called from softirq context and interrupts
5668  *      should be enabled.
5669  */
5670 void netif_receive_skb_list(struct list_head *head)
5671 {
5672         struct sk_buff *skb;
5673
5674         if (list_empty(head))
5675                 return;
5676         if (trace_netif_receive_skb_list_entry_enabled()) {
5677                 list_for_each_entry(skb, head, list)
5678                         trace_netif_receive_skb_list_entry(skb);
5679         }
5680         netif_receive_skb_list_internal(head);
5681         trace_netif_receive_skb_list_exit(0);
5682 }
5683 EXPORT_SYMBOL(netif_receive_skb_list);
5684
5685 static DEFINE_PER_CPU(struct work_struct, flush_works);
5686
5687 /* Network device is going away, flush any packets still pending */
/*
 * Work function: drop from this CPU's softnet_data queues every skb whose
 * device is in the NETREG_UNREGISTERING state. @work itself is not used.
 */
static void flush_backlog(struct work_struct *work)
{
        struct sk_buff *skb, *tmp;
        struct softnet_data *sd;

        local_bh_disable();
        sd = this_cpu_ptr(&softnet_data);

        /* input_pkt_queue is walked with IRQs off and the rps lock held;
         * skbs removed here are freed with dev_kfree_skb_irq().
         */
        local_irq_disable();
        rps_lock(sd);
        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
                        __skb_unlink(skb, &sd->input_pkt_queue);
                        dev_kfree_skb_irq(skb);
                        input_queue_head_incr(sd);
                }
        }
        rps_unlock(sd);
        local_irq_enable();

        /* process_queue is walked with only bottom halves disabled. */
        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
                        __skb_unlink(skb, &sd->process_queue);
                        kfree_skb(skb);
                        input_queue_head_incr(sd);
                }
        }
        local_bh_enable();
}
5717
5718 static bool flush_required(int cpu)
5719 {
5720 #if IS_ENABLED(CONFIG_RPS)
5721         struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5722         bool do_flush;
5723
5724         local_irq_disable();
5725         rps_lock(sd);
5726
5727         /* as insertion into process_queue happens with the rps lock held,
5728          * process_queue access may race only with dequeue
5729          */
5730         do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
5731                    !skb_queue_empty_lockless(&sd->process_queue);
5732         rps_unlock(sd);
5733         local_irq_enable();
5734
5735         return do_flush;
5736 #endif
5737         /* without RPS we can't safely check input_pkt_queue: during a
5738          * concurrent remote skb_queue_splice() we can detect as empty both
5739          * input_pkt_queue and process_queue even if the latter could end-up
5740          * containing a lot of packets.
5741          */
5742         return true;
5743 }
5744
5745 static void flush_all_backlogs(void)
5746 {
5747         static cpumask_t flush_cpus;
5748         unsigned int cpu;
5749
5750         /* since we are under rtnl lock protection we can use static data
5751          * for the cpumask and avoid allocating on stack the possibly
5752          * large mask
5753          */
5754         ASSERT_RTNL();
5755
5756         get_online_cpus();
5757
5758         cpumask_clear(&flush_cpus);
5759         for_each_online_cpu(cpu) {
5760                 if (flush_required(cpu)) {
5761                         queue_work_on(cpu, system_highpri_wq,
5762                                       per_cpu_ptr(&flush_works, cpu));
5763                         cpumask_set_cpu(cpu, &flush_cpus);
5764                 }
5765         }
5766
5767         /* we can have in flight packet[s] on the cpus we are not flushing,
5768          * synchronize_net() in unregister_netdevice_many() will take care of
5769          * them
5770          */
5771         for_each_cpu(cpu, &flush_cpus)
5772                 flush_work(per_cpu_ptr(&flush_works, cpu));
5773
5774         put_online_cpus();
5775 }
5776
5777 /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
5778 static void gro_normal_list(struct napi_struct *napi)
5779 {
5780         if (!napi->rx_count)
5781                 return;
5782         netif_receive_skb_list_internal(&napi->rx_list);
5783         INIT_LIST_HEAD(&napi->rx_list);
5784         napi->rx_count = 0;
5785 }
5786
5787 /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
5788  * pass the whole batch up to the stack.
5789  */
5790 static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
5791 {
5792         list_add_tail(&skb->list, &napi->rx_list);
5793         napi->rx_count += segs;
5794         if (napi->rx_count >= gro_normal_batch)
5795                 gro_normal_list(napi);
5796 }
5797
5798 INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
5799 INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
/*
 * Finish GRO processing of @skb and queue it on @napi's GRO_NORMAL batch.
 *
 * When the skb was merged from more than one packet (count != 1), the
 * gro_complete callback of the first packet_offload matching skb->protocol
 * is run first. If that step fails (or no offload matched) the skb is freed.
 *
 * Always returns NET_RX_SUCCESS.
 */
static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
{
        struct packet_offload *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &offload_base;
        /* Stays -ENOENT if no offload with a gro_complete callback matches. */
        int err = -ENOENT;

        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));

        /* count == 1: clear gso_size and skip the callbacks entirely. */
        if (NAPI_GRO_CB(skb)->count == 1) {
                skb_shinfo(skb)->gso_size = 0;
                goto out;
        }

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
                if (ptype->type != type || !ptype->callbacks.gro_complete)
                        continue;

                err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
                                         ipv6_gro_complete, inet_gro_complete,
                                         skb, 0);
                /* Only the first matching offload is consulted. */
                break;
        }
        rcu_read_unlock();

        if (err) {
                /* Warn when the loop ran off the end of the list, i.e. no
                 * offload matched at all.
                 */
                WARN_ON(&ptype->list == head);
                kfree_skb(skb);
                return NET_RX_SUCCESS;
        }

out:
        gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
        return NET_RX_SUCCESS;
}
5836
5837 static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5838                                    bool flush_old)
5839 {
5840         struct list_head *head = &napi->gro_hash[index].list;
5841         struct sk_buff *skb, *p;
5842
5843         list_for_each_entry_safe_reverse(skb, p, head, list) {
5844                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
5845                         return;
5846                 skb_list_del_init(skb);
5847                 napi_gro_complete(napi, skb);
5848                 napi->gro_hash[index].count--;
5849         }
5850
5851         if (!napi->gro_hash[index].count)
5852                 __clear_bit(index, &napi->gro_bitmask);
5853 }
5854
5855 /* napi->gro_hash[].list contains packets ordered by age.
5856  * youngest packets at the head of it.
5857  * Complete skbs in reverse order to reduce latencies.
5858  */
5859 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
5860 {
5861         unsigned long bitmask = napi->gro_bitmask;
5862         unsigned int i, base = ~0U;
5863
5864         while ((i = ffs(bitmask)) != 0) {
5865                 bitmask >>= i;
5866                 base += i;
5867                 __napi_gro_flush_chain(napi, base, flush_old);
5868         }
5869 }
5870 EXPORT_SYMBOL(napi_gro_flush);
5871
/*
 * Select the GRO hash bucket for @skb and mark each packet already held in
 * that bucket with whether it belongs to the same flow as @skb
 * (NAPI_GRO_CB(p)->same_flow). The flush flag of every held packet is reset.
 *
 * Returns the list head of the selected bucket.
 */
static struct list_head *gro_list_prepare(struct napi_struct *napi,
                                          struct sk_buff *skb)
{
        unsigned int maclen = skb->dev->hard_header_len;
        u32 hash = skb_get_hash_raw(skb);
        struct list_head *head;
        struct sk_buff *p;

        /* The low bits of the raw hash pick the bucket. */
        head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
        list_for_each_entry(p, head, list) {
                /* Non-zero once any compared attribute differs. */
                unsigned long diffs;

                NAPI_GRO_CB(p)->flush = 0;

                /* Different full hash: different flow, skip the rest. */
                if (hash != skb_get_hash_raw(p)) {
                        NAPI_GRO_CB(p)->same_flow = 0;
                        continue;
                }

                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
                diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
                if (skb_vlan_tag_present(p))
                        diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
                diffs |= skb_metadata_dst_cmp(p, skb);
                diffs |= skb_metadata_differs(p, skb);
                /* Ethernet-sized headers use the dedicated comparison; other
                 * lengths fall back to memcmp(), and only if nothing has
                 * differed so far.
                 */
                if (maclen == ETH_HLEN)
                        diffs |= compare_ether_header(skb_mac_header(p),
                                                      skb_mac_header(skb));
                else if (!diffs)
                        diffs = memcmp(skb_mac_header(p),
                                       skb_mac_header(skb),
                                       maclen);

                diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
                if (!diffs) {
                        struct tc_skb_ext *skb_ext = skb_ext_find(skb, TC_SKB_EXT);
                        struct tc_skb_ext *p_ext = skb_ext_find(p, TC_SKB_EXT);

                        /* Both must have the extension, or neither. */
                        diffs |= (!!p_ext) ^ (!!skb_ext);
                        if (!diffs && unlikely(skb_ext))
                                diffs |= p_ext->chain ^ skb_ext->chain;
                }
#endif

                NAPI_GRO_CB(p)->same_flow = !diffs;
        }

        return head;
}
5922
     /*
      * Reset the per-packet GRO parsing state kept in NAPI_GRO_CB(skb).
      *
      * data_offset is cleared and frag0/frag0_len are reset; they are then
      * pointed at the first page fragment when the packet has no linear data
      * and that fragment passes the checks below. @nhoff is only used in the
      * alignment test.
      */
5923 static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
5924 {
5925         const struct skb_shared_info *pinfo = skb_shinfo(skb);
5926         const skb_frag_t *frag0 = &pinfo->frags[0];
5927
5928         NAPI_GRO_CB(skb)->data_offset = 0;
5929         NAPI_GRO_CB(skb)->frag0 = NULL;
5930         NAPI_GRO_CB(skb)->frag0_len = 0;
5931
             /* frag0 is used only if the linear area is empty, at least one
              * fragment exists, its page is not in highmem, and (when
              * NET_IP_ALIGN is non-zero) fragment offset + nhoff is a
              * multiple of 4.
              */
5932         if (!skb_headlen(skb) && pinfo->nr_frags &&
5933             !PageHighMem(skb_frag_page(frag0)) &&
5934             (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
5935                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
                     /* Cap at the room left between skb->tail and skb->end. */
5936                 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
5937                                                     skb_frag_size(frag0),
5938                                                     skb->end - skb->tail);
5939         }
5940 }
5941
     /*
      * Copy the first @grow bytes addressed by NAPI_GRO_CB(skb)->frag0 to the
      * tail of the linear area and consume them from frags[0].
      *
      * BUG()s if fewer than @grow bytes are available between skb->tail and
      * skb->end. If frags[0] becomes empty it is unreferenced and the
      * remaining fragment descriptors are shifted down by one.
      */
5942 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
5943 {
5944         struct skb_shared_info *pinfo = skb_shinfo(skb);
5945
5946         BUG_ON(skb->end - skb->tail < grow);
5947
5948         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
5949
             /* The bytes moved from paged data into the linear area. */
5950         skb->data_len -= grow;
5951         skb->tail += grow;
5952
5953         skb_frag_off_add(&pinfo->frags[0], grow);
5954         skb_frag_size_sub(&pinfo->frags[0], grow);
5955
5956         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
5957                 skb_frag_unref(skb, 0);
                     /* nr_frags is decremented first, so exactly the
                      * remaining descriptors are moved.
                      */
5958                 memmove(pinfo->frags, pinfo->frags + 1,
5959                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
5960         }
5961 }
5962
     /*
      * Remove the last entry of the GRO list @head and hand it to
      * napi_gro_complete(). The bucket count is left untouched on purpose
      * (see the comment in the body).
      */
5963 static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
5964 {
5965         struct sk_buff *oldest;
5966
             /* dev_gro_receive() inserts with list_add() (at the head), so
              * the tail entry is the one that was queued first.
              */
5967         oldest = list_last_entry(head, struct sk_buff, list);
5968
5969         /* We are called with head length >= MAX_GRO_SKBS, so this is
5970          * impossible.
5971          */
             /* NOTE(review): list_last_entry() is normally container_of()
              * arithmetic and would not yield NULL even for an empty list -
              * confirm whether this check can ever trigger.
              */
5972         if (WARN_ON_ONCE(!oldest))
5973                 return;
5974
5975         /* Do not adjust napi->gro_hash[].count, caller is adding a new
5976          * SKB to the chain.
5977          */
5978         skb_list_del_init(oldest);
5979         napi_gro_complete(napi, oldest);
5980 }
5981
     /*
      * Declarations of the IPv4/IPv6 GRO receive handlers named by
      * INDIRECT_CALL_INET() in dev_gro_receive() below.
      */
5982 INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
5983                                                            struct sk_buff *));
5984 INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
5985                                                            struct sk_buff *));
     /*
      * Core GRO step for one received skb.
      *
      * Prepares the hash bucket for @skb, runs the gro_receive callback of the
      * first packet_offload whose type equals skb->protocol, and then either
      * reports the skb as merged, holds it on the bucket list, or falls back
      * to normal delivery.
      *
      * Returns:
      *   GRO_NORMAL      - GRO elided for the device, no matching offload, or
      *                     the callback set the flush flag on a new flow
      *   GRO_CONSUMED    - the callback returned ERR_PTR(-EINPROGRESS)
      *   GRO_MERGED      - skb matched an existing flow (same_flow set)
      *   GRO_MERGED_FREE - as above, and NAPI_GRO_CB(skb)->free is non-zero
      *   GRO_HELD        - skb was added to the bucket list as a new flow
      *
      * This function does not free or deliver @skb itself.
      */
5986 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5987 {
             /* Here "hash" is the bucket index, not the raw flow hash. */
5988         u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5989         struct list_head *head = &offload_base;
5990         struct packet_offload *ptype;
5991         __be16 type = skb->protocol;
5992         struct list_head *gro_head;
5993         struct sk_buff *pp = NULL;
5994         enum gro_result ret;
5995         int same_flow;
5996         int grow;
5997
5998         if (netif_elide_gro(skb->dev))
5999                 goto normal;
6000
6001         gro_head = gro_list_prepare(napi, skb);
6002
6003         rcu_read_lock();
6004         list_for_each_entry_rcu(ptype, head, list) {
6005                 if (ptype->type != type || !ptype->callbacks.gro_receive)
6006                         continue;
6007
                     /* First matching handler: initialise the per-skb GRO
                      * control block before calling it.
                      */
6008                 skb_set_network_header(skb, skb_gro_offset(skb));
6009                 skb_reset_mac_len(skb);
6010                 NAPI_GRO_CB(skb)->same_flow = 0;
                     /* Packets that are already GSO or carry a frag_list
                      * start with the flush flag set.
                      */
6011                 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
6012                 NAPI_GRO_CB(skb)->free = 0;
6013                 NAPI_GRO_CB(skb)->encap_mark = 0;
6014                 NAPI_GRO_CB(skb)->recursion_counter = 0;
6015                 NAPI_GRO_CB(skb)->is_fou = 0;
6016                 NAPI_GRO_CB(skb)->is_atomic = 1;
6017                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
6018
6019                 /* Setup for GRO checksum validation */
6020                 switch (skb->ip_summed) {
6021                 case CHECKSUM_COMPLETE:
6022                         NAPI_GRO_CB(skb)->csum = skb->csum;
6023                         NAPI_GRO_CB(skb)->csum_valid = 1;
6024                         NAPI_GRO_CB(skb)->csum_cnt = 0;
6025                         break;
6026                 case CHECKSUM_UNNECESSARY:
6027                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
6028                         NAPI_GRO_CB(skb)->csum_valid = 0;
6029                         break;
6030                 default:
6031                         NAPI_GRO_CB(skb)->csum_cnt = 0;
6032                         NAPI_GRO_CB(skb)->csum_valid = 0;
6033                 }
6034
6035                 pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
6036                                         ipv6_gro_receive, inet_gro_receive,
6037                                         gro_head, skb);
                     /* Only the first matching handler is ever called. */
6038                 break;
6039         }
6040         rcu_read_unlock();
6041
             /* The loop ran to its end without a break: the cursor's list
              * member is the list head itself, i.e. no handler matched.
              */
6042         if (&ptype->list == head)
6043                 goto normal;
6044
6045         if (PTR_ERR(pp) == -EINPROGRESS) {
6046                 ret = GRO_CONSUMED;
6047                 goto ok;
6048         }
6049
6050         same_flow = NAPI_GRO_CB(skb)->same_flow;
6051         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
6052
             /* A non-NULL pp returned by the callback is unlinked from its
              * list and completed now; the bucket loses one entry.
              */
6053         if (pp) {
6054                 skb_list_del_init(pp);
6055                 napi_gro_complete(napi, pp);
6056                 napi->gro_hash[hash].count--;
6057         }
6058
6059         if (same_flow)
6060                 goto ok;
6061
6062         if (NAPI_GRO_CB(skb)->flush)
6063                 goto normal;
6064
             /* New flow: hold the skb. A full bucket evicts its oldest
              * entry instead of growing, so count stays the same.
              */
6065         if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
6066                 gro_flush_oldest(napi, gro_head);
6067         } else {
6068                 napi->gro_hash[hash].count++;
6069         }
6070         NAPI_GRO_CB(skb)->count = 1;
6071         NAPI_GRO_CB(skb)->age = jiffies;
6072         NAPI_GRO_CB(skb)->last = skb;
6073         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
6074         list_add(&skb->list, gro_head);
6075         ret = GRO_HELD;
6076
6077 pull:
             /* If the GRO offset lies beyond the linear area, copy the
              * missing bytes in from frag0. Reached for GRO_HELD and, via
              * the "normal" label, for GRO_NORMAL.
              */
6078         grow = skb_gro_offset(skb) - skb_headlen(skb);
6079         if (grow > 0)
6080                 gro_pull_from_frag0(skb, grow);
6081 ok:
             /* Keep the bucket's bit in gro_bitmask equal to "count != 0",
              * writing only when the bit actually changes.
              */
6082         if (napi->gro_hash[hash].count) {
6083                 if (!test_bit(hash, &napi->gro_bitmask))
6084                         __set_bit(hash, &napi->gro_bitmask);
6085         } else if (test_bit(hash, &napi->gro_bitmask)) {
6086                 __clear_bit(hash, &napi->gro_bitmask);
6087         }
6088
6089         return ret;
6090
6091 normal:
6092         ret = GRO_NORMAL;
6093         goto pull;
6094 }
6095
     /*
      * Return the first entry on offload_base whose type equals @type and
      * which provides a gro_receive callback, or NULL if there is none.
      *
      * The list is walked with list_for_each_entry_rcu(); callers are
      * presumably inside an RCU read-side section (not visible here).
      */
6096 struct packet_offload *gro_find_receive_by_type(__be16 type)
6097 {
6098         struct packet_offload *po;
6099
6100         list_for_each_entry_rcu(po, &offload_base, list) {
6101                 if (po->type == type && po->callbacks.gro_receive)
6102                         return po;
6103         }
6104
6105         /* Nothing registered for this type with a gro_receive hook. */
6106         return NULL;
6107 }
6108 EXPORT_SYMBOL(gro_find_receive_by_type);
6109
     /*
      * Return the first entry on offload_base whose type equals @type and
      * which provides a gro_complete callback, or NULL if there is none.
      *
      * The list is walked with list_for_each_entry_rcu(); callers are
      * presumably inside an RCU read-side section (not visible here).
      */
6110 struct packet_offload *gro_find_complete_by_type(__be16 type)
6111 {
6112         struct packet_offload *po;
6113
6114         list_for_each_entry_rcu(po, &offload_base, list) {
6115                 if (po->type == type && po->callbacks.gro_complete)
6116                         return po;
6117         }
6118
6119         /* Nothing registered for this type with a gro_complete hook. */
6120         return NULL;
6121 }
6122 EXPORT_SYMBOL(gro_find_complete_by_type);
6123
     /*
      * Release an skb whose GRO "free" state is NAPI_GRO_FREE_STOLEN_HEAD
      * (see the callers): drop its conntrack reference, dst and extensions,
      * then return the struct sk_buff itself to skbuff_head_cache.
      *
      * Nothing here releases the skb's data; presumably it now belongs to the
      * skb this one was merged into - confirm against the merge code.
      */
6124 static void napi_skb_free_stolen_head(struct sk_buff *skb)
6125 {
6126         nf_reset_ct(skb);
6127         skb_dst_drop(skb);
6128         skb_ext_put(skb);
6129         kmem_cache_free(skbuff_head_cache, skb);
6130 }
6131
     /*
      * Dispose of @skb according to the GRO verdict @ret produced by
      * dev_gro_receive() for the napi_gro_receive() path.
      *
      * Returns @ret unchanged.
      */
6132 static gro_result_t napi_skb_finish(struct napi_struct *napi,
6133                                     struct sk_buff *skb,
6134                                     gro_result_t ret)
6135 {
6136         switch (ret) {
6137         case GRO_NORMAL:
                     /* Not aggregated: queue it for regular delivery. */
6138                 gro_normal_one(napi, skb, 1);
6139                 break;
6140
6141         case GRO_DROP:
6142                 kfree_skb(skb);
6143                 break;
6144
6145         case GRO_MERGED_FREE:
                     /* Only the head struct is freed for a stolen head;
                      * otherwise the skb is freed via __kfree_skb().
                      */
6146                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
6147                         napi_skb_free_stolen_head(skb);
6148                 else
6149                         __kfree_skb(skb);
6150                 break;
6151
             /* Nothing to do for these verdicts here. */
6152         case GRO_HELD:
6153         case GRO_MERGED:
6154         case GRO_CONSUMED:
6155                 break;
6156         }
6157
6158         return ret;
6159 }
6160
     /*
      * Entry point for feeding one received skb through GRO.
      *
      * Marks the skb with the NAPI id, resets its GRO offsets (nhoff = 0),
      * runs dev_gro_receive() and lets napi_skb_finish() act on the verdict.
      * Returns that verdict.
      */
6161 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
6162 {
6163         gro_result_t ret;
6164
6165         skb_mark_napi_id(skb, napi);
6166         trace_napi_gro_receive_entry(skb);
6167
6168         skb_gro_reset_offset(skb, 0);
6169
6170         ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
6171         trace_napi_gro_receive_exit(ret);
6172
6173         return ret;
6174 }
6175 EXPORT_SYMBOL(napi_gro_receive);
6176
     /*
      * Reset @skb to a pristine state and store it in napi->skb so that the
      * next napi_get_frags() call hands it out again.
      *
      * A pfmemalloc skb is not recycled; it is consumed and napi->skb is left
      * untouched.
      */
6177 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
6178 {
6179         if (unlikely(skb->pfmemalloc)) {
6180                 consume_skb(skb);
6181                 return;
6182         }
             /* Drop whatever is in the linear area. */
6183         __skb_pull(skb, skb_headlen(skb));
6184         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
6185         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
6186         __vlan_hwaccel_clear_tag(skb);
6187         skb->dev = napi->dev;
6188         skb->skb_iif = 0;
6189
6190         /* eth_type_trans() assumes pkt_type is PACKET_HOST */
6191         skb->pkt_type = PACKET_HOST;
6192
6193         skb->encapsulation = 0;
6194         skb_shinfo(skb)->gso_type = 0;
6195         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
6196         skb_ext_reset(skb);
6197         nf_reset_ct(skb);
6198
6199         napi->skb = skb;
6200 }
6201
     /*
      * Return the skb cached in napi->skb, allocating one with
      * napi_alloc_skb(napi, GRO_MAX_HEAD) when none is cached. A newly
      * allocated skb is stored in napi->skb and marked with the NAPI id.
      * Returns NULL if the allocation fails.
      */
6202 struct sk_buff *napi_get_frags(struct napi_struct *napi)
6203 {
6204         struct sk_buff *skb = napi->skb;
6205
6206         if (skb)
6207                 return skb;
6208         skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
6209         if (!skb)
6210                 return NULL;
6211         napi->skb = skb;
6212         skb_mark_napi_id(skb, napi);
6213         return skb;
6214 }
6215 EXPORT_SYMBOL(napi_get_frags);
6216
     /*
      * Dispose of @skb according to the GRO verdict @ret produced by
      * dev_gro_receive() for the napi_gro_frags() path.
      *
      * Returns @ret unchanged.
      */
6217 static gro_result_t napi_frags_finish(struct napi_struct *napi,
6218                                       struct sk_buff *skb,
6219                                       gro_result_t ret)
6220 {
6221         switch (ret) {
6222         case GRO_NORMAL:
6223         case GRO_HELD:
                     /* Re-expose the Ethernet header that napi_frags_skb()
                      * pulled and set the final protocol. Only GRO_NORMAL is
                      * queued for delivery here.
                      */
6224                 __skb_push(skb, ETH_HLEN);
6225                 skb->protocol = eth_type_trans(skb, skb->dev);
6226                 if (ret == GRO_NORMAL)
6227                         gro_normal_one(napi, skb, 1);
6228                 break;
6229
6230         case GRO_DROP:
                     /* Recycle the skb instead of freeing it. */
6231                 napi_reuse_skb(napi, skb);
6232                 break;
6233
6234         case GRO_MERGED_FREE:
6235                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
6236                         napi_skb_free_stolen_head(skb);
6237                 else
6238                         napi_reuse_skb(napi, skb);
6239                 break;
6240
             /* Nothing to do for these verdicts here. */
6241         case GRO_MERGED:
6242         case GRO_CONSUMED:
6243                 break;
6244         }
6245
6246         return ret;
6247 }
6248
6249 /* Upper GRO stack assumes network header starts at gro_offset=0
6250  * Drivers could call both napi_gro_frags() and napi_gro_receive()
6251  * We copy ethernet header into skb->data to have a common layout.
6252  */
     /*
      * Detach napi->skb, make sure its Ethernet header is in the linear area,
      * pull that header, and set skb->protocol from the raw h_proto field.
      *
      * Returns the skb, or NULL if the header could not be obtained; in that
      * case the skb has been handed to napi_reuse_skb().
      */
6253 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
6254 {
6255         struct sk_buff *skb = napi->skb;
6256         const struct ethhdr *eth;
6257         unsigned int hlen = sizeof(*eth);
6258
6259         napi->skb = NULL;
6260
6261         skb_reset_mac_header(skb);
6262         skb_gro_reset_offset(skb, hlen);
6263
6264         if (unlikely(skb_gro_header_hard(skb, hlen))) {
6265                 eth = skb_gro_header_slow(skb, hlen, 0);
6266                 if (unlikely(!eth)) {
6267                         net_warn_ratelimited("%s: dropping impossible skb from %s\n",
6268                                              __func__, napi->dev->name);
6269                         napi_reuse_skb(napi, skb);
6270                         return NULL;
6271                 }
6272         } else {
                     /* Copy the header from frag0 to the skb tail, then
                      * advance the frag0 window past it. eth is taken from
                      * skb->data, which presumably is where the copy lands
                      * because the linear area was empty - confirm.
                      */
6273                 eth = (const struct ethhdr *)skb->data;
6274                 gro_pull_from_frag0(skb, hlen);
6275                 NAPI_GRO_CB(skb)->frag0 += hlen;
6276                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
6277         }
6278         __skb_pull(skb, hlen);
6279
6280         /*
6281          * This works because the only protocols we care about don't require
6282          * special handling.
6283          * We'll fix it up properly in napi_frags_finish()
6284          */
6285         skb->protocol = eth->h_proto;
6286
6287         return skb;
6288 }
6289
     /*
      * GRO entry point for the skb held in napi->skb.
      *
      * Returns GRO_DROP if napi_frags_skb() could not prepare the skb;
      * otherwise the verdict of dev_gro_receive() after napi_frags_finish()
      * has acted on it.
      */
6290 gro_result_t napi_gro_frags(struct napi_struct *napi)
6291 {
6292         gro_result_t ret;
6293         struct sk_buff *skb = napi_frags_skb(napi);
6294
6295         if (!skb)
6296                 return GRO_DROP;
6297
6298         trace_napi_gro_frags_entry(skb);
6299
6300         ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
6301         trace_napi_gro_frags_exit(ret);
6302
6303         return ret;
6304 }
6305 EXPORT_SYMBOL(napi_gro_frags);
6306
6307 /* Compute the checksum from gro_offset and return the folded value
6308  * after adding in any pseudo checksum.
6309  *
      * On return NAPI_GRO_CB(skb)->csum holds the unfolded sum computed here
      * and csum_valid is set to 1, whatever the folded result is.
6310  */
6311 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
6312 {
6313         __wsum wsum;
6314         __sum16 sum;
6315
6316         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
6317
6318         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
6319         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
6320         /* See comments in __skb_checksum_complete(). */
6321         if (likely(!sum)) {
                     /* The software sum came out as zero although the skb
                      * was marked CHECKSUM_COMPLETE without csum_complete_sw:
                      * report it as a device checksum fault.
                      */
6322                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
6323                     !skb->csum_complete_sw)
6324                         netdev_rx_csum_fault(skb->dev, skb);
6325         }
6326
6327         NAPI_GRO_CB(skb)->csum = wsum;
6328         NAPI_GRO_CB(skb)->csum_valid = 1;
6329
6330         return sum;
6331 }
6332 EXPORT_SYMBOL(__skb_gro_checksum_complete);
6332
     /*
      * Walk the singly linked list starting at @remsd (linked through
      * rps_ipi_next) and send an asynchronous single-CPU function call, using
      * the entry's csd, to every entry whose CPU is online.
      *
      * Compiles to an empty function without CONFIG_RPS.
      */
6333 static void net_rps_send_ipi(struct softnet_data *remsd)
6334 {
6335 #ifdef CONFIG_RPS
6336         while (remsd) {
                     /* Read the link before the call is issued. */
6337                 struct softnet_data *next = remsd->rps_ipi_next;
6338
6339                 if (cpu_online(remsd->cpu))
6340                         smp_call_function_single_async(remsd->cpu, &remsd->csd);
6341                 remsd = next;
6342         }
6343 #endif
6344 }
6345
6346 /*
6347  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
6348  * Note: called with local irq disabled, but exits with local irq enabled.
      *
      * The pending list is detached from @sd before interrupts are re-enabled
      * and the IPIs are sent after that. Without CONFIG_RPS this only
      * re-enables local interrupts.
6349  */
6350 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
6351 {
6352 #ifdef CONFIG_RPS
6353         struct softnet_data *remsd = sd->rps_ipi_list;
6354
6355         if (remsd) {
6356                 sd->rps_ipi_list = NULL;
6357
6358                 local_irq_enable();
6359
6360                 /* Send pending IPI's to kick RPS processing on remote cpus. */
6361                 net_rps_send_ipi(remsd);
6362         } else
6363 #endif
                     /* With CONFIG_RPS this is the body of the "else" above;
                      * without it, it is the whole function.
                      */
6364                 local_irq_enable();
6365 }
6366
     /*
      * Return true if @sd has a non-empty rps_ipi_list. Always false when the
      * kernel is built without CONFIG_RPS.
      */
6367 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
6368 {
6369 #ifdef CONFIG_RPS
6370         return sd->rps_ipi_list != NULL;
6371 #else
6372         return false;
6373 #endif
6374 }
6375
     /*
      * Poll function of the backlog NAPI embedded in struct softnet_data.
      *
      * Delivers skbs from sd->process_queue through __netif_receive_skb()
      * and refills that queue from sd->input_pkt_queue until either @quota
      * packets were handled or both queues are empty.
      *
      * Returns the number of packets processed. When the quota is reached
      * the function returns without clearing napi->state; when the input
      * queue is found empty, napi->state is cleared.
      */
6376 static int process_backlog(struct napi_struct *napi, int quota)
6377 {
6378         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6379         bool again = true;
6380         int work = 0;
6381
6382         /* Check if we have pending ipi, it's better to send them now,
6383          * not waiting net_rx_action() end.
6384          */
6385         if (sd_has_rps_ipi_waiting(sd)) {
                     /* The helper expects irqs off and re-enables them. */
6386                 local_irq_disable();
6387                 net_rps_action_and_irq_enable(sd);
6388         }
6389
6390         napi->weight = READ_ONCE(dev_rx_weight);
6391         while (again) {
6392                 struct sk_buff *skb;
6393
                     /* process_queue is dequeued with the unlocked
                      * __skb_dequeue(), outside the irq-off/rps_lock section
                      * below.
                      */
6394                 while ((skb = __skb_dequeue(&sd->process_queue))) {
6395                         rcu_read_lock();
6396                         __netif_receive_skb(skb);
6397                         rcu_read_unlock();
6398                         input_queue_head_incr(sd);
6399                         if (++work >= quota)
6400                                 return work;
6401
6402                 }
6403
                     /* input_pkt_queue is only touched with irqs disabled
                      * and rps_lock() held.
                      */
6404                 local_irq_disable();
6405                 rps_lock(sd);
6406                 if (skb_queue_empty(&sd->input_pkt_queue)) {
6407                         /*
6408                          * Inline a custom version of __napi_complete().
6409                          * only current cpu owns and manipulates this napi,
6410                          * and NAPI_STATE_SCHED is the only possible flag set
6411                          * on backlog.
6412                          * We can use a plain write instead of clear_bit(),
6413                          * and we don't need an smp_mb() memory barrier.
6414                          */
6415                         napi->state = 0;
6416                         again = false;
6417                 } else {
                             /* Move everything queued so far to
                              * process_queue and go round again.
                              */
6418                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
6419                                                    &sd->process_queue);
6420                 }
6421                 rps_unlock(sd);
6422                 local_irq_enable();
6423         }
6424
6425         return work;
6426 }
6427
6428 /**
6429  * __napi_schedule - schedule for receive
6430  * @n: entry to schedule
6431  *
6432  * The entry's receive function will be scheduled to run.
6433  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
      *
      * Local interrupts are saved and disabled around the call to
      * ____napi_schedule() on this CPU's softnet_data, then restored.
6434  */
6435 void __napi_schedule(struct napi_struct *n)
6436 {
6437         unsigned long flags;
6438
6439         local_irq_save(flags);
6440         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6441         local_irq_restore(flags);
6442 }
6443 EXPORT_SYMBOL(__napi_schedule);
6444
6445 /**
6446  *      napi_schedule_prep - check if napi can be scheduled
6447  *      @n: napi context
6448  *
6449  * Test if NAPI routine is already running, and if not mark
6450  * it as running.  This is used as a condition variable to
6451  * ensure only one NAPI poll instance runs.  We also make
6452  * sure there is no pending NAPI disable.
      *
      * Return: true if this call set NAPIF_STATE_SCHED (it was clear before);
      * false if it was already set (NAPIF_STATE_MISSED is then set as well)
      * or if NAPIF_STATE_DISABLE is set (state is left unmodified).
6453  */
6454 bool napi_schedule_prep(struct napi_struct *n)
6455 {
6456         unsigned long val, new;
6457
             /* Lock-free update: retry until the cmpxchg() sees the state
              * value that "new" was computed from.
              */
6458         do {
6459                 val = READ_ONCE(n->state);
6460                 if (unlikely(val & NAPIF_STATE_DISABLE))
6461                         return false;
6462                 new = val | NAPIF_STATE_SCHED;
6463
6464                 /* Sets STATE_MISSED bit if STATE_SCHED was already set
6465                  * This was suggested by Alexander Duyck, as compiler
6466                  * emits better code than :
6467                  * if (val & NAPIF_STATE_SCHED)
6468                  *     new |= NAPIF_STATE_MISSED;
6469                  */
6470                 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6471                                                    NAPIF_STATE_MISSED;
6472         } while (cmpxchg(&n->state, val, new) != val);
6473
6474         return !(val & NAPIF_STATE_SCHED);
6475 }
6476 EXPORT_SYMBOL(napi_schedule_prep);
6477
6478 /**
6479  * __napi_schedule_irqoff - schedule for receive
6480  * @n: entry to schedule
6481  *
6482  * Variant of __napi_schedule() assuming hard irqs are masked.
6483  *
6484  * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
6485  * because the interrupt disabled assumption might not be true
6486  * due to force-threaded interrupts and spinlock substitution.
6487  */
6488 void __napi_schedule_irqoff(struct napi_struct *n)
6489 {
             /* Non-RT: call ____napi_schedule() directly, without saving or
              * disabling interrupts.
              */
6490         if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6491                 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6492         else
6493                 __napi_schedule(n);
6494 }
6495 EXPORT_SYMBOL(__napi_schedule_irqoff);
6496
     /*
      * Finish a NAPI poll round for @n after @work_done packets.
      *
      * Flushes GRO state and the pending rx list, removes @n from its poll
      * list, and clears NAPIF_STATE_SCHED unless NAPIF_STATE_MISSED was set.
      * If MISSED was set the instance is rescheduled immediately. A non-zero
      * timeout arms n->timer.
      *
      * Returns false when:
      *  - NAPIF_STATE_NPSVC or NAPIF_STATE_IN_BUSY_POLL is set (nothing is
      *    done),
      *  - hard irq deferral is active with a non-zero gro_flush_timeout, or
      *  - NAPIF_STATE_MISSED was set and the instance was rescheduled.
      * Returns true otherwise.
      */
6497 bool napi_complete_done(struct napi_struct *n, int work_done)
6498 {
6499         unsigned long flags, val, new, timeout = 0;
6500         bool ret = true;
6501
6502         /*
6503          * 1) Don't let napi dequeue from the cpu poll list
6504          *    just in case it's running on a different cpu.
6505          * 2) If we are busy polling, do nothing here, we have
6506          *    the guarantee we will be called later.
6507          */
6508         if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6509                                  NAPIF_STATE_IN_BUSY_POLL)))
6510                 return false;
6511
             /* Work was done: take the flush timeout only if GRO still holds
              * packets, and reload the hard irq deferral counter.
              */
6512         if (work_done) {
6513                 if (n->gro_bitmask)
6514                         timeout = READ_ONCE(n->dev->gro_flush_timeout);
6515                 n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6516         }
             /* Deferral budget left: use one unit and take the timeout
              * regardless of GRO state; a non-zero timeout makes the function
              * return false.
              */
6517         if (n->defer_hard_irqs_count > 0) {
6518                 n->defer_hard_irqs_count--;
6519                 timeout = READ_ONCE(n->dev->gro_flush_timeout);
6520                 if (timeout)
6521                         ret = false;
6522         }
6523         if (n->gro_bitmask) {
6524                 /* When the NAPI instance uses a timeout and keeps postponing
6525                  * it, we need to bound somehow the time packets are kept in
6526                  * the GRO layer
6527                  */
6528                 napi_gro_flush(n, !!timeout);
6529         }
6530
6531         gro_normal_list(n);
6532
6533         if (unlikely(!list_empty(&n->poll_list))) {
6534                 /* If n->poll_list is not empty, we need to mask irqs */
6535                 local_irq_save(flags);
6536                 list_del_init(&n->poll_list);
6537                 local_irq_restore(flags);
6538         }
6539
             /* Lock-free state update, retried until the cmpxchg() succeeds. */
6540         do {
6541                 val = READ_ONCE(n->state);
6542
6543                 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6544
6545                 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
6546
6547                 /* If STATE_MISSED was set, leave STATE_SCHED set,
6548                  * because we will call napi->poll() one more time.
6549                  * This C code was suggested by Alexander Duyck to help gcc.
6550                  */
6551                 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6552                                                     NAPIF_STATE_SCHED;
6553         } while (cmpxchg(&n->state, val, new) != val);
6554
6555         if (unlikely(val & NAPIF_STATE_MISSED)) {
6556                 __napi_schedule(n);
6557                 return false;
6558         }
6559
6560         if (timeout)
6561                 hrtimer_start(&n->timer, ns_to_ktime(timeout),
6562                               HRTIMER_MODE_REL_PINNED);
6563         return ret;
6564 }
6565 EXPORT_SYMBOL(napi_complete_done);
6566
6567 /* must be called under rcu_read_lock(), as we don't take a reference
      *
      * Look up the napi_struct whose napi_id equals @napi_id in the napi_hash
      * table (bucket = napi_id % HASH_SIZE(napi_hash)). Returns NULL if no
      * entry matches.
      */
6568 static struct napi_struct *napi_by_id(unsigned int napi_id)
6569 {
6570         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6571         struct napi_struct *napi;
6572
6573         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6574                 if (napi->napi_id == napi_id)
6575                         return napi;
6576
6577         return NULL;
6578 }
6579
6580 #if defined(CONFIG_NET_RX_BUSY_POLL)
6581
     /* Budget passed to napi->poll() on every busy-poll invocation. */
6582 #define BUSY_POLL_BUDGET 8
6583
     /*
      * Leave busy-poll mode for @napi: clear NAPI_STATE_MISSED and
      * NAPI_STATE_IN_BUSY_POLL, run napi->poll() one last time with bottom
      * halves disabled, and release the netpoll lock @have_poll_lock.
      *
      * If that last poll used the whole budget, the rx list is flushed and
      * the instance is rescheduled through __napi_schedule().
      */
6584 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
6585 {
6586         int rc;
6587
6588         /* Busy polling means there is a high chance device driver hard irq
6589          * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6590          * set in napi_schedule_prep().
6591          * Since we are about to call napi->poll() once more, we can safely
6592          * clear NAPI_STATE_MISSED.
6593          *
6594          * Note: x86 could use a single "lock and ..." instruction
6595          * to perform these two clear_bit()
6596          */
6597         clear_bit(NAPI_STATE_MISSED, &napi->state);
6598         clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6599
6600         local_bh_disable();
6601
6602         /* All we really want here is to re-enable device interrupts.
6603          * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6604          */
6605         rc = napi->poll(napi, BUSY_POLL_BUDGET);
6606         /* We can't gro_normal_list() here, because napi->poll() might have
6607          * rearmed the napi (napi_complete_done()) in which case it could
6608          * already be running on another CPU.
6609          */
6610         trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
6611         netpoll_poll_unlock(have_poll_lock);
6612         if (rc == BUSY_POLL_BUDGET) {
6613                 /* As the whole budget was spent, we still own the napi so can
6614                  * safely handle the rx_list.
6615                  */
6616                 gro_normal_list(napi);
6617                 __napi_schedule(napi);
6618         }
6619         local_bh_enable();
6620 }
6621
     /*
      * Busy-poll the NAPI instance identified by @napi_id.
      *
      * @loop_end:     optional predicate, called with @loop_end_arg and the
      *                start time; polling stops once it returns true. When
      *                NULL, a single iteration is performed.
      * @loop_end_arg: opaque argument passed to @loop_end.
      *
      * The instance is looked up under rcu_read_lock(). Ownership is claimed
      * by atomically setting NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED; if
      * that fails, the iteration polls nothing and the claim is retried on
      * the next one. If a reschedule is needed, polling is stopped, the CPU
      * is yielded with cond_resched() and the lookup restarts from scratch.
      */
6622 void napi_busy_loop(unsigned int napi_id,
6623                     bool (*loop_end)(void *, unsigned long),
6624                     void *loop_end_arg)
6625 {
6626         unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
             /* Non-NULL once this context owns the instance. */
6627         int (*napi_poll)(struct napi_struct *napi, int budget);
6628         void *have_poll_lock = NULL;
6629         struct napi_struct *napi;
6630
6631 restart:
6632         napi_poll = NULL;
6633
6634         rcu_read_lock();
6635
6636         napi = napi_by_id(napi_id);
6637         if (!napi)
6638                 goto out;
6639
6640         preempt_disable();
6641         for (;;) {
6642                 int work = 0;
6643
6644                 local_bh_disable();
6645                 if (!napi_poll) {
6646                         unsigned long val = READ_ONCE(napi->state);
6647
6648                         /* If multiple threads are competing for this napi,
6649                          * we avoid dirtying napi->state as much as we can.
6650                          */
6651                         if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6652                                    NAPIF_STATE_IN_BUSY_POLL))
6653                                 goto count;
6654                         if (cmpxchg(&napi->state, val,
6655                                     val | NAPIF_STATE_IN_BUSY_POLL |
6656                                           NAPIF_STATE_SCHED) != val)
6657                                 goto count;
6658                         have_poll_lock = netpoll_poll_lock(napi);
6659                         napi_poll = napi->poll;
6660                 }
6661                 work = napi_poll(napi, BUSY_POLL_BUDGET);
6662                 trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6663                 gro_normal_list(napi);
6664 count:
                     /* work is 0 when this iteration could not poll. */
6665                 if (work > 0)
6666                         __NET_ADD_STATS(dev_net(napi->dev),
6667                                         LINUX_MIB_BUSYPOLLRXPACKETS, work);
6668                 local_bh_enable();
6669
6670                 if (!loop_end || loop_end(loop_end_arg, start_time))
6671                         break;
6672
6673                 if (unlikely(need_resched())) {
                             /* Give up ownership before yielding. The
                              * instance is looked up again after the
                              * reschedule, since RCU protection was dropped.
                              */
6674                         if (napi_poll)
6675                                 busy_poll_stop(napi, have_poll_lock);
6676                         preempt_enable();
6677                         rcu_read_unlock();
6678                         cond_resched();
6679                         if (loop_end(loop_end_arg, start_time))
6680                                 return;
6681                         goto restart;
6682                 }
6683                 cpu_relax();
6684         }
6685         if (napi_poll)
6686                 busy_poll_stop(napi, have_poll_lock);
6687         preempt_enable();
6688 out:
6689         rcu_read_unlock();
6690 }
6691 EXPORT_SYMBOL(napi_busy_loop);
6692
6693 #endif /* CONFIG_NET_RX_BUSY_POLL */
6694
     /*
      * Assign a fresh napi_id to @napi and insert it into napi_hash.
      *
      * Does nothing if NAPI_STATE_NO_BUSY_POLL is set. Id generation and the
      * insertion are done under napi_hash_lock.
      */
6695 static void napi_hash_add(struct napi_struct *napi)
6696 {
6697         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
6698                 return;
6699
6700         spin_lock(&napi_hash_lock);
6701
6702         /* 0..NR_CPUS range is reserved for sender_cpu use */
             /* Advance the generator, resetting it to MIN_NAPI_ID whenever it
              * falls below that value, until an id not currently in the hash
              * is found.
              */
6703         do {
6704                 if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6705                         napi_gen_id = MIN_NAPI_ID;
6706         } while (napi_by_id(napi_gen_id));
6707         napi->napi_id = napi_gen_id;
6708
6709         hlist_add_head_rcu(&napi->napi_hash_node,
6710                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6711
6712         spin_unlock(&napi_hash_lock);
6713 }
6714
6715 /* Warning : caller is responsible to make sure rcu grace period
6716  * is respected before freeing memory containing @napi
      *
      * Unlink @napi from napi_hash under napi_hash_lock, using the RCU-aware
      * hlist_del_init_rcu().
6717  */
6718 static void napi_hash_del(struct napi_struct *napi)
6719 {
6720         spin_lock(&napi_hash_lock);
6721
6722         hlist_del_init_rcu(&napi->napi_hash_node);
6723
6724         spin_unlock(&napi_hash_lock);
6725 }
6726
     /*
      * hrtimer callback for napi->timer (installed by netif_napi_add()).
      *
      * Schedules the owning NAPI instance unless a disable is pending or
      * NAPI_STATE_SCHED is already set. Always returns HRTIMER_NORESTART.
      */
6727 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6728 {
6729         struct napi_struct *napi;
6730
6731         napi = container_of(timer, struct napi_struct, timer);
6732
6733         /* Note : we use a relaxed variant of napi_schedule_prep() not setting
6734          * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6735          */
6736         if (!napi_disable_pending(napi) &&
6737             !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
6738                 __napi_schedule_irqoff(napi);
6739
6740         return HRTIMER_NORESTART;
6741 }
6742
     /*
      * Put the GRO hash of @napi into its empty state: every bucket gets an
      * initialised, empty list head and a zero count, and gro_bitmask is
      * cleared.
      */
6743 static void init_gro_hash(struct napi_struct *napi)
6744 {
6745         int idx;
6746
6747         napi->gro_bitmask = 0;
6748         for (idx = GRO_HASH_BUCKETS - 1; idx >= 0; idx--) {
6749                 napi->gro_hash[idx].count = 0;
6750                 INIT_LIST_HEAD(&napi->gro_hash[idx].list);
6751         }
6752 }
6753
     /*
      * Initialise @napi and attach it to @dev.
      *
      * @dev:    device the instance belongs to
      * @napi:   instance to initialise
      * @poll:   poll callback stored in napi->poll
      * @weight: stored in napi->weight; a value above NAPI_POLL_WEIGHT is
      *          reported once through netdev_err_once() but still accepted
      *
      * Warns and returns without doing anything if NAPI_STATE_LISTED is
      * already set on @napi.
      */
6754 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
6755                     int (*poll)(struct napi_struct *, int), int weight)
6756 {
6757         if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
6758                 return;
6759
6760         INIT_LIST_HEAD(&napi->poll_list);
6761         INIT_HLIST_NODE(&napi->napi_hash_node);
6762         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6763         napi->timer.function = napi_watchdog;
6764         init_gro_hash(napi);
6765         napi->skb = NULL;
6766         INIT_LIST_HEAD(&napi->rx_list);
6767         napi->rx_count = 0;
6768         napi->poll = poll;
6769         if (weight > NAPI_POLL_WEIGHT)
6770                 netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6771                                 weight);
6772         napi->weight = weight;
6773         napi->dev = dev;
6774 #ifdef CONFIG_NETPOLL
6775         napi->poll_owner = -1;
6776 #endif
             /* The instance starts with SCHED and NPSVC set - presumably so
              * it cannot be scheduled until it is enabled elsewhere; the
              * enabling side is not visible here, confirm.
              */
6777         set_bit(NAPI_STATE_SCHED, &napi->state);
6778         set_bit(NAPI_STATE_NPSVC, &napi->state);
6779         list_add_rcu(&napi->dev_list, &dev->napi_list);
6780         napi_hash_add(napi);
6781 }
6782 EXPORT_SYMBOL(netif_napi_add);
6783
/* Disable @n: wait until this caller owns both NAPI_STATE_SCHED and
 * NAPI_STATE_NPSVC, then cancel the instance's hrtimer.
 *
 * May sleep (msleep() polling loops), so it must not be called from atomic
 * context. On return SCHED and NPSVC remain set and DISABLE is cleared.
 */
void napi_disable(struct napi_struct *n)
{
        might_sleep();
        /* Advertise the pending disable; napi_watchdog() and napi_poll()
         * check it through napi_disable_pending().
         */
        set_bit(NAPI_STATE_DISABLE, &n->state);

        /* Spin (sleeping 1 ms per try) until each bit was previously clear,
         * i.e. until we are the one who set it.
         */
        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
                msleep(1);
        while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
                msleep(1);

        hrtimer_cancel(&n->timer);

        clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
6799
6800 static void flush_gro_hash(struct napi_struct *napi)
6801 {
6802         int i;
6803
6804         for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6805                 struct sk_buff *skb, *n;
6806
6807                 list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6808                         kfree_skb(skb);
6809                 napi->gro_hash[i].count = 0;
6810         }
6811 }
6812
/* Detach @napi from its device and release the packets it still holds.
 *
 * Must be called in process context.
 *
 * Does nothing if NAPI_STATE_LISTED was not set, so calling it on an
 * instance that was never added (or was already deleted) is harmless.
 */
void __netif_napi_del(struct napi_struct *napi)
{
        /* Clearing LISTED makes a second call a no-op. */
        if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
                return;

        napi_hash_del(napi);
        list_del_rcu(&napi->dev_list);
        napi_free_frags(napi);

        /* Drop anything still queued for GRO and mark all buckets empty. */
        flush_gro_hash(napi);
        napi->gro_bitmask = 0;
}
EXPORT_SYMBOL(__netif_napi_del);
6827
/* Run one poll round for NAPI instance @n.
 *
 * @n:      instance to poll; it is removed from whatever poll list it is on
 * @repoll: list onto which @n is queued when it consumed its whole weight
 *          and should be polled again
 *
 * Returns the amount of work reported by n->poll(), or 0 if the instance
 * was not in NAPI_STATE_SCHED and therefore was not polled.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
        void *have;
        int work, weight;

        list_del_init(&n->poll_list);

        have = netpoll_poll_lock(n);

        weight = n->weight;

        /* This NAPI_STATE_SCHED test is for avoiding a race
         * with netpoll's poll_napi().  Only the entity which
         * obtains the lock and sees NAPI_STATE_SCHED set will
         * actually make the ->poll() call.  Therefore we avoid
         * accidentally calling ->poll() when NAPI is not scheduled.
         */
        work = 0;
        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
                work = n->poll(n, weight);
                trace_napi_poll(n, work, weight);
        }

        /* A driver returning more than its budget is a bug; report it once. */
        if (unlikely(work > weight))
                pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
                            n->poll, work, weight);

        /* Common case: budget not exhausted, nothing more to do here. */
        if (likely(work < weight))
                goto out_unlock;

        /* Drivers must not modify the NAPI state if they
         * consume the entire weight.  In such cases this code
         * still "owns" the NAPI instance and therefore can
         * move the instance around on the list at-will.
         */
        if (unlikely(napi_disable_pending(n))) {
                napi_complete(n);
                goto out_unlock;
        }

        if (n->gro_bitmask) {
                /* flush too old packets
                 * If HZ < 1000, flush all packets.
                 */
                napi_gro_flush(n, HZ >= 1000);
        }

        gro_normal_list(n);

        /* Some drivers may have called napi_schedule
         * prior to exhausting their budget.
         */
        if (unlikely(!list_empty(&n->poll_list))) {
                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
                             n->dev ? n->dev->name : "backlog");
                goto out_unlock;
        }

        /* Full weight consumed and still ours: queue for another round. */
        list_add_tail(&n->poll_list, repoll);

out_unlock:
        netpoll_poll_unlock(have);

        return work;
}
6893
/* Softirq handler that polls the NAPI instances queued on this CPU.
 *
 * The per-CPU poll list is spliced onto a private list with IRQs off and
 * then processed with IRQs on. Polling stops when the list drains, when
 * the work budget (netdev_budget) is used up, or when the time limit
 * derived from netdev_budget_usecs is reached. Anything left over is put
 * back on sd->poll_list and the softirq is raised again.
 */
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
        unsigned long time_limit = jiffies +
                usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
        int budget = READ_ONCE(netdev_budget);
        LIST_HEAD(list);
        LIST_HEAD(repoll);

        local_irq_disable();
        list_splice_init(&sd->poll_list, &list);
        local_irq_enable();

        for (;;) {
                struct napi_struct *n;

                if (list_empty(&list)) {
                        /* Nothing to requeue and no RPS IPI pending: leave
                         * without taking the IRQ-off requeue path below.
                         */
                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
                                goto out;
                        break;
                }

                n = list_first_entry(&list, struct napi_struct, poll_list);
                budget -= napi_poll(n, &repoll);

                /* If the softirq window is exhausted then punt. The window
                 * is bounded both by the work budget (netdev_budget) and by
                 * time_limit (netdev_budget_usecs converted to jiffies).
                 */
                if (unlikely(budget <= 0 ||
                             time_after_eq(jiffies, time_limit))) {
                        sd->time_squeeze++;
                        break;
                }
        }

        local_irq_disable();

        /* Rebuild sd->poll_list as: unprocessed entries, then entries that
         * were scheduled while we were polling, then entries to repoll.
         */
        list_splice_tail_init(&sd->poll_list, &list);
        list_splice_tail(&repoll, &list);
        list_splice(&list, &sd->poll_list);
        if (!list_empty(&sd->poll_list))
                __raise_softirq_irqoff(NET_RX_SOFTIRQ);

        /* NOTE(review): by its name this helper re-enables IRQs -- confirm. */
        net_rps_action_and_irq_enable(sd);
out:
        __kfree_skb_flush();
}
6942
/* One entry of a device's adjacency list (adj_list.upper or adj_list.lower),
 * describing a single neighbouring device.
 */
struct netdev_adjacent {
        /* the adjacent device this entry refers to */
        struct net_device *dev;

        /* upper master flag, there can only be one master device per list */
        bool master;

        /* lookup ignore flag */
        bool ignore;

        /* counter for the number of times this device was added to us */
        u16 ref_nr;

        /* private field for the users */
        void *private;

        /* linkage into the owning adjacency list */
        struct list_head list;
        /* NOTE(review): presumably used for RCU-deferred freeing; the free
         * path is not visible here -- confirm.
         */
        struct rcu_head rcu;
};
6961
6962 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6963                                                  struct list_head *adj_list)
6964 {
6965         struct netdev_adjacent *adj;
6966
6967         list_for_each_entry(adj, adj_list, list) {
6968                 if (adj->dev == adj_dev)
6969                         return adj;
6970         }
6971         return NULL;
6972 }
6973
6974 static int ____netdev_has_upper_dev(struct net_device *upper_dev,
6975                                     struct netdev_nested_priv *priv)
6976 {
6977         struct net_device *dev = (struct net_device *)priv->data;
6978
6979         return upper_dev == dev;
6980 }
6981
/**
 * netdev_has_upper_dev - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this walks the complete stack of upper devices
 * (via netdev_walk_all_upper_dev_rcu()), not only the immediate upper
 * devices. The caller must hold the RTNL lock.
 */
bool netdev_has_upper_dev(struct net_device *dev,
                          struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = {
                .data = (void *)upper_dev,
        };

        ASSERT_RTNL();

        /* The walk returns the callback's non-zero result on a match. */
        return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
                                             &priv);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
7004
/**
 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this checks the entire upper device chain.
 * The caller must hold rcu lock.
 */

bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
                                  struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = {
                .data = (void *)upper_dev,
        };

        /* Normalise the walk's int result to a strict bool. */
        return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
                                               &priv);
}
EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
7026
7027 /**
7028  * netdev_has_any_upper_dev - Check if device is linked to some device
7029  * @dev: device
7030  *
7031  * Find out if a device is linked to an upper device and return true in case
7032  * it is. The caller must hold the RTNL lock.
7033  */
7034 bool netdev_has_any_upper_dev(struct net_device *dev)
7035 {
7036         ASSERT_RTNL();
7037
7038         return !list_empty(&dev->adj_list.upper);
7039 }
7040 EXPORT_SYMBOL(netdev_has_any_upper_dev);
7041
7042 /**
7043  * netdev_master_upper_dev_get - Get master upper device
7044  * @dev: device
7045  *
7046  * Find a master upper device and return pointer to it or NULL in case
7047  * it's not there. The caller must hold the RTNL lock.
7048  */
7049 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
7050 {
7051         struct netdev_adjacent *upper;
7052
7053         ASSERT_RTNL();
7054
7055         if (list_empty(&dev->adj_list.upper))
7056                 return NULL;
7057
7058         upper = list_first_entry(&dev->adj_list.upper,
7059                                  struct netdev_adjacent, list);
7060         if (likely(upper->master))
7061                 return upper->dev;
7062         return NULL;
7063 }
7064 EXPORT_SYMBOL(netdev_master_upper_dev_get);
7065
7066 static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
7067 {
7068         struct netdev_adjacent *upper;
7069
7070         ASSERT_RTNL();
7071
7072         if (list_empty(&dev->adj_list.upper))
7073                 return NULL;
7074
7075         upper = list_first_entry(&dev->adj_list.upper,
7076                                  struct netdev_adjacent, list);
7077         if (likely(upper->master) && !upper->ignore)
7078                 return upper->dev;
7079         return NULL;
7080 }
7081
7082 /**
7083  * netdev_has_any_lower_dev - Check if device is linked to some device
7084  * @dev: device
7085  *
7086  * Find out if a device is linked to a lower device and return true in case
7087  * it is. The caller must hold the RTNL lock.
7088  */
7089 static bool netdev_has_any_lower_dev(struct net_device *dev)
7090 {
7091         ASSERT_RTNL();
7092
7093         return !list_empty(&dev->adj_list.lower);
7094 }
7095
7096 void *netdev_adjacent_get_private(struct list_head *adj_list)
7097 {
7098         struct netdev_adjacent *adj;
7099
7100         adj = list_entry(adj_list, struct netdev_adjacent, list);
7101
7102         return adj->private;
7103 }
7104 EXPORT_SYMBOL(netdev_adjacent_get_private);
7105
7106 /**
7107  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
7108  * @dev: device
7109  * @iter: list_head ** of the current position
7110  *
7111  * Gets the next device from the dev's upper list, starting from iter
7112  * position. The caller must hold RCU read lock.
7113  */
7114 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
7115                                                  struct list_head **iter)
7116 {
7117         struct netdev_adjacent *upper;
7118
7119         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7120
7121         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7122
7123         if (&upper->list == &dev->adj_list.upper)
7124                 return NULL;
7125
7126         *iter = &upper->list;
7127
7128         return upper->dev;
7129 }
7130 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
7131
7132 static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
7133                                                   struct list_head **iter,
7134                                                   bool *ignore)
7135 {
7136         struct netdev_adjacent *upper;
7137
7138         upper = list_entry((*iter)->next, struct netdev_adjacent, list);
7139
7140         if (&upper->list == &dev->adj_list.upper)
7141                 return NULL;
7142
7143         *iter = &upper->list;
7144         *ignore = upper->ignore;
7145
7146         return upper->dev;
7147 }
7148
7149 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
7150                                                     struct list_head **iter)
7151 {
7152         struct netdev_adjacent *upper;
7153
7154         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7155
7156         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7157
7158         if (&upper->list == &dev->adj_list.upper)
7159                 return NULL;
7160
7161         *iter = &upper->list;
7162
7163         return upper->dev;
7164 }
7165
/* Depth-first walk over all upper devices reachable from @dev, skipping
 * adjacency entries whose ignore flag is set.
 *
 * @fn is called for every visited device except @dev itself. The walk stops
 * and returns @fn's result as soon as it is non-zero; otherwise 0 is
 * returned once every reachable device has been visited.
 *
 * The traversal is iterative: dev_stack/iter_stack remember the parent and
 * the position in its upper list to resume from.
 * NOTE(review): 'cur' is not bounds-checked against MAX_NEST_DEV + 1 here;
 * presumably nesting depth is limited elsewhere -- confirm.
 */
static int __netdev_walk_all_upper_dev(struct net_device *dev,
                                       int (*fn)(struct net_device *dev,
                                         struct netdev_nested_priv *priv),
                                       struct netdev_nested_priv *priv)
{
        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
        int ret, cur = 0;
        bool ignore;

        now = dev;
        iter = &dev->adj_list.upper;

        while (1) {
                /* The starting device itself is never passed to fn. */
                if (now != dev) {
                        ret = fn(now, priv);
                        if (ret)
                                return ret;
                }

                next = NULL;
                while (1) {
                        udev = __netdev_next_upper_dev(now, &iter, &ignore);
                        if (!udev)
                                break;
                        if (ignore)
                                continue;

                        /* Descend: save where to resume in 'now'. */
                        next = udev;
                        niter = &udev->adj_list.upper;
                        dev_stack[cur] = now;
                        iter_stack[cur++] = iter;
                        break;
                }

                if (!next) {
                        /* 'now' is exhausted: pop the parent, or finish. */
                        if (!cur)
                                return 0;
                        next = dev_stack[--cur];
                        niter = iter_stack[cur];
                }

                now = next;
                iter = niter;
        }

        return 0;
}
7214
/* Depth-first walk over all upper devices reachable from @dev, using the
 * RCU list accessors.
 *
 * @fn is called for every visited device except @dev itself. The walk stops
 * and returns @fn's result as soon as it is non-zero; otherwise 0 is
 * returned once every reachable device has been visited.
 *
 * The traversal is iterative: dev_stack/iter_stack remember the parent and
 * the position in its upper list to resume from.
 * NOTE(review): 'cur' is not bounds-checked against MAX_NEST_DEV + 1 here;
 * presumably nesting depth is limited elsewhere -- confirm.
 */
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv)
{
        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
        int ret, cur = 0;

        now = dev;
        iter = &dev->adj_list.upper;

        while (1) {
                /* The starting device itself is never passed to fn. */
                if (now != dev) {
                        ret = fn(now, priv);
                        if (ret)
                                return ret;
                }

                next = NULL;
                while (1) {
                        udev = netdev_next_upper_dev_rcu(now, &iter);
                        if (!udev)
                                break;

                        /* Descend: save where to resume in 'now'. */
                        next = udev;
                        niter = &udev->adj_list.upper;
                        dev_stack[cur] = now;
                        iter_stack[cur++] = iter;
                        break;
                }

                if (!next) {
                        /* 'now' is exhausted: pop the parent, or finish. */
                        if (!cur)
                                return 0;
                        next = dev_stack[--cur];
                        niter = iter_stack[cur];
                }

                now = next;
                iter = niter;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
7261
7262 static bool __netdev_has_upper_dev(struct net_device *dev,
7263                                    struct net_device *upper_dev)
7264 {
7265         struct netdev_nested_priv priv = {
7266                 .flags = 0,
7267                 .data = (void *)upper_dev,
7268         };
7269
7270         ASSERT_RTNL();
7271
7272         return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7273                                            &priv);
7274 }
7275
7276 /**
7277  * netdev_lower_get_next_private - Get the next ->private from the
7278  *                                 lower neighbour list
7279  * @dev: device
7280  * @iter: list_head ** of the current position
7281  *
7282  * Gets the next netdev_adjacent->private from the dev's lower neighbour
7283  * list, starting from iter position. The caller must hold either hold the
7284  * RTNL lock or its own locking that guarantees that the neighbour lower
7285  * list will remain unchanged.
7286  */
7287 void *netdev_lower_get_next_private(struct net_device *dev,
7288                                     struct list_head **iter)
7289 {
7290         struct netdev_adjacent *lower;
7291
7292         lower = list_entry(*iter, struct netdev_adjacent, list);
7293
7294         if (&lower->list == &dev->adj_list.lower)
7295                 return NULL;
7296
7297         *iter = lower->list.next;
7298
7299         return lower->private;
7300 }
7301 EXPORT_SYMBOL(netdev_lower_get_next_private);
7302
7303 /**
7304  * netdev_lower_get_next_private_rcu - Get the next ->private from the
7305  *                                     lower neighbour list, RCU
7306  *                                     variant
7307  * @dev: device
7308  * @iter: list_head ** of the current position
7309  *
7310  * Gets the next netdev_adjacent->private from the dev's lower neighbour
7311  * list, starting from iter position. The caller must hold RCU read lock.
7312  */
7313 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7314                                         struct list_head **iter)
7315 {
7316         struct netdev_adjacent *lower;
7317
7318         WARN_ON_ONCE(!rcu_read_lock_held());
7319
7320         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7321
7322         if (&lower->list == &dev->adj_list.lower)
7323                 return NULL;
7324
7325         *iter = &lower->list;
7326
7327         return lower->private;
7328 }
7329 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7330
7331 /**
7332  * netdev_lower_get_next - Get the next device from the lower neighbour
7333  *                         list
7334  * @dev: device
7335  * @iter: list_head ** of the current position
7336  *
7337  * Gets the next netdev_adjacent from the dev's lower neighbour
7338  * list, starting from iter position. The caller must hold RTNL lock or
7339  * its own locking that guarantees that the neighbour lower
7340  * list will remain unchanged.
7341  */
7342 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7343 {
7344         struct netdev_adjacent *lower;
7345
7346         lower = list_entry(*iter, struct netdev_adjacent, list);
7347
7348         if (&lower->list == &dev->adj_list.lower)
7349                 return NULL;
7350
7351         *iter = lower->list.next;
7352
7353         return lower->dev;
7354 }
7355 EXPORT_SYMBOL(netdev_lower_get_next);
7356
7357 static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7358                                                 struct list_head **iter)
7359 {
7360         struct netdev_adjacent *lower;
7361
7362         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7363
7364         if (&lower->list == &dev->adj_list.lower)
7365                 return NULL;
7366
7367         *iter = &lower->list;
7368
7369         return lower->dev;
7370 }
7371
7372 static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7373                                                   struct list_head **iter,
7374                                                   bool *ignore)
7375 {
7376         struct netdev_adjacent *lower;
7377
7378         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7379
7380         if (&lower->list == &dev->adj_list.lower)
7381                 return NULL;
7382
7383         *iter = &lower->list;
7384         *ignore = lower->ignore;
7385
7386         return lower->dev;
7387 }
7388
/* Depth-first walk over all lower devices reachable from @dev.
 *
 * @fn is called for every visited device except @dev itself. The walk stops
 * and returns @fn's result as soon as it is non-zero; otherwise 0 is
 * returned once every reachable device has been visited.
 *
 * The traversal is iterative: dev_stack/iter_stack remember the parent and
 * the position in its lower list to resume from.
 * NOTE(review): 'cur' is not bounds-checked against MAX_NEST_DEV + 1 here;
 * presumably nesting depth is limited elsewhere -- confirm.
 */
int netdev_walk_all_lower_dev(struct net_device *dev,
                              int (*fn)(struct net_device *dev,
                                        struct netdev_nested_priv *priv),
                              struct netdev_nested_priv *priv)
{
        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
        int ret, cur = 0;

        now = dev;
        iter = &dev->adj_list.lower;

        while (1) {
                /* The starting device itself is never passed to fn. */
                if (now != dev) {
                        ret = fn(now, priv);
                        if (ret)
                                return ret;
                }

                next = NULL;
                while (1) {
                        ldev = netdev_next_lower_dev(now, &iter);
                        if (!ldev)
                                break;

                        /* Descend: save where to resume in 'now'. */
                        next = ldev;
                        niter = &ldev->adj_list.lower;
                        dev_stack[cur] = now;
                        iter_stack[cur++] = iter;
                        break;
                }

                if (!next) {
                        /* 'now' is exhausted: pop the parent, or finish. */
                        if (!cur)
                                return 0;
                        next = dev_stack[--cur];
                        niter = iter_stack[cur];
                }

                now = next;
                iter = niter;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7435
/* Depth-first walk over all lower devices reachable from @dev, skipping
 * adjacency entries whose ignore flag is set.
 *
 * @fn is called for every visited device except @dev itself. The walk stops
 * and returns @fn's result as soon as it is non-zero; otherwise 0 is
 * returned once every reachable device has been visited.
 *
 * The traversal is iterative: dev_stack/iter_stack remember the parent and
 * the position in its lower list to resume from.
 * NOTE(review): 'cur' is not bounds-checked against MAX_NEST_DEV + 1 here;
 * presumably nesting depth is limited elsewhere -- confirm.
 */
static int __netdev_walk_all_lower_dev(struct net_device *dev,
                                       int (*fn)(struct net_device *dev,
                                         struct netdev_nested_priv *priv),
                                       struct netdev_nested_priv *priv)
{
        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
        int ret, cur = 0;
        bool ignore;

        now = dev;
        iter = &dev->adj_list.lower;

        while (1) {
                /* The starting device itself is never passed to fn. */
                if (now != dev) {
                        ret = fn(now, priv);
                        if (ret)
                                return ret;
                }

                next = NULL;
                while (1) {
                        ldev = __netdev_next_lower_dev(now, &iter, &ignore);
                        if (!ldev)
                                break;
                        if (ignore)
                                continue;

                        /* Descend: save where to resume in 'now'. */
                        next = ldev;
                        niter = &ldev->adj_list.lower;
                        dev_stack[cur] = now;
                        iter_stack[cur++] = iter;
                        break;
                }

                if (!next) {
                        /* 'now' is exhausted: pop the parent, or finish. */
                        if (!cur)
                                return 0;
                        next = dev_stack[--cur];
                        niter = iter_stack[cur];
                }

                now = next;
                iter = niter;
        }

        return 0;
}
7484
7485 struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7486                                              struct list_head **iter)
7487 {
7488         struct netdev_adjacent *lower;
7489
7490         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7491         if (&lower->list == &dev->adj_list.lower)
7492                 return NULL;
7493
7494         *iter = &lower->list;
7495
7496         return lower->dev;
7497 }
7498 EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7499
7500 static u8 __netdev_upper_depth(struct net_device *dev)
7501 {
7502         struct net_device *udev;
7503         struct list_head *iter;
7504         u8 max_depth = 0;
7505         bool ignore;
7506
7507         for (iter = &dev->adj_list.upper,
7508              udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7509              udev;
7510              udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7511                 if (ignore)
7512                         continue;
7513                 if (max_depth < udev->upper_level)
7514                         max_depth = udev->upper_level;
7515         }
7516
7517         return max_depth;
7518 }
7519
7520 static u8 __netdev_lower_depth(struct net_device *dev)
7521 {
7522         struct net_device *ldev;
7523         struct list_head *iter;
7524         u8 max_depth = 0;
7525         bool ignore;
7526
7527         for (iter = &dev->adj_list.lower,
7528              ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7529              ldev;
7530              ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7531                 if (ignore)
7532                         continue;
7533                 if (max_depth < ldev->lower_level)
7534                         max_depth = ldev->lower_level;
7535         }
7536
7537         return max_depth;
7538 }
7539
7540 static int __netdev_update_upper_level(struct net_device *dev,
7541                                        struct netdev_nested_priv *__unused)
7542 {
7543         dev->upper_level = __netdev_upper_depth(dev) + 1;
7544         return 0;
7545 }
7546
7547 static int __netdev_update_lower_level(struct net_device *dev,
7548                                        struct netdev_nested_priv *priv)
7549 {
7550         dev->lower_level = __netdev_lower_depth(dev) + 1;
7551
7552 #ifdef CONFIG_LOCKDEP
7553         if (!priv)
7554                 return 0;
7555
7556         if (priv->flags & NESTED_SYNC_IMM)
7557                 dev->nested_level = dev->lower_level - 1;
7558         if (priv->flags & NESTED_SYNC_TODO)
7559                 net_unlink_todo(dev);
7560 #endif
7561         return 0;
7562 }
7563
/* Depth-first walk over all lower devices reachable from @dev, using the
 * RCU list accessors.
 *
 * @fn is called for every visited device except @dev itself. The walk stops
 * and returns @fn's result as soon as it is non-zero; otherwise 0 is
 * returned once every reachable device has been visited.
 *
 * The traversal is iterative: dev_stack/iter_stack remember the parent and
 * the position in its lower list to resume from.
 * NOTE(review): 'cur' is not bounds-checked against MAX_NEST_DEV + 1 here;
 * presumably nesting depth is limited elsewhere -- confirm.
 */
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv)
{
        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
        int ret, cur = 0;

        now = dev;
        iter = &dev->adj_list.lower;

        while (1) {
                /* The starting device itself is never passed to fn. */
                if (now != dev) {
                        ret = fn(now, priv);
                        if (ret)
                                return ret;
                }

                next = NULL;
                while (1) {
                        ldev = netdev_next_lower_dev_rcu(now, &iter);
                        if (!ldev)
                                break;

                        /* Descend: save where to resume in 'now'. */
                        next = ldev;
                        niter = &ldev->adj_list.lower;
                        dev_stack[cur] = now;
                        iter_stack[cur++] = iter;
                        break;
                }

                if (!next) {
                        /* 'now' is exhausted: pop the parent, or finish. */
                        if (!cur)
                                return 0;
                        next = dev_stack[--cur];
                        niter = iter_stack[cur];
                }

                now = next;
                iter = niter;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7610
7611 /**
7612  * netdev_lower_get_first_private_rcu - Get the first ->private from the
7613  *                                     lower neighbour list, RCU
7614  *                                     variant
7615  * @dev: device
7616  *
7617  * Gets the first netdev_adjacent->private from the dev's lower neighbour
7618  * list. The caller must hold RCU read lock.
7619  */
7620 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7621 {
7622         struct netdev_adjacent *lower;
7623
7624         lower = list_first_or_null_rcu(&dev->adj_list.lower,
7625                         struct netdev_adjacent, list);
7626         if (lower)
7627                 return lower->private;
7628         return NULL;
7629 }
7630 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7631
7632 /**
7633  * netdev_master_upper_dev_get_rcu - Get master upper device
7634  * @dev: device
7635  *
7636  * Find a master upper device and return pointer to it or NULL in case
7637  * it's not there. The caller must hold the RCU read lock.
7638  */
7639 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7640 {
7641         struct netdev_adjacent *upper;
7642
7643         upper = list_first_or_null_rcu(&dev->adj_list.upper,
7644                                        struct netdev_adjacent, list);
7645         if (upper && likely(upper->master))
7646                 return upper->dev;
7647         return NULL;
7648 }
7649 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7650
7651 static int netdev_adjacent_sysfs_add(struct net_device *dev,
7652                               struct net_device *adj_dev,
7653                               struct list_head *dev_list)
7654 {
7655         char linkname[IFNAMSIZ+7];
7656
7657         sprintf(linkname, dev_list == &dev->adj_list.upper ?
7658                 "upper_%s" : "lower_%s", adj_dev->name);
7659         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7660                                  linkname);
7661 }
7662 static void netdev_adjacent_sysfs_del(struct net_device *dev,
7663                                char *name,
7664                                struct list_head *dev_list)
7665 {
7666         char linkname[IFNAMSIZ+7];
7667
7668         sprintf(linkname, dev_list == &dev->adj_list.upper ?
7669                 "upper_%s" : "lower_%s", name);
7670         sysfs_remove_link(&(dev->dev.kobj), linkname);
7671 }
7672
7673 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7674                                                  struct net_device *adj_dev,
7675                                                  struct list_head *dev_list)
7676 {
7677         return (dev_list == &dev->adj_list.upper ||
7678                 dev_list == &dev->adj_list.lower) &&
7679                 net_eq(dev_net(dev), dev_net(adj_dev));
7680 }
7681
7682 static int __netdev_adjacent_dev_insert(struct net_device *dev,
7683                                         struct net_device *adj_dev,
7684                                         struct list_head *dev_list,
7685                                         void *private, bool master)
7686 {
7687         struct netdev_adjacent *adj;
7688         int ret;
7689
7690         adj = __netdev_find_adj(adj_dev, dev_list);
7691
7692         if (adj) {
7693                 adj->ref_nr += 1;
7694                 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7695                          dev->name, adj_dev->name, adj->ref_nr);
7696
7697                 return 0;
7698         }
7699
7700         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7701         if (!adj)
7702                 return -ENOMEM;
7703
7704         adj->dev = adj_dev;
7705         adj->master = master;
7706         adj->ref_nr = 1;
7707         adj->private = private;
7708         adj->ignore = false;
7709         dev_hold(adj_dev);
7710
7711         pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7712                  dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7713
7714         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7715                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7716                 if (ret)
7717                         goto free_adj;
7718         }
7719
7720         /* Ensure that master link is always the first item in list. */
7721         if (master) {
7722                 ret = sysfs_create_link(&(dev->dev.kobj),
7723                                         &(adj_dev->dev.kobj), "master");
7724                 if (ret)
7725                         goto remove_symlinks;
7726
7727                 list_add_rcu(&adj->list, dev_list);
7728         } else {
7729                 list_add_tail_rcu(&adj->list, dev_list);
7730         }
7731
7732         return 0;
7733
7734 remove_symlinks:
7735         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7736                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7737 free_adj:
7738         kfree(adj);
7739         dev_put(adj_dev);
7740
7741         return ret;
7742 }
7743
7744 static void __netdev_adjacent_dev_remove(struct net_device *dev,
7745                                          struct net_device *adj_dev,
7746                                          u16 ref_nr,
7747                                          struct list_head *dev_list)
7748 {
7749         struct netdev_adjacent *adj;
7750
7751         pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7752                  dev->name, adj_dev->name, ref_nr);
7753
7754         adj = __netdev_find_adj(adj_dev, dev_list);
7755
7756         if (!adj) {
7757                 pr_err("Adjacency does not exist for device %s from %s\n",
7758                        dev->name, adj_dev->name);
7759                 WARN_ON(1);
7760                 return;
7761         }
7762
7763         if (adj->ref_nr > ref_nr) {
7764                 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7765                          dev->name, adj_dev->name, ref_nr,
7766                          adj->ref_nr - ref_nr);
7767                 adj->ref_nr -= ref_nr;
7768                 return;
7769         }
7770
7771         if (adj->master)
7772                 sysfs_remove_link(&(dev->dev.kobj), "master");
7773
7774         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7775                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7776
7777         list_del_rcu(&adj->list);
7778         pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7779                  adj_dev->name, dev->name, adj_dev->name);
7780         dev_put(adj_dev);
7781         kfree_rcu(adj, rcu);
7782 }
7783
7784 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7785                                             struct net_device *upper_dev,
7786                                             struct list_head *up_list,
7787                                             struct list_head *down_list,
7788                                             void *private, bool master)
7789 {
7790         int ret;
7791
7792         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7793                                            private, master);
7794         if (ret)
7795                 return ret;
7796
7797         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7798                                            private, false);
7799         if (ret) {
7800                 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7801                 return ret;
7802         }
7803
7804         return 0;
7805 }
7806
7807 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7808                                                struct net_device *upper_dev,
7809                                                u16 ref_nr,
7810                                                struct list_head *up_list,
7811                                                struct list_head *down_list)
7812 {
7813         __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7814         __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7815 }
7816
7817 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7818                                                 struct net_device *upper_dev,
7819                                                 void *private, bool master)
7820 {
7821         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7822                                                 &dev->adj_list.upper,
7823                                                 &upper_dev->adj_list.lower,
7824                                                 private, master);
7825 }
7826
7827 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7828                                                    struct net_device *upper_dev)
7829 {
7830         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7831                                            &dev->adj_list.upper,
7832                                            &upper_dev->adj_list.lower);
7833 }
7834
7835 static int __netdev_upper_dev_link(struct net_device *dev,
7836                                    struct net_device *upper_dev, bool master,
7837                                    void *upper_priv, void *upper_info,
7838                                    struct netdev_nested_priv *priv,
7839                                    struct netlink_ext_ack *extack)
7840 {
7841         struct netdev_notifier_changeupper_info changeupper_info = {
7842                 .info = {
7843                         .dev = dev,
7844                         .extack = extack,
7845                 },
7846                 .upper_dev = upper_dev,
7847                 .master = master,
7848                 .linking = true,
7849                 .upper_info = upper_info,
7850         };
7851         struct net_device *master_dev;
7852         int ret = 0;
7853
7854         ASSERT_RTNL();
7855
7856         if (dev == upper_dev)
7857                 return -EBUSY;
7858
7859         /* To prevent loops, check if dev is not upper device to upper_dev. */
7860         if (__netdev_has_upper_dev(upper_dev, dev))
7861                 return -EBUSY;
7862
7863         if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7864                 return -EMLINK;
7865
7866         if (!master) {
7867                 if (__netdev_has_upper_dev(dev, upper_dev))
7868                         return -EEXIST;
7869         } else {
7870                 master_dev = __netdev_master_upper_dev_get(dev);
7871                 if (master_dev)
7872                         return master_dev == upper_dev ? -EEXIST : -EBUSY;
7873         }
7874
7875         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7876                                             &changeupper_info.info);
7877         ret = notifier_to_errno(ret);
7878         if (ret)
7879                 return ret;
7880
7881         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7882                                                    master);
7883         if (ret)
7884                 return ret;
7885
7886         ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7887                                             &changeupper_info.info);
7888         ret = notifier_to_errno(ret);
7889         if (ret)
7890                 goto rollback;
7891
7892         __netdev_update_upper_level(dev, NULL);
7893         __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7894
7895         __netdev_update_lower_level(upper_dev, priv);
7896         __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7897                                     priv);
7898
7899         return 0;
7900
7901 rollback:
7902         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7903
7904         return ret;
7905 }
7906
7907 /**
7908  * netdev_upper_dev_link - Add a link to the upper device
7909  * @dev: device
7910  * @upper_dev: new upper device
7911  * @extack: netlink extended ack
7912  *
7913  * Adds a link to device which is upper to this one. The caller must hold
7914  * the RTNL lock. On a failure a negative errno code is returned.
7915  * On success the reference counts are adjusted and the function
7916  * returns zero.
7917  */
7918 int netdev_upper_dev_link(struct net_device *dev,
7919                           struct net_device *upper_dev,
7920                           struct netlink_ext_ack *extack)
7921 {
7922         struct netdev_nested_priv priv = {
7923                 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7924                 .data = NULL,
7925         };
7926
7927         return __netdev_upper_dev_link(dev, upper_dev, false,
7928                                        NULL, NULL, &priv, extack);
7929 }
7930 EXPORT_SYMBOL(netdev_upper_dev_link);
7931
7932 /**
7933  * netdev_master_upper_dev_link - Add a master link to the upper device
7934  * @dev: device
7935  * @upper_dev: new upper device
7936  * @upper_priv: upper device private
7937  * @upper_info: upper info to be passed down via notifier
7938  * @extack: netlink extended ack
7939  *
7940  * Adds a link to device which is upper to this one. In this case, only
7941  * one master upper device can be linked, although other non-master devices
7942  * might be linked as well. The caller must hold the RTNL lock.
7943  * On a failure a negative errno code is returned. On success the reference
7944  * counts are adjusted and the function returns zero.
7945  */
7946 int netdev_master_upper_dev_link(struct net_device *dev,
7947                                  struct net_device *upper_dev,
7948                                  void *upper_priv, void *upper_info,
7949                                  struct netlink_ext_ack *extack)
7950 {
7951         struct netdev_nested_priv priv = {
7952                 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7953                 .data = NULL,
7954         };
7955
7956         return __netdev_upper_dev_link(dev, upper_dev, true,
7957                                        upper_priv, upper_info, &priv, extack);
7958 }
7959 EXPORT_SYMBOL(netdev_master_upper_dev_link);
7960
7961 static void __netdev_upper_dev_unlink(struct net_device *dev,
7962                                       struct net_device *upper_dev,
7963                                       struct netdev_nested_priv *priv)
7964 {
7965         struct netdev_notifier_changeupper_info changeupper_info = {
7966                 .info = {
7967                         .dev = dev,
7968                 },
7969                 .upper_dev = upper_dev,
7970                 .linking = false,
7971         };
7972
7973         ASSERT_RTNL();
7974
7975         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7976
7977         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7978                                       &changeupper_info.info);
7979
7980         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7981
7982         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7983                                       &changeupper_info.info);
7984
7985         __netdev_update_upper_level(dev, NULL);
7986         __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7987
7988         __netdev_update_lower_level(upper_dev, priv);
7989         __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7990                                     priv);
7991 }
7992
7993 /**
7994  * netdev_upper_dev_unlink - Removes a link to upper device
7995  * @dev: device
7996  * @upper_dev: new upper device
7997  *
7998  * Removes a link to device which is upper to this one. The caller must hold
7999  * the RTNL lock.
8000  */
8001 void netdev_upper_dev_unlink(struct net_device *dev,
8002                              struct net_device *upper_dev)
8003 {
8004         struct netdev_nested_priv priv = {
8005                 .flags = NESTED_SYNC_TODO,
8006                 .data = NULL,
8007         };
8008
8009         __netdev_upper_dev_unlink(dev, upper_dev, &priv);
8010 }
8011 EXPORT_SYMBOL(netdev_upper_dev_unlink);
8012
8013 static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
8014                                       struct net_device *lower_dev,
8015                                       bool val)
8016 {
8017         struct netdev_adjacent *adj;
8018
8019         adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
8020         if (adj)
8021                 adj->ignore = val;
8022
8023         adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
8024         if (adj)
8025                 adj->ignore = val;
8026 }
8027
8028 static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
8029                                         struct net_device *lower_dev)
8030 {
8031         __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
8032 }
8033
8034 static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
8035                                        struct net_device *lower_dev)
8036 {
8037         __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
8038 }
8039
8040 int netdev_adjacent_change_prepare(struct net_device *old_dev,
8041                                    struct net_device *new_dev,
8042                                    struct net_device *dev,
8043                                    struct netlink_ext_ack *extack)
8044 {
8045         struct netdev_nested_priv priv = {
8046                 .flags = 0,
8047                 .data = NULL,
8048         };
8049         int err;
8050
8051         if (!new_dev)
8052                 return 0;
8053
8054         if (old_dev && new_dev != old_dev)
8055                 netdev_adjacent_dev_disable(dev, old_dev);
8056         err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
8057                                       extack);
8058         if (err) {
8059                 if (old_dev && new_dev != old_dev)
8060                         netdev_adjacent_dev_enable(dev, old_dev);
8061                 return err;
8062         }
8063
8064         return 0;
8065 }
8066 EXPORT_SYMBOL(netdev_adjacent_change_prepare);
8067
8068 void netdev_adjacent_change_commit(struct net_device *old_dev,
8069                                    struct net_device *new_dev,
8070                                    struct net_device *dev)
8071 {
8072         struct netdev_nested_priv priv = {
8073                 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8074                 .data = NULL,
8075         };
8076
8077         if (!new_dev || !old_dev)
8078                 return;
8079
8080         if (new_dev == old_dev)
8081                 return;
8082
8083         netdev_adjacent_dev_enable(dev, old_dev);
8084         __netdev_upper_dev_unlink(old_dev, dev, &priv);
8085 }
8086 EXPORT_SYMBOL(netdev_adjacent_change_commit);
8087
8088 void netdev_adjacent_change_abort(struct net_device *old_dev,
8089                                   struct net_device *new_dev,
8090                                   struct net_device *dev)
8091 {
8092         struct netdev_nested_priv priv = {
8093                 .flags = 0,
8094                 .data = NULL,
8095         };
8096
8097         if (!new_dev)
8098                 return;
8099
8100         if (old_dev && new_dev != old_dev)
8101                 netdev_adjacent_dev_enable(dev, old_dev);
8102
8103         __netdev_upper_dev_unlink(new_dev, dev, &priv);
8104 }
8105 EXPORT_SYMBOL(netdev_adjacent_change_abort);
8106
8107 /**
8108  * netdev_bonding_info_change - Dispatch event about slave change
8109  * @dev: device
8110  * @bonding_info: info to dispatch
8111  *
8112  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
8113  * The caller must hold the RTNL lock.
8114  */
8115 void netdev_bonding_info_change(struct net_device *dev,
8116                                 struct netdev_bonding_info *bonding_info)
8117 {
8118         struct netdev_notifier_bonding_info info = {
8119                 .info.dev = dev,
8120         };
8121
8122         memcpy(&info.bonding_info, bonding_info,
8123                sizeof(struct netdev_bonding_info));
8124         call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
8125                                       &info.info);
8126 }
8127 EXPORT_SYMBOL(netdev_bonding_info_change);
8128
8129 /**
8130  * netdev_get_xmit_slave - Get the xmit slave of master device
8131  * @dev: device
8132  * @skb: The packet
8133  * @all_slaves: assume all the slaves are active
8134  *
8135  * The reference counters are not incremented so the caller must be
8136  * careful with locks. The caller must hold RCU lock.
8137  * %NULL is returned if no slave is found.
8138  */
8139
8140 struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8141                                          struct sk_buff *skb,
8142                                          bool all_slaves)
8143 {
8144         const struct net_device_ops *ops = dev->netdev_ops;
8145
8146         if (!ops->ndo_get_xmit_slave)
8147                 return NULL;
8148         return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8149 }
8150 EXPORT_SYMBOL(netdev_get_xmit_slave);
8151
8152 static void netdev_adjacent_add_links(struct net_device *dev)
8153 {
8154         struct netdev_adjacent *iter;
8155
8156         struct net *net = dev_net(dev);
8157
8158         list_for_each_entry(iter, &dev->adj_list.upper, list) {
8159                 if (!net_eq(net, dev_net(iter->dev)))
8160                         continue;
8161                 netdev_adjacent_sysfs_add(iter->dev, dev,
8162                                           &iter->dev->adj_list.lower);
8163                 netdev_adjacent_sysfs_add(dev, iter->dev,
8164                                           &dev->adj_list.upper);
8165         }
8166
8167         list_for_each_entry(iter, &dev->adj_list.lower, list) {
8168                 if (!net_eq(net, dev_net(iter->dev)))
8169                         continue;
8170                 netdev_adjacent_sysfs_add(iter->dev, dev,
8171                                           &iter->dev->adj_list.upper);
8172                 netdev_adjacent_sysfs_add(dev, iter->dev,
8173                                           &dev->adj_list.lower);
8174         }
8175 }
8176
8177 static void netdev_adjacent_del_links(struct net_device *dev)
8178 {
8179         struct netdev_adjacent *iter;
8180
8181         struct net *net = dev_net(dev);
8182
8183         list_for_each_entry(iter, &dev->adj_list.upper, list) {
8184                 if (!net_eq(net, dev_net(iter->dev)))
8185                         continue;
8186                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8187                                           &iter->dev->adj_list.lower);
8188                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8189                                           &dev->adj_list.upper);
8190         }
8191
8192         list_for_each_entry(iter, &dev->adj_list.lower, list) {
8193                 if (!net_eq(net, dev_net(iter->dev)))
8194                         continue;
8195                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8196                                           &iter->dev->adj_list.upper);
8197                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8198                                           &dev->adj_list.lower);
8199         }
8200 }
8201
8202 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8203 {
8204         struct netdev_adjacent *iter;
8205
8206         struct net *net = dev_net(dev);
8207
8208         list_for_each_entry(iter, &dev->adj_list.upper, list) {
8209                 if (!net_eq(net, dev_net(iter->dev)))
8210                         continue;
8211                 netdev_adjacent_sysfs_del(iter->dev, oldname,
8212                                           &iter->dev->adj_list.lower);
8213                 netdev_adjacent_sysfs_add(iter->dev, dev,
8214                                           &iter->dev->adj_list.lower);
8215         }
8216
8217         list_for_each_entry(iter, &dev->adj_list.lower, list) {
8218                 if (!net_eq(net, dev_net(iter->dev)))
8219                         continue;
8220                 netdev_adjacent_sysfs_del(iter->dev, oldname,
8221                                           &iter->dev->adj_list.upper);
8222                 netdev_adjacent_sysfs_add(iter->dev, dev,
8223                                           &iter->dev->adj_list.upper);
8224         }
8225 }
8226
8227 void *netdev_lower_dev_get_private(struct net_device *dev,
8228                                    struct net_device *lower_dev)
8229 {
8230         struct netdev_adjacent *lower;
8231
8232         if (!lower_dev)
8233                 return NULL;
8234         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8235         if (!lower)
8236                 return NULL;
8237
8238         return lower->private;
8239 }
8240 EXPORT_SYMBOL(netdev_lower_dev_get_private);
8241
8242
8243 /**
8244  * netdev_lower_change - Dispatch event about lower device state change
8245  * @lower_dev: device
8246  * @lower_state_info: state to dispatch
8247  *
8248  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8249  * The caller must hold the RTNL lock.
8250  */
8251 void netdev_lower_state_changed(struct net_device *lower_dev,
8252                                 void *lower_state_info)
8253 {
8254         struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8255                 .info.dev = lower_dev,
8256         };
8257
8258         ASSERT_RTNL();
8259         changelowerstate_info.lower_state_info = lower_state_info;
8260         call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8261                                       &changelowerstate_info.info);
8262 }
8263 EXPORT_SYMBOL(netdev_lower_state_changed);
8264
8265 static void dev_change_rx_flags(struct net_device *dev, int flags)
8266 {
8267         const struct net_device_ops *ops = dev->netdev_ops;
8268
8269         if (ops->ndo_change_rx_flags)
8270                 ops->ndo_change_rx_flags(dev, flags);
8271 }
8272
8273 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8274 {
8275         unsigned int old_flags = dev->flags;
8276         kuid_t uid;
8277         kgid_t gid;
8278
8279         ASSERT_RTNL();
8280
8281         dev->flags |= IFF_PROMISC;
8282         dev->promiscuity += inc;
8283         if (dev->promiscuity == 0) {
8284                 /*
8285                  * Avoid overflow.
8286                  * If inc causes overflow, untouch promisc and return error.
8287                  */
8288                 if (inc < 0)
8289                         dev->flags &= ~IFF_PROMISC;
8290                 else {
8291                         dev->promiscuity -= inc;
8292                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
8293                                 dev->name);
8294                         return -EOVERFLOW;
8295                 }
8296         }
8297         if (dev->flags != old_flags) {
8298                 pr_info("device %s %s promiscuous mode\n",
8299                         dev->name,
8300                         dev->flags & IFF_PROMISC ? "entered" : "left");
8301                 if (audit_enabled) {
8302                         current_uid_gid(&uid, &gid);
8303                         audit_log(audit_context(), GFP_ATOMIC,
8304                                   AUDIT_ANOM_PROMISCUOUS,
8305                                   "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8306                                   dev->name, (dev->flags & IFF_PROMISC),
8307                                   (old_flags & IFF_PROMISC),
8308                                   from_kuid(&init_user_ns, audit_get_loginuid(current)),
8309                                   from_kuid(&init_user_ns, uid),
8310                                   from_kgid(&init_user_ns, gid),
8311                                   audit_get_sessionid(current));
8312                 }
8313
8314                 dev_change_rx_flags(dev, IFF_PROMISC);
8315         }
8316         if (notify)
8317                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
8318         return 0;
8319 }
8320
8321 /**
8322  *      dev_set_promiscuity     - update promiscuity count on a device
8323  *      @dev: device
8324  *      @inc: modifier
8325  *
8326  *      Add or remove promiscuity from a device. While the count in the device
8327  *      remains above zero the interface remains promiscuous. Once it hits zero
8328  *      the device reverts back to normal filtering operation. A negative inc
8329  *      value is used to drop promiscuity on the device.
8330  *      Return 0 if successful or a negative errno code on error.
8331  */
8332 int dev_set_promiscuity(struct net_device *dev, int inc)
8333 {
8334         unsigned int old_flags = dev->flags;
8335         int err;
8336
8337         err = __dev_set_promiscuity(dev, inc, true);
8338         if (err < 0)
8339                 return err;
8340         if (dev->flags != old_flags)
8341                 dev_set_rx_mode(dev);
8342         return err;
8343 }
8344 EXPORT_SYMBOL(dev_set_promiscuity);
8345
8346 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8347 {
8348         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8349
8350         ASSERT_RTNL();
8351
8352         dev->flags |= IFF_ALLMULTI;
8353         dev->allmulti += inc;
8354         if (dev->allmulti == 0) {
8355                 /*
8356                  * Avoid overflow.
8357                  * If inc causes overflow, untouch allmulti and return error.
8358                  */
8359                 if (inc < 0)
8360                         dev->flags &= ~IFF_ALLMULTI;
8361                 else {
8362                         dev->allmulti -= inc;
8363                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
8364                                 dev->name);
8365                         return -EOVERFLOW;
8366                 }
8367         }
8368         if (dev->flags ^ old_flags) {
8369                 dev_change_rx_flags(dev, IFF_ALLMULTI);
8370                 dev_set_rx_mode(dev);
8371                 if (notify)
8372                         __dev_notify_flags(dev, old_flags,
8373                                            dev->gflags ^ old_gflags);
8374         }
8375         return 0;
8376 }
8377
8378 /**
8379  *      dev_set_allmulti        - update allmulti count on a device
8380  *      @dev: device
8381  *      @inc: modifier
8382  *
8383  *      Add or remove reception of all multicast frames to a device. While the
8384  *      count in the device remains above zero the interface remains listening
8385  *      to all interfaces. Once it hits zero the device reverts back to normal
8386  *      filtering operation. A negative @inc value is used to drop the counter
8387  *      when releasing a resource needing all multicasts.
8388  *      Return 0 if successful or a negative errno code on error.
8389  */
8390
8391 int dev_set_allmulti(struct net_device *dev, int inc)
8392 {
8393         return __dev_set_allmulti(dev, inc, true);
8394 }
8395 EXPORT_SYMBOL(dev_set_allmulti);
8396
8397 /*
8398  *      Upload unicast and multicast address lists to device and
8399  *      configure RX filtering. When the device doesn't support unicast
8400  *      filtering it is put in promiscuous mode while unicast addresses
8401  *      are present.
8402  */
8403 void __dev_set_rx_mode(struct net_device *dev)
8404 {
8405         const struct net_device_ops *ops = dev->netdev_ops;
8406
8407         /* dev_open will call this function so the list will stay sane. */
8408         if (!(dev->flags&IFF_UP))
8409                 return;
8410
8411         if (!netif_device_present(dev))
8412                 return;
8413
8414         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8415                 /* Unicast addresses changes may only happen under the rtnl,
8416                  * therefore calling __dev_set_promiscuity here is safe.
8417                  */
8418                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8419                         __dev_set_promiscuity(dev, 1, false);
8420                         dev->uc_promisc = true;
8421                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8422                         __dev_set_promiscuity(dev, -1, false);
8423                         dev->uc_promisc = false;
8424                 }
8425         }
8426
8427         if (ops->ndo_set_rx_mode)
8428                 ops->ndo_set_rx_mode(dev);
8429 }
8430
/*
 *      Run __dev_set_rx_mode() for @dev between netif_addr_lock_bh()
 *      and netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);

        __dev_set_rx_mode(dev);

        netif_addr_unlock_bh(dev);
}
8437
8438 /**
8439  *      dev_get_flags - get flags reported to userspace
8440  *      @dev: device
8441  *
8442  *      Get the combination of flag bits exported through APIs to userspace.
8443  */
8444 unsigned int dev_get_flags(const struct net_device *dev)
8445 {
8446         unsigned int flags;
8447
8448         flags = (dev->flags & ~(IFF_PROMISC |
8449                                 IFF_ALLMULTI |
8450                                 IFF_RUNNING |
8451                                 IFF_LOWER_UP |
8452                                 IFF_DORMANT)) |
8453                 (dev->gflags & (IFF_PROMISC |
8454                                 IFF_ALLMULTI));
8455
8456         if (netif_running(dev)) {
8457                 if (netif_oper_up(dev))
8458                         flags |= IFF_RUNNING;
8459                 if (netif_carrier_ok(dev))
8460                         flags |= IFF_LOWER_UP;
8461                 if (netif_dormant(dev))
8462                         flags |= IFF_DORMANT;
8463         }
8464
8465         return flags;
8466 }
8467 EXPORT_SYMBOL(dev_get_flags);
8468
8469 int __dev_change_flags(struct net_device *dev, unsigned int flags,
8470                        struct netlink_ext_ack *extack)
8471 {
8472         unsigned int old_flags = dev->flags;
8473         int ret;
8474
8475         ASSERT_RTNL();
8476
8477         /*
8478          *      Set the flags on our device.
8479          */
8480
8481         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8482                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8483                                IFF_AUTOMEDIA)) |
8484                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8485                                     IFF_ALLMULTI));
8486
8487         /*
8488          *      Load in the correct multicast list now the flags have changed.
8489          */
8490
8491         if ((old_flags ^ flags) & IFF_MULTICAST)
8492                 dev_change_rx_flags(dev, IFF_MULTICAST);
8493
8494         dev_set_rx_mode(dev);
8495
8496         /*
8497          *      Have we downed the interface. We handle IFF_UP ourselves
8498          *      according to user attempts to set it, rather than blindly
8499          *      setting it.
8500          */
8501
8502         ret = 0;
8503         if ((old_flags ^ flags) & IFF_UP) {
8504                 if (old_flags & IFF_UP)
8505                         __dev_close(dev);
8506                 else
8507                         ret = __dev_open(dev, extack);
8508         }
8509
8510         if ((flags ^ dev->gflags) & IFF_PROMISC) {
8511                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
8512                 unsigned int old_flags = dev->flags;
8513
8514                 dev->gflags ^= IFF_PROMISC;
8515
8516                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
8517                         if (dev->flags != old_flags)
8518                                 dev_set_rx_mode(dev);
8519         }
8520
8521         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8522          * is important. Some (broken) drivers set IFF_PROMISC, when
8523          * IFF_ALLMULTI is requested not asking us and not reporting.
8524          */
8525         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8526                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8527
8528                 dev->gflags ^= IFF_ALLMULTI;
8529                 __dev_set_allmulti(dev, inc, false);
8530         }
8531
8532         return ret;
8533 }
8534
8535 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8536                         unsigned int gchanges)
8537 {
8538         unsigned int changes = dev->flags ^ old_flags;
8539
8540         if (gchanges)
8541                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
8542
8543         if (changes & IFF_UP) {
8544                 if (dev->flags & IFF_UP)
8545                         call_netdevice_notifiers(NETDEV_UP, dev);
8546                 else
8547                         call_netdevice_notifiers(NETDEV_DOWN, dev);
8548         }
8549
8550         if (dev->flags & IFF_UP &&
8551             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8552                 struct netdev_notifier_change_info change_info = {
8553                         .info = {
8554                                 .dev = dev,
8555                         },
8556                         .flags_changed = changes,
8557                 };
8558
8559                 call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8560         }
8561 }
8562
8563 /**
8564  *      dev_change_flags - change device settings
8565  *      @dev: device
8566  *      @flags: device state flags
8567  *      @extack: netlink extended ack
8568  *
8569  *      Change settings on device based state flags. The flags are
8570  *      in the userspace exported format.
8571  */
8572 int dev_change_flags(struct net_device *dev, unsigned int flags,
8573                      struct netlink_ext_ack *extack)
8574 {
8575         int ret;
8576         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8577
8578         ret = __dev_change_flags(dev, flags, extack);
8579         if (ret < 0)
8580                 return ret;
8581
8582         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8583         __dev_notify_flags(dev, old_flags, changes);
8584         return ret;
8585 }
8586 EXPORT_SYMBOL(dev_change_flags);
8587
8588 int __dev_set_mtu(struct net_device *dev, int new_mtu)
8589 {
8590         const struct net_device_ops *ops = dev->netdev_ops;
8591
8592         if (ops->ndo_change_mtu)
8593                 return ops->ndo_change_mtu(dev, new_mtu);
8594
8595         /* Pairs with all the lockless reads of dev->mtu in the stack */
8596         WRITE_ONCE(dev->mtu, new_mtu);
8597         return 0;
8598 }
8599 EXPORT_SYMBOL(__dev_set_mtu);
8600
8601 int dev_validate_mtu(struct net_device *dev, int new_mtu,
8602                      struct netlink_ext_ack *extack)
8603 {
8604         /* MTU must be positive, and in range */
8605         if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8606                 NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8607                 return -EINVAL;
8608         }
8609
8610         if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8611                 NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8612                 return -EINVAL;
8613         }
8614         return 0;
8615 }
8616
8617 /**
8618  *      dev_set_mtu_ext - Change maximum transfer unit
8619  *      @dev: device
8620  *      @new_mtu: new transfer unit
8621  *      @extack: netlink extended ack
8622  *
8623  *      Change the maximum transfer size of the network device.
8624  */
8625 int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8626                     struct netlink_ext_ack *extack)
8627 {
8628         int err, orig_mtu;
8629
8630         if (new_mtu == dev->mtu)
8631                 return 0;
8632
8633         err = dev_validate_mtu(dev, new_mtu, extack);
8634         if (err)
8635                 return err;
8636
8637         if (!netif_device_present(dev))
8638                 return -ENODEV;
8639
8640         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8641         err = notifier_to_errno(err);
8642         if (err)
8643                 return err;
8644
8645         orig_mtu = dev->mtu;
8646         err = __dev_set_mtu(dev, new_mtu);
8647
8648         if (!err) {
8649                 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8650                                                    orig_mtu);
8651                 err = notifier_to_errno(err);
8652                 if (err) {
8653                         /* setting mtu back and notifying everyone again,
8654                          * so that they have a chance to revert changes.
8655                          */
8656                         __dev_set_mtu(dev, orig_mtu);
8657                         call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8658                                                      new_mtu);
8659                 }
8660         }
8661         return err;
8662 }
8663
8664 int dev_set_mtu(struct net_device *dev, int new_mtu)
8665 {
8666         struct netlink_ext_ack extack;
8667         int err;
8668
8669         memset(&extack, 0, sizeof(extack));
8670         err = dev_set_mtu_ext(dev, new_mtu, &extack);
8671         if (err && extack._msg)
8672                 net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8673         return err;
8674 }
8675 EXPORT_SYMBOL(dev_set_mtu);
8676
8677 /**
8678  *      dev_change_tx_queue_len - Change TX queue length of a netdevice
8679  *      @dev: device
8680  *      @new_len: new tx queue length
8681  */
8682 int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8683 {
8684         unsigned int orig_len = dev->tx_queue_len;
8685         int res;
8686
8687         if (new_len != (unsigned int)new_len)
8688                 return -ERANGE;
8689
8690         if (new_len != orig_len) {
8691                 dev->tx_queue_len = new_len;
8692                 res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8693                 res = notifier_to_errno(res);
8694                 if (res)
8695                         goto err_rollback;
8696                 res = dev_qdisc_change_tx_queue_len(dev);
8697                 if (res)
8698                         goto err_rollback;
8699         }
8700
8701         return 0;
8702
8703 err_rollback:
8704         netdev_err(dev, "refused to change device tx_queue_len\n");
8705         dev->tx_queue_len = orig_len;
8706         return res;
8707 }
8708
8709 /**
8710  *      dev_set_group - Change group this device belongs to
8711  *      @dev: device
8712  *      @new_group: group this device should belong to
8713  */
8714 void dev_set_group(struct net_device *dev, int new_group)
8715 {
8716         dev->group = new_group;
8717 }
8718 EXPORT_SYMBOL(dev_set_group);
8719
8720 /**
8721  *      dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8722  *      @dev: device
8723  *      @addr: new address
8724  *      @extack: netlink extended ack
8725  */
8726 int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8727                               struct netlink_ext_ack *extack)
8728 {
8729         struct netdev_notifier_pre_changeaddr_info info = {
8730                 .info.dev = dev,
8731                 .info.extack = extack,
8732                 .dev_addr = addr,
8733         };
8734         int rc;
8735
8736         rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8737         return notifier_to_errno(rc);
8738 }
8739 EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8740
8741 /**
8742  *      dev_set_mac_address - Change Media Access Control Address
8743  *      @dev: device
8744  *      @sa: new address
8745  *      @extack: netlink extended ack
8746  *
8747  *      Change the hardware (MAC) address of the device
8748  */
8749 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8750                         struct netlink_ext_ack *extack)
8751 {
8752         const struct net_device_ops *ops = dev->netdev_ops;
8753         int err;
8754
8755         if (!ops->ndo_set_mac_address)
8756                 return -EOPNOTSUPP;
8757         if (sa->sa_family != dev->type)
8758                 return -EINVAL;
8759         if (!netif_device_present(dev))
8760                 return -ENODEV;
8761         err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8762         if (err)
8763                 return err;
8764         err = ops->ndo_set_mac_address(dev, sa);
8765         if (err)
8766                 return err;
8767         dev->addr_assign_type = NET_ADDR_SET;
8768         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8769         add_device_randomness(dev->dev_addr, dev->addr_len);
8770         return 0;
8771 }
8772 EXPORT_SYMBOL(dev_set_mac_address);
8773
8774 static DECLARE_RWSEM(dev_addr_sem);
8775
8776 int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
8777                              struct netlink_ext_ack *extack)
8778 {
8779         int ret;
8780
8781         down_write(&dev_addr_sem);
8782         ret = dev_set_mac_address(dev, sa, extack);
8783         up_write(&dev_addr_sem);
8784         return ret;
8785 }
8786 EXPORT_SYMBOL(dev_set_mac_address_user);
8787
8788 int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
8789 {
8790         size_t size = sizeof(sa->sa_data_min);
8791         struct net_device *dev;
8792         int ret = 0;
8793
8794         down_read(&dev_addr_sem);
8795         rcu_read_lock();
8796
8797         dev = dev_get_by_name_rcu(net, dev_name);
8798         if (!dev) {
8799                 ret = -ENODEV;
8800                 goto unlock;
8801         }
8802         if (!dev->addr_len)
8803                 memset(sa->sa_data, 0, size);
8804         else
8805                 memcpy(sa->sa_data, dev->dev_addr,
8806                        min_t(size_t, size, dev->addr_len));
8807         sa->sa_family = dev->type;
8808
8809 unlock:
8810         rcu_read_unlock();
8811         up_read(&dev_addr_sem);
8812         return ret;
8813 }
8814 EXPORT_SYMBOL(dev_get_mac_address);
8815
8816 /**
8817  *      dev_change_carrier - Change device carrier
8818  *      @dev: device
8819  *      @new_carrier: new value
8820  *
8821  *      Change device carrier
8822  */
8823 int dev_change_carrier(struct net_device *dev, bool new_carrier)
8824 {
8825         const struct net_device_ops *ops = dev->netdev_ops;
8826
8827         if (!ops->ndo_change_carrier)
8828                 return -EOPNOTSUPP;
8829         if (!netif_device_present(dev))
8830                 return -ENODEV;
8831         return ops->ndo_change_carrier(dev, new_carrier);
8832 }
8833 EXPORT_SYMBOL(dev_change_carrier);
8834
8835 /**
8836  *      dev_get_phys_port_id - Get device physical port ID
8837  *      @dev: device
8838  *      @ppid: port ID
8839  *
8840  *      Get device physical port ID
8841  */
8842 int dev_get_phys_port_id(struct net_device *dev,
8843                          struct netdev_phys_item_id *ppid)
8844 {
8845         const struct net_device_ops *ops = dev->netdev_ops;
8846
8847         if (!ops->ndo_get_phys_port_id)
8848                 return -EOPNOTSUPP;
8849         return ops->ndo_get_phys_port_id(dev, ppid);
8850 }
8851 EXPORT_SYMBOL(dev_get_phys_port_id);
8852
8853 /**
8854  *      dev_get_phys_port_name - Get device physical port name
8855  *      @dev: device
8856  *      @name: port name
8857  *      @len: limit of bytes to copy to name
8858  *
8859  *      Get device physical port name
8860  */
8861 int dev_get_phys_port_name(struct net_device *dev,
8862                            char *name, size_t len)
8863 {
8864         const struct net_device_ops *ops = dev->netdev_ops;
8865         int err;
8866
8867         if (ops->ndo_get_phys_port_name) {
8868                 err = ops->ndo_get_phys_port_name(dev, name, len);
8869                 if (err != -EOPNOTSUPP)
8870                         return err;
8871         }
8872         return devlink_compat_phys_port_name_get(dev, name, len);
8873 }
8874 EXPORT_SYMBOL(dev_get_phys_port_name);
8875
8876 /**
8877  *      dev_get_port_parent_id - Get the device's port parent identifier
8878  *      @dev: network device
8879  *      @ppid: pointer to a storage for the port's parent identifier
8880  *      @recurse: allow/disallow recursion to lower devices
8881  *
8882  *      Get the devices's port parent identifier
8883  */
8884 int dev_get_port_parent_id(struct net_device *dev,
8885                            struct netdev_phys_item_id *ppid,
8886                            bool recurse)
8887 {
8888         const struct net_device_ops *ops = dev->netdev_ops;
8889         struct netdev_phys_item_id first = { };
8890         struct net_device *lower_dev;
8891         struct list_head *iter;
8892         int err;
8893
8894         if (ops->ndo_get_port_parent_id) {
8895                 err = ops->ndo_get_port_parent_id(dev, ppid);
8896                 if (err != -EOPNOTSUPP)
8897                         return err;
8898         }
8899
8900         err = devlink_compat_switch_id_get(dev, ppid);
8901         if (!err || err != -EOPNOTSUPP)
8902                 return err;
8903
8904         if (!recurse)
8905                 return -EOPNOTSUPP;
8906
8907         netdev_for_each_lower_dev(dev, lower_dev, iter) {
8908                 err = dev_get_port_parent_id(lower_dev, ppid, recurse);
8909                 if (err)
8910                         break;
8911                 if (!first.id_len)
8912                         first = *ppid;
8913                 else if (memcmp(&first, ppid, sizeof(*ppid)))
8914                         return -EOPNOTSUPP;
8915         }
8916
8917         return err;
8918 }
8919 EXPORT_SYMBOL(dev_get_port_parent_id);
8920
8921 /**
8922  *      netdev_port_same_parent_id - Indicate if two network devices have
8923  *      the same port parent identifier
8924  *      @a: first network device
8925  *      @b: second network device
8926  */
8927 bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
8928 {
8929         struct netdev_phys_item_id a_id = { };
8930         struct netdev_phys_item_id b_id = { };
8931
8932         if (dev_get_port_parent_id(a, &a_id, true) ||
8933             dev_get_port_parent_id(b, &b_id, true))
8934                 return false;
8935
8936         return netdev_phys_item_id_same(&a_id, &b_id);
8937 }
8938 EXPORT_SYMBOL(netdev_port_same_parent_id);
8939
8940 /**
8941  *      dev_change_proto_down - update protocol port state information
8942  *      @dev: device
8943  *      @proto_down: new value
8944  *
8945  *      This info can be used by switch drivers to set the phys state of the
8946  *      port.
8947  */
8948 int dev_change_proto_down(struct net_device *dev, bool proto_down)
8949 {
8950         const struct net_device_ops *ops = dev->netdev_ops;
8951
8952         if (!ops->ndo_change_proto_down)
8953                 return -EOPNOTSUPP;
8954         if (!netif_device_present(dev))
8955                 return -ENODEV;
8956         return ops->ndo_change_proto_down(dev, proto_down);
8957 }
8958 EXPORT_SYMBOL(dev_change_proto_down);
8959
8960 /**
8961  *      dev_change_proto_down_generic - generic implementation for
8962  *      ndo_change_proto_down that sets carrier according to
8963  *      proto_down.
8964  *
8965  *      @dev: device
8966  *      @proto_down: new value
8967  */
8968 int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
8969 {
8970         if (proto_down)
8971                 netif_carrier_off(dev);
8972         else
8973                 netif_carrier_on(dev);
8974         dev->proto_down = proto_down;
8975         return 0;
8976 }
8977 EXPORT_SYMBOL(dev_change_proto_down_generic);
8978
8979 /**
8980  *      dev_change_proto_down_reason - proto down reason
8981  *
8982  *      @dev: device
8983  *      @mask: proto down mask
8984  *      @value: proto down value
8985  */
8986 void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
8987                                   u32 value)
8988 {
8989         int b;
8990
8991         if (!mask) {
8992                 dev->proto_down_reason = value;
8993         } else {
8994                 for_each_set_bit(b, &mask, 32) {
8995                         if (value & (1 << b))
8996                                 dev->proto_down_reason |= BIT(b);
8997                         else
8998                                 dev->proto_down_reason &= ~BIT(b);
8999                 }
9000         }
9001 }
9002 EXPORT_SYMBOL(dev_change_proto_down_reason);
9003
9004 struct bpf_xdp_link {
9005         struct bpf_link link;
9006         struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
9007         int flags;
9008 };
9009
9010 static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9011 {
9012         if (flags & XDP_FLAGS_HW_MODE)
9013                 return XDP_MODE_HW;
9014         if (flags & XDP_FLAGS_DRV_MODE)
9015                 return XDP_MODE_DRV;
9016         if (flags & XDP_FLAGS_SKB_MODE)
9017                 return XDP_MODE_SKB;
9018         return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
9019 }
9020
9021 static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9022 {
9023         switch (mode) {
9024         case XDP_MODE_SKB:
9025                 return generic_xdp_install;
9026         case XDP_MODE_DRV:
9027         case XDP_MODE_HW:
9028                 return dev->netdev_ops->ndo_bpf;
9029         default:
9030                 return NULL;
9031         };
9032 }
9033
9034 static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9035                                          enum bpf_xdp_mode mode)
9036 {
9037         return dev->xdp_state[mode].link;
9038 }
9039
9040 static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9041                                      enum bpf_xdp_mode mode)
9042 {
9043         struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9044
9045         if (link)
9046                 return link->link.prog;
9047         return dev->xdp_state[mode].prog;
9048 }
9049
9050 static u8 dev_xdp_prog_count(struct net_device *dev)
9051 {
9052         u8 count = 0;
9053         int i;
9054
9055         for (i = 0; i < __MAX_XDP_MODE; i++)
9056                 if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9057                         count++;
9058         return count;
9059 }
9060
9061 u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9062 {
9063         struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9064
9065         return prog ? prog->aux->id : 0;
9066 }
9067
9068 static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9069                              struct bpf_xdp_link *link)
9070 {
9071         dev->xdp_state[mode].link = link;
9072         dev->xdp_state[mode].prog = NULL;
9073 }
9074
9075 static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9076                              struct bpf_prog *prog)
9077 {
9078         dev->xdp_state[mode].link = NULL;
9079         dev->xdp_state[mode].prog = prog;
9080 }
9081
9082 static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9083                            bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9084                            u32 flags, struct bpf_prog *prog)
9085 {
9086         struct netdev_bpf xdp;
9087         int err;
9088
9089         memset(&xdp, 0, sizeof(xdp));
9090         xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
9091         xdp.extack = extack;
9092         xdp.flags = flags;
9093         xdp.prog = prog;
9094
9095         /* Drivers assume refcnt is already incremented (i.e, prog pointer is
9096          * "moved" into driver), so they don't increment it on their own, but
9097          * they do decrement refcnt when program is detached or replaced.
9098          * Given net_device also owns link/prog, we need to bump refcnt here
9099          * to prevent drivers from underflowing it.
9100          */
9101         if (prog)
9102                 bpf_prog_inc(prog);
9103         err = bpf_op(dev, &xdp);
9104         if (err) {
9105                 if (prog)
9106                         bpf_prog_put(prog);
9107                 return err;
9108         }
9109
9110         if (mode != XDP_MODE_HW)
9111                 bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9112
9113         return 0;
9114 }
9115
9116 static void dev_xdp_uninstall(struct net_device *dev)
9117 {
9118         struct bpf_xdp_link *link;
9119         struct bpf_prog *prog;
9120         enum bpf_xdp_mode mode;
9121         bpf_op_t bpf_op;
9122
9123         ASSERT_RTNL();
9124
9125         for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9126                 prog = dev_xdp_prog(dev, mode);
9127                 if (!prog)
9128                         continue;
9129
9130                 bpf_op = dev_xdp_bpf_op(dev, mode);
9131                 if (!bpf_op)
9132                         continue;
9133
9134                 WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9135
9136                 /* auto-detach link from net device */
9137                 link = dev_xdp_link(dev, mode);
9138                 if (link)
9139                         link->dev = NULL;
9140                 else
9141                         bpf_prog_put(prog);
9142
9143                 dev_xdp_set_link(dev, mode, NULL);
9144         }
9145 }
9146
9147 static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9148                           struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9149                           struct bpf_prog *old_prog, u32 flags)
9150 {
9151         unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9152         struct bpf_prog *cur_prog;
9153         enum bpf_xdp_mode mode;
9154         bpf_op_t bpf_op;
9155         int err;
9156
9157         ASSERT_RTNL();
9158
9159         /* either link or prog attachment, never both */
9160         if (link && (new_prog || old_prog))
9161                 return -EINVAL;
9162         /* link supports only XDP mode flags */
9163         if (link && (flags & ~XDP_FLAGS_MODES)) {
9164                 NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9165                 return -EINVAL;
9166         }
9167         /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9168         if (num_modes > 1) {
9169                 NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9170                 return -EINVAL;
9171         }
9172         /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9173         if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9174                 NL_SET_ERR_MSG(extack,
9175                                "More than one program loaded, unset mode is ambiguous");
9176                 return -EINVAL;
9177         }
9178         /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9179         if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9180                 NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9181                 return -EINVAL;
9182         }
9183
9184         mode = dev_xdp_mode(dev, flags);
9185         /* can't replace attached link */
9186         if (dev_xdp_link(dev, mode)) {
9187                 NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9188                 return -EBUSY;
9189         }
9190
9191         cur_prog = dev_xdp_prog(dev, mode);
9192         /* can't replace attached prog with link */
9193         if (link && cur_prog) {
9194                 NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9195                 return -EBUSY;
9196         }
9197         if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9198                 NL_SET_ERR_MSG(extack, "Active program does not match expected");
9199                 return -EEXIST;
9200         }
9201
9202         /* put effective new program into new_prog */
9203         if (link)
9204                 new_prog = link->link.prog;
9205
9206         if (new_prog) {
9207                 bool offload = mode == XDP_MODE_HW;
9208                 enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9209                                                ? XDP_MODE_DRV : XDP_MODE_SKB;
9210
9211                 if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9212                         NL_SET_ERR_MSG(extack, "XDP program already attached");
9213                         return -EBUSY;
9214                 }
9215                 if (!offload && dev_xdp_prog(dev, other_mode)) {
9216                         NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9217                         return -EEXIST;
9218                 }
9219                 if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
9220                         NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
9221                         return -EINVAL;
9222                 }
9223                 if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9224                         NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9225                         return -EINVAL;
9226                 }
9227                 if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9228                         NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9229                         return -EINVAL;
9230                 }
9231         }
9232
9233         /* don't call drivers if the effective program didn't change */
9234         if (new_prog != cur_prog) {
9235                 bpf_op = dev_xdp_bpf_op(dev, mode);
9236                 if (!bpf_op) {
9237                         NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9238                         return -EOPNOTSUPP;
9239                 }
9240
9241                 err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9242                 if (err)
9243                         return err;
9244         }
9245
9246         if (link)
9247                 dev_xdp_set_link(dev, mode, link);
9248         else
9249                 dev_xdp_set_prog(dev, mode, new_prog);
9250         if (cur_prog)
9251                 bpf_prog_put(cur_prog);
9252
9253         return 0;
9254 }
9255
9256 static int dev_xdp_attach_link(struct net_device *dev,
9257                                struct netlink_ext_ack *extack,
9258                                struct bpf_xdp_link *link)
9259 {
9260         return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9261 }
9262
9263 static int dev_xdp_detach_link(struct net_device *dev,
9264                                struct netlink_ext_ack *extack,
9265                                struct bpf_xdp_link *link)
9266 {
9267         enum bpf_xdp_mode mode;
9268         bpf_op_t bpf_op;
9269
9270         ASSERT_RTNL();
9271
9272         mode = dev_xdp_mode(dev, link->flags);
9273         if (dev_xdp_link(dev, mode) != link)
9274                 return -EINVAL;
9275
9276         bpf_op = dev_xdp_bpf_op(dev, mode);
9277         WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9278         dev_xdp_set_link(dev, mode, NULL);
9279         return 0;
9280 }
9281
9282 static void bpf_xdp_link_release(struct bpf_link *link)
9283 {
9284         struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9285
9286         rtnl_lock();
9287
9288         /* if racing with net_device's tear down, xdp_link->dev might be
9289          * already NULL, in which case link was already auto-detached
9290          */
9291         if (xdp_link->dev) {
9292                 WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9293                 xdp_link->dev = NULL;
9294         }
9295
9296         rtnl_unlock();
9297 }
9298
/* bpf_link_ops.detach: same work as release; always reports success. */
static int bpf_xdp_link_detach(struct bpf_link *link)
{
	bpf_xdp_link_release(link);

	return 0;
}
9304
9305 static void bpf_xdp_link_dealloc(struct bpf_link *link)
9306 {
9307         struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9308
9309         kfree(xdp_link);
9310 }
9311
9312 static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9313                                      struct seq_file *seq)
9314 {
9315         struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9316         u32 ifindex = 0;
9317
9318         rtnl_lock();
9319         if (xdp_link->dev)
9320                 ifindex = xdp_link->dev->ifindex;
9321         rtnl_unlock();
9322
9323         seq_printf(seq, "ifindex:\t%u\n", ifindex);
9324 }
9325
9326 static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9327                                        struct bpf_link_info *info)
9328 {
9329         struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9330         u32 ifindex = 0;
9331
9332         rtnl_lock();
9333         if (xdp_link->dev)
9334                 ifindex = xdp_link->dev->ifindex;
9335         rtnl_unlock();
9336
9337         info->xdp.ifindex = ifindex;
9338         return 0;
9339 }
9340
9341 static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9342                                struct bpf_prog *old_prog)
9343 {
9344         struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9345         enum bpf_xdp_mode mode;
9346         bpf_op_t bpf_op;
9347         int err = 0;
9348
9349         rtnl_lock();
9350
9351         /* link might have been auto-released already, so fail */
9352         if (!xdp_link->dev) {
9353                 err = -ENOLINK;
9354                 goto out_unlock;
9355         }
9356
9357         if (old_prog && link->prog != old_prog) {
9358                 err = -EPERM;
9359                 goto out_unlock;
9360         }
9361         old_prog = link->prog;
9362         if (old_prog->type != new_prog->type ||
9363             old_prog->expected_attach_type != new_prog->expected_attach_type) {
9364                 err = -EINVAL;
9365                 goto out_unlock;
9366         }
9367
9368         if (old_prog == new_prog) {
9369                 /* no-op, don't disturb drivers */
9370                 bpf_prog_put(new_prog);
9371                 goto out_unlock;
9372         }
9373
9374         mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9375         bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9376         err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9377                               xdp_link->flags, new_prog);
9378         if (err)
9379                 goto out_unlock;
9380
9381         old_prog = xchg(&link->prog, new_prog);
9382         bpf_prog_put(old_prog);
9383
9384 out_unlock:
9385         rtnl_unlock();
9386         return err;
9387 }
9388
9389 static const struct bpf_link_ops bpf_xdp_link_lops = {
9390         .release = bpf_xdp_link_release,
9391         .dealloc = bpf_xdp_link_dealloc,
9392         .detach = bpf_xdp_link_detach,
9393         .show_fdinfo = bpf_xdp_link_show_fdinfo,
9394         .fill_link_info = bpf_xdp_link_fill_link_info,
9395         .update_prog = bpf_xdp_link_update,
9396 };
9397
9398 int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9399 {
9400         struct net *net = current->nsproxy->net_ns;
9401         struct bpf_link_primer link_primer;
9402         struct bpf_xdp_link *link;
9403         struct net_device *dev;
9404         int err, fd;
9405
9406         rtnl_lock();
9407         dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9408         if (!dev) {
9409                 rtnl_unlock();
9410                 return -EINVAL;
9411         }
9412
9413         link = kzalloc(sizeof(*link), GFP_USER);
9414         if (!link) {
9415                 err = -ENOMEM;
9416                 goto unlock;
9417         }
9418
9419         bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9420         link->dev = dev;
9421         link->flags = attr->link_create.flags;
9422
9423         err = bpf_link_prime(&link->link, &link_primer);
9424         if (err) {
9425                 kfree(link);
9426                 goto unlock;
9427         }
9428
9429         err = dev_xdp_attach_link(dev, NULL, link);
9430         rtnl_unlock();
9431
9432         if (err) {
9433                 link->dev = NULL;
9434                 bpf_link_cleanup(&link_primer);
9435                 goto out_put_dev;
9436         }
9437
9438         fd = bpf_link_settle(&link_primer);
9439         /* link itself doesn't hold dev's refcnt to not complicate shutdown */
9440         dev_put(dev);
9441         return fd;
9442
9443 unlock:
9444         rtnl_unlock();
9445
9446 out_put_dev:
9447         dev_put(dev);
9448         return err;
9449 }
9450
9451 /**
9452  *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
9453  *      @dev: device
9454  *      @extack: netlink extended ack
9455  *      @fd: new program fd or negative value to clear
9456  *      @expected_fd: old program fd that userspace expects to replace or clear
9457  *      @flags: xdp-related flags
9458  *
9459  *      Set or clear a bpf program for a device
9460  */
9461 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
9462                       int fd, int expected_fd, u32 flags)
9463 {
9464         enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9465         struct bpf_prog *new_prog = NULL, *old_prog = NULL;
9466         int err;
9467
9468         ASSERT_RTNL();
9469
9470         if (fd >= 0) {
9471                 new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9472                                                  mode != XDP_MODE_SKB);
9473                 if (IS_ERR(new_prog))
9474                         return PTR_ERR(new_prog);
9475         }
9476
9477         if (expected_fd >= 0) {
9478                 old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9479                                                  mode != XDP_MODE_SKB);
9480                 if (IS_ERR(old_prog)) {
9481                         err = PTR_ERR(old_prog);
9482                         old_prog = NULL;
9483                         goto err_out;
9484                 }
9485         }
9486
9487         err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9488
9489 err_out:
9490         if (err && new_prog)
9491                 bpf_prog_put(new_prog);
9492         if (old_prog)
9493                 bpf_prog_put(old_prog);
9494         return err;
9495 }
9496
9497 /**
9498  *      dev_new_index   -       allocate an ifindex
9499  *      @net: the applicable net namespace
9500  *
9501  *      Returns a suitable unique value for a new device interface
9502  *      number.  The caller must hold the rtnl semaphore or the
9503  *      dev_base_lock to be sure it remains unique.
9504  */
9505 static int dev_new_index(struct net *net)
9506 {
9507         int ifindex = net->ifindex;
9508
9509         for (;;) {
9510                 if (++ifindex <= 0)
9511                         ifindex = 1;
9512                 if (!__dev_get_by_index(net, ifindex))
9513                         return net->ifindex = ifindex;
9514         }
9515 }
9516
9517 /* Delayed registration/unregistration: net_set_todo() queues devices on net_todo_list */
9518 static LIST_HEAD(net_todo_list);
9519 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
9520
9521 static void net_set_todo(struct net_device *dev)
9522 {
9523         list_add_tail(&dev->todo_list, &net_todo_list);
9524         dev_net(dev)->dev_unreg_count++;
9525 }
9526
9527 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9528         struct net_device *upper, netdev_features_t features)
9529 {
9530         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9531         netdev_features_t feature;
9532         int feature_bit;
9533
9534         for_each_netdev_feature(upper_disables, feature_bit) {
9535                 feature = __NETIF_F_BIT(feature_bit);
9536                 if (!(upper->wanted_features & feature)
9537                     && (features & feature)) {
9538                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9539                                    &feature, upper->name);
9540                         features &= ~feature;
9541                 }
9542         }
9543
9544         return features;
9545 }
9546
9547 static void netdev_sync_lower_features(struct net_device *upper,
9548         struct net_device *lower, netdev_features_t features)
9549 {
9550         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9551         netdev_features_t feature;
9552         int feature_bit;
9553
9554         for_each_netdev_feature(upper_disables, feature_bit) {
9555                 feature = __NETIF_F_BIT(feature_bit);
9556                 if (!(features & feature) && (lower->features & feature)) {
9557                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9558                                    &feature, lower->name);
9559                         lower->wanted_features &= ~feature;
9560                         __netdev_update_features(lower);
9561
9562                         if (unlikely(lower->features & feature))
9563                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9564                                             &feature, lower->name);
9565                         else
9566                                 netdev_features_change(lower);
9567                 }
9568         }
9569 }
9570
9571 static netdev_features_t netdev_fix_features(struct net_device *dev,
9572         netdev_features_t features)
9573 {
9574         /* Fix illegal checksum combinations */
9575         if ((features & NETIF_F_HW_CSUM) &&
9576             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9577                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9578                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9579         }
9580
9581         /* TSO requires that SG is present as well. */
9582         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9583                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9584                 features &= ~NETIF_F_ALL_TSO;
9585         }
9586
9587         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9588                                         !(features & NETIF_F_IP_CSUM)) {
9589                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9590                 features &= ~NETIF_F_TSO;
9591                 features &= ~NETIF_F_TSO_ECN;
9592         }
9593
9594         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9595                                          !(features & NETIF_F_IPV6_CSUM)) {
9596                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9597                 features &= ~NETIF_F_TSO6;
9598         }
9599
9600         /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9601         if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9602                 features &= ~NETIF_F_TSO_MANGLEID;
9603
9604         /* TSO ECN requires that TSO is present as well. */
9605         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9606                 features &= ~NETIF_F_TSO_ECN;
9607
9608         /* Software GSO depends on SG. */
9609         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9610                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9611                 features &= ~NETIF_F_GSO;
9612         }
9613
9614         /* GSO partial features require GSO partial be set */
9615         if ((features & dev->gso_partial_features) &&
9616             !(features & NETIF_F_GSO_PARTIAL)) {
9617                 netdev_dbg(dev,
9618                            "Dropping partially supported GSO features since no GSO partial.\n");
9619                 features &= ~dev->gso_partial_features;
9620         }
9621
9622         if (!(features & NETIF_F_RXCSUM)) {
9623                 /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9624                  * successfully merged by hardware must also have the
9625                  * checksum verified by hardware.  If the user does not
9626                  * want to enable RXCSUM, logically, we should disable GRO_HW.
9627                  */
9628                 if (features & NETIF_F_GRO_HW) {
9629                         netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9630                         features &= ~NETIF_F_GRO_HW;
9631                 }
9632         }
9633
9634         /* LRO/HW-GRO features cannot be combined with RX-FCS */
9635         if (features & NETIF_F_RXFCS) {
9636                 if (features & NETIF_F_LRO) {
9637                         netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9638                         features &= ~NETIF_F_LRO;
9639                 }
9640
9641                 if (features & NETIF_F_GRO_HW) {
9642                         netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9643                         features &= ~NETIF_F_GRO_HW;
9644                 }
9645         }
9646
9647         if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
9648                 netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
9649                 features &= ~NETIF_F_HW_TLS_RX;
9650         }
9651
9652         return features;
9653 }
9654
9655 int __netdev_update_features(struct net_device *dev)
9656 {
9657         struct net_device *upper, *lower;
9658         netdev_features_t features;
9659         struct list_head *iter;
9660         int err = -1;
9661
9662         ASSERT_RTNL();
9663
9664         features = netdev_get_wanted_features(dev);
9665
9666         if (dev->netdev_ops->ndo_fix_features)
9667                 features = dev->netdev_ops->ndo_fix_features(dev, features);
9668
9669         /* driver might be less strict about feature dependencies */
9670         features = netdev_fix_features(dev, features);
9671
9672         /* some features can't be enabled if they're off on an upper device */
9673         netdev_for_each_upper_dev_rcu(dev, upper, iter)
9674                 features = netdev_sync_upper_features(dev, upper, features);
9675
9676         if (dev->features == features)
9677                 goto sync_lower;
9678
9679         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9680                 &dev->features, &features);
9681
9682         if (dev->netdev_ops->ndo_set_features)
9683                 err = dev->netdev_ops->ndo_set_features(dev, features);
9684         else
9685                 err = 0;
9686
9687         if (unlikely(err < 0)) {
9688                 netdev_err(dev,
9689                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
9690                         err, &features, &dev->features);
9691                 /* return non-0 since some features might have changed and
9692                  * it's better to fire a spurious notification than miss it
9693                  */
9694                 return -1;
9695         }
9696
9697 sync_lower:
9698         /* some features must be disabled on lower devices when disabled
9699          * on an upper device (think: bonding master or bridge)
9700          */
9701         netdev_for_each_lower_dev(dev, lower, iter)
9702                 netdev_sync_lower_features(dev, lower, features);
9703
9704         if (!err) {
9705                 netdev_features_t diff = features ^ dev->features;
9706
9707                 if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9708                         /* udp_tunnel_{get,drop}_rx_info both need
9709                          * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9710                          * device, or they won't do anything.
9711                          * Thus we need to update dev->features
9712                          * *before* calling udp_tunnel_get_rx_info,
9713                          * but *after* calling udp_tunnel_drop_rx_info.
9714                          */
9715                         if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9716                                 dev->features = features;
9717                                 udp_tunnel_get_rx_info(dev);
9718                         } else {
9719                                 udp_tunnel_drop_rx_info(dev);
9720                         }
9721                 }
9722
9723                 if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9724                         if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9725                                 dev->features = features;
9726                                 err |= vlan_get_rx_ctag_filter_info(dev);
9727                         } else {
9728                                 vlan_drop_rx_ctag_filter_info(dev);
9729                         }
9730                 }
9731
9732                 if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9733                         if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9734                                 dev->features = features;
9735                                 err |= vlan_get_rx_stag_filter_info(dev);
9736                         } else {
9737                                 vlan_drop_rx_stag_filter_info(dev);
9738                         }
9739                 }
9740
9741                 dev->features = features;
9742         }
9743
9744         return err < 0 ? 0 : 1;
9745 }
9746
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Re-evaluate the dev->features set and notify listeners when
 *      __netdev_update_features() reports a change. Meant to be called
 *      after driver or hardware dependent conditions that influence the
 *      features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
        if (!__netdev_update_features(dev))
                return;

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
9761
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Re-evaluate the dev->features set and notify listeners
 *      unconditionally, even when nothing changed. Use this rather than
 *      netdev_update_features() when dev->vlan_features might have
 *      changed as well, so that the change is propagated to stacked
 *      VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* the result is irrelevant here: the notification is always sent */
        (void)__netdev_update_features(dev);

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
9778
9779 /**
9780  *      netif_stacked_transfer_operstate -      transfer operstate
9781  *      @rootdev: the root or lower level device to transfer state from
9782  *      @dev: the device to transfer operstate to
9783  *
9784  *      Transfer operational state from root to device. This is normally
9785  *      called when a stacking relationship exists between the root
9786  *      device and the device(a leaf device).
9787  */
9788 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
9789                                         struct net_device *dev)
9790 {
9791         if (rootdev->operstate == IF_OPER_DORMANT)
9792                 netif_dormant_on(dev);
9793         else
9794                 netif_dormant_off(dev);
9795
9796         if (rootdev->operstate == IF_OPER_TESTING)
9797                 netif_testing_on(dev);
9798         else
9799                 netif_testing_off(dev);
9800
9801         if (netif_carrier_ok(rootdev))
9802                 netif_carrier_on(dev);
9803         else
9804                 netif_carrier_off(dev);
9805 }
9806 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
9807
9808 static int netif_alloc_rx_queues(struct net_device *dev)
9809 {
9810         unsigned int i, count = dev->num_rx_queues;
9811         struct netdev_rx_queue *rx;
9812         size_t sz = count * sizeof(*rx);
9813         int err = 0;
9814
9815         BUG_ON(count < 1);
9816
9817         rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9818         if (!rx)
9819                 return -ENOMEM;
9820
9821         dev->_rx = rx;
9822
9823         for (i = 0; i < count; i++) {
9824                 rx[i].dev = dev;
9825
9826                 /* XDP RX-queue setup */
9827                 err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
9828                 if (err < 0)
9829                         goto err_rxq_info;
9830         }
9831         return 0;
9832
9833 err_rxq_info:
9834         /* Rollback successful reg's and free other resources */
9835         while (i--)
9836                 xdp_rxq_info_unreg(&rx[i].xdp_rxq);
9837         kvfree(dev->_rx);
9838         dev->_rx = NULL;
9839         return err;
9840 }
9841
9842 static void netif_free_rx_queues(struct net_device *dev)
9843 {
9844         unsigned int i, count = dev->num_rx_queues;
9845
9846         /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
9847         if (!dev->_rx)
9848                 return;
9849
9850         for (i = 0; i < count; i++)
9851                 xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
9852
9853         kvfree(dev->_rx);
9854 }
9855
9856 static void netdev_init_one_queue(struct net_device *dev,
9857                                   struct netdev_queue *queue, void *_unused)
9858 {
9859         /* Initialize queue lock */
9860         spin_lock_init(&queue->_xmit_lock);
9861         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
9862         queue->xmit_lock_owner = -1;
9863         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
9864         queue->dev = dev;
9865 #ifdef CONFIG_BQL
9866         dql_init(&queue->dql, HZ);
9867 #endif
9868 }
9869
9870 static void netif_free_tx_queues(struct net_device *dev)
9871 {
9872         kvfree(dev->_tx);
9873 }
9874
9875 static int netif_alloc_netdev_queues(struct net_device *dev)
9876 {
9877         unsigned int count = dev->num_tx_queues;
9878         struct netdev_queue *tx;
9879         size_t sz = count * sizeof(*tx);
9880
9881         if (count < 1 || count > 0xffff)
9882                 return -EINVAL;
9883
9884         tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9885         if (!tx)
9886                 return -ENOMEM;
9887
9888         dev->_tx = tx;
9889
9890         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
9891         spin_lock_init(&dev->tx_global_lock);
9892
9893         return 0;
9894 }
9895
9896 void netif_tx_stop_all_queues(struct net_device *dev)
9897 {
9898         unsigned int i;
9899
9900         for (i = 0; i < dev->num_tx_queues; i++) {
9901                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
9902
9903                 netif_tx_stop_queue(txq);
9904         }
9905 }
9906 EXPORT_SYMBOL(netif_tx_stop_all_queues);
9907
9908 /**
9909  *      register_netdevice      - register a network device
9910  *      @dev: device to register
9911  *
9912  *      Take a completed network device structure and add it to the kernel
9913  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
9914  *      chain. 0 is returned on success. A negative errno code is returned
9915  *      on a failure to set up the device, or if the name is a duplicate.
9916  *
9917  *      Callers must hold the rtnl semaphore. You may want
9918  *      register_netdev() instead of this.
9919  *
9920  *      BUGS:
9921  *      The locking appears insufficient to guarantee two parallel registers
9922  *      will not get the same name.
9923  */
9924
9925 int register_netdevice(struct net_device *dev)
9926 {
9927         int ret;
9928         struct net *net = dev_net(dev);
9929
9930         BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
9931                      NETDEV_FEATURE_COUNT);
9932         BUG_ON(dev_boot_phase);
9933         ASSERT_RTNL();
9934
9935         might_sleep();
9936
9937         /* When net_device's are persistent, this will be fatal. */
9938         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
9939         BUG_ON(!net);
9940
9941         ret = ethtool_check_ops(dev->ethtool_ops);
9942         if (ret)
9943                 return ret;
9944
9945         spin_lock_init(&dev->addr_list_lock);
9946         netdev_set_addr_lockdep_class(dev);
9947
9948         ret = dev_get_valid_name(net, dev, dev->name);
9949         if (ret < 0)
9950                 goto out;
9951
9952         ret = -ENOMEM;
9953         dev->name_node = netdev_name_node_head_alloc(dev);
9954         if (!dev->name_node)
9955                 goto out;
9956
9957         /* Init, if this function is available */
9958         if (dev->netdev_ops->ndo_init) {
9959                 ret = dev->netdev_ops->ndo_init(dev);
9960                 if (ret) {
9961                         if (ret > 0)
9962                                 ret = -EIO;
9963                         goto err_free_name;
9964                 }
9965         }
9966
9967         if (((dev->hw_features | dev->features) &
9968              NETIF_F_HW_VLAN_CTAG_FILTER) &&
9969             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
9970              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
9971                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
9972                 ret = -EINVAL;
9973                 goto err_uninit;
9974         }
9975
9976         ret = -EBUSY;
9977         if (!dev->ifindex)
9978                 dev->ifindex = dev_new_index(net);
9979         else if (__dev_get_by_index(net, dev->ifindex))
9980                 goto err_uninit;
9981
9982         /* Transfer changeable features to wanted_features and enable
9983          * software offloads (GSO and GRO).
9984          */
9985         dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
9986         dev->features |= NETIF_F_SOFT_FEATURES;
9987
9988         if (dev->netdev_ops->ndo_udp_tunnel_add) {
9989                 dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9990                 dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9991         }
9992
9993         dev->wanted_features = dev->features & dev->hw_features;
9994
9995         if (!(dev->flags & IFF_LOOPBACK))
9996                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
9997
9998         /* If IPv4 TCP segmentation offload is supported we should also
9999          * allow the device to enable segmenting the frame with the option
10000          * of ignoring a static IP ID value.  This doesn't enable the
10001          * feature itself but allows the user to enable it later.
10002          */
10003         if (dev->hw_features & NETIF_F_TSO)
10004                 dev->hw_features |= NETIF_F_TSO_MANGLEID;
10005         if (dev->vlan_features & NETIF_F_TSO)
10006                 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10007         if (dev->mpls_features & NETIF_F_TSO)
10008                 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10009         if (dev->hw_enc_features & NETIF_F_TSO)
10010                 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10011
10012         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10013          */
10014         dev->vlan_features |= NETIF_F_HIGHDMA;
10015
10016         /* Make NETIF_F_SG inheritable to tunnel devices.
10017          */
10018         dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10019
10020         /* Make NETIF_F_SG inheritable to MPLS.
10021          */
10022         dev->mpls_features |= NETIF_F_SG;
10023
10024         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10025         ret = notifier_to_errno(ret);
10026         if (ret)
10027                 goto err_uninit;
10028
10029         ret = netdev_register_kobject(dev);
10030         if (ret) {
10031                 dev->reg_state = NETREG_UNREGISTERED;
10032                 goto err_uninit;
10033         }
10034         dev->reg_state = NETREG_REGISTERED;
10035
10036         __netdev_update_features(dev);
10037
10038         /*
10039          *      Default initial state at registry is that the
10040          *      device is present.
10041          */
10042
10043         set_bit(__LINK_STATE_PRESENT, &dev->state);
10044
10045         linkwatch_init_dev(dev);
10046
10047         dev_init_scheduler(dev);
10048         dev_hold(dev);
10049         list_netdevice(dev);
10050         add_device_randomness(dev->dev_addr, dev->addr_len);
10051
10052         /* If the device has permanent device address, driver should
10053          * set dev_addr and also addr_assign_type should be set to
10054          * NET_ADDR_PERM (default value).
10055          */
10056         if (dev->addr_assign_type == NET_ADDR_PERM)
10057                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10058
10059         /* Notify protocols, that a new device appeared. */
10060         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10061         ret = notifier_to_errno(ret);
10062         if (ret) {
10063                 /* Expect explicit free_netdev() on failure */
10064                 dev->needs_free_netdev = false;
10065                 unregister_netdevice_queue(dev, NULL);
10066                 goto out;
10067         }
10068         /*
10069          *      Prevent userspace races by waiting until the network
10070          *      device is fully setup before sending notifications.
10071          */
10072         if (!dev->rtnl_link_ops ||
10073             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10074                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10075
10076 out:
10077         return ret;
10078
10079 err_uninit:
10080         if (dev->netdev_ops->ndo_uninit)
10081                 dev->netdev_ops->ndo_uninit(dev);
10082         if (dev->priv_destructor)
10083                 dev->priv_destructor(dev);
10084 err_free_name:
10085         netdev_name_node_free(dev->name_node);
10086         goto out;
10087 }
10088 EXPORT_SYMBOL(register_netdevice);
10089
10090 /**
10091  *      init_dummy_netdev       - init a dummy network device for NAPI
10092  *      @dev: device to init
10093  *
10094  *      This takes a network device structure and initialize the minimum
10095  *      amount of fields so it can be used to schedule NAPI polls without
10096  *      registering a full blown interface. This is to be used by drivers
10097  *      that need to tie several hardware interfaces to a single NAPI
10098  *      poll scheduler due to HW limitations.
10099  */
10100 int init_dummy_netdev(struct net_device *dev)
10101 {
10102         /* Clear everything. Note we don't initialize spinlocks
10103          * are they aren't supposed to be taken by any of the
10104          * NAPI code and this dummy netdev is supposed to be
10105          * only ever used for NAPI polls
10106          */
10107         memset(dev, 0, sizeof(struct net_device));
10108
10109         /* make sure we BUG if trying to hit standard
10110          * register/unregister code path
10111          */
10112         dev->reg_state = NETREG_DUMMY;
10113
10114         /* NAPI wants this */
10115         INIT_LIST_HEAD(&dev->napi_list);
10116
10117         /* a dummy interface is started by default */
10118         set_bit(__LINK_STATE_PRESENT, &dev->state);
10119         set_bit(__LINK_STATE_START, &dev->state);
10120
10121         /* napi_busy_loop stats accounting wants this */
10122         dev_net_set(dev, &init_net);
10123
10124         /* Note : We dont allocate pcpu_refcnt for dummy devices,
10125          * because users of this 'device' dont need to change
10126          * its refcount.
10127          */
10128
10129         return 0;
10130 }
10131 EXPORT_SYMBOL_GPL(init_dummy_netdev);
10132
10133
10134 /**
10135  *      register_netdev - register a network device
10136  *      @dev: device to register
10137  *
10138  *      Take a completed network device structure and add it to the kernel
10139  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10140  *      chain. 0 is returned on success. A negative errno code is returned
10141  *      on a failure to set up the device, or if the name is a duplicate.
10142  *
10143  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
10144  *      and expands the device name if you passed a format string to
10145  *      alloc_netdev.
10146  */
10147 int register_netdev(struct net_device *dev)
10148 {
10149         int err;
10150
10151         if (rtnl_lock_killable())
10152                 return -EINTR;
10153         err = register_netdevice(dev);
10154         rtnl_unlock();
10155         return err;
10156 }
10157 EXPORT_SYMBOL(register_netdev);
10158
10159 int netdev_refcnt_read(const struct net_device *dev)
10160 {
10161         int i, refcnt = 0;
10162
10163         for_each_possible_cpu(i)
10164                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10165         return refcnt;
10166 }
10167 EXPORT_SYMBOL(netdev_refcnt_read);
10168
10169 #define WAIT_REFS_MIN_MSECS 1
10170 #define WAIT_REFS_MAX_MSECS 250
10171 /**
10172  * netdev_wait_allrefs - wait until all references are gone.
10173  * @dev: target net_device
10174  *
10175  * This is called when unregistering network devices.
10176  *
10177  * Any protocol or device that holds a reference should register
10178  * for netdevice notification, and cleanup and put back the
10179  * reference if they receive an UNREGISTER event.
10180  * We can get stuck here if buggy protocols don't correctly
10181  * call dev_put.
10182  */
10183 static void netdev_wait_allrefs(struct net_device *dev)
10184 {
10185         unsigned long rebroadcast_time, warning_time;
10186         int wait = 0, refcnt;
10187
10188         linkwatch_forget_dev(dev);
10189
10190         rebroadcast_time = warning_time = jiffies;
10191         refcnt = netdev_refcnt_read(dev);
10192
10193         while (refcnt != 0) {
10194                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10195                         rtnl_lock();
10196
10197                         /* Rebroadcast unregister notification */
10198                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10199
10200                         __rtnl_unlock();
10201                         rcu_barrier();
10202                         rtnl_lock();
10203
10204                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10205                                      &dev->state)) {
10206                                 /* We must not have linkwatch events
10207                                  * pending on unregister. If this
10208                                  * happens, we simply run the queue
10209                                  * unscheduled, resulting in a noop
10210                                  * for this device.
10211                                  */
10212                                 linkwatch_run_queue();
10213                         }
10214
10215                         __rtnl_unlock();
10216
10217                         rebroadcast_time = jiffies;
10218                 }
10219
10220                 rcu_barrier();
10221
10222                 if (!wait) {
10223                         wait = WAIT_REFS_MIN_MSECS;
10224                 } else {
10225                         msleep(wait);
10226                         wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10227                 }
10228
10229                 refcnt = netdev_refcnt_read(dev);
10230
10231                 if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
10232                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10233                                  dev->name, refcnt);
10234                         warning_time = jiffies;
10235                 }
10236         }
10237 }
10238
10239 /* The sequence is:
10240  *
10241  *      rtnl_lock();
10242  *      ...
10243  *      register_netdevice(x1);
10244  *      register_netdevice(x2);
10245  *      ...
10246  *      unregister_netdevice(y1);
10247  *      unregister_netdevice(y2);
10248  *      ...
10249  *      rtnl_unlock();
10250  *      free_netdev(y1);
10251  *      free_netdev(y2);
10252  *
10253  * We are invoked by rtnl_unlock().
10254  * This allows us to deal with problems:
10255  * 1) We can delete sysfs objects which invoke hotplug
10256  *    without deadlocking with linkwatch via keventd.
10257  * 2) Since we run with the RTNL semaphore not held, we can sleep
10258  *    safely in order to wait for the netdev refcnt to drop to zero.
10259  *
10260  * We must not return until all unregister events added during
10261  * the interval the lock was held have been completed.
10262  */
10263 void netdev_run_todo(void)
10264 {
10265         struct list_head list;
10266 #ifdef CONFIG_LOCKDEP
10267         struct list_head unlink_list;
10268
10269         list_replace_init(&net_unlink_list, &unlink_list);
10270
10271         while (!list_empty(&unlink_list)) {
10272                 struct net_device *dev = list_first_entry(&unlink_list,
10273                                                           struct net_device,
10274                                                           unlink_list);
10275                 list_del_init(&dev->unlink_list);
10276                 dev->nested_level = dev->lower_level - 1;
10277         }
10278 #endif
10279
10280         /* Snapshot list, allow later requests */
10281         list_replace_init(&net_todo_list, &list);
10282
10283         __rtnl_unlock();
10284
10285
10286         /* Wait for rcu callbacks to finish before next phase */
10287         if (!list_empty(&list))
10288                 rcu_barrier();
10289
10290         while (!list_empty(&list)) {
10291                 struct net_device *dev
10292                         = list_first_entry(&list, struct net_device, todo_list);
10293                 list_del(&dev->todo_list);
10294
10295                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10296                         pr_err("network todo '%s' but state %d\n",
10297                                dev->name, dev->reg_state);
10298                         dump_stack();
10299                         continue;
10300                 }
10301
10302                 dev->reg_state = NETREG_UNREGISTERED;
10303
10304                 netdev_wait_allrefs(dev);
10305
10306                 /* paranoia */
10307                 BUG_ON(netdev_refcnt_read(dev));
10308                 BUG_ON(!list_empty(&dev->ptype_all));
10309                 BUG_ON(!list_empty(&dev->ptype_specific));
10310                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
10311                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10312
10313                 if (dev->priv_destructor)
10314                         dev->priv_destructor(dev);
10315                 if (dev->needs_free_netdev)
10316                         free_netdev(dev);
10317
10318                 /* Report a network device has been unregistered */
10319                 rtnl_lock();
10320                 dev_net(dev)->dev_unreg_count--;
10321                 __rtnl_unlock();
10322                 wake_up(&netdev_unregistering_wq);
10323
10324                 /* Free network device */
10325                 kobject_put(&dev->dev.kobj);
10326         }
10327 }
10328
10329 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10330  * all the same fields in the same order as net_device_stats, with only
10331  * the type differing, but rtnl_link_stats64 may have additional fields
10332  * at the end for newer counters.
10333  */
10334 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10335                              const struct net_device_stats *netdev_stats)
10336 {
10337         size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
10338         const atomic_long_t *src = (atomic_long_t *)netdev_stats;
10339         u64 *dst = (u64 *)stats64;
10340
10341         BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10342         for (i = 0; i < n; i++)
10343                 dst[i] = (unsigned long)atomic_long_read(&src[i]);
10344         /* zero out counters that only exist in rtnl_link_stats64 */
10345         memset((char *)stats64 + n * sizeof(u64), 0,
10346                sizeof(*stats64) - n * sizeof(u64));
10347 }
10348 EXPORT_SYMBOL(netdev_stats_to_stats64);
10349
10350 /**
10351  *      dev_get_stats   - get network device statistics
10352  *      @dev: device to get statistics from
10353  *      @storage: place to store stats
10354  *
10355  *      Get network statistics from device. Return @storage.
10356  *      The device driver may provide its own method by setting
10357  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10358  *      otherwise the internal statistics structure is used.
10359  */
10360 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10361                                         struct rtnl_link_stats64 *storage)
10362 {
10363         const struct net_device_ops *ops = dev->netdev_ops;
10364
10365         if (ops->ndo_get_stats64) {
10366                 memset(storage, 0, sizeof(*storage));
10367                 ops->ndo_get_stats64(dev, storage);
10368         } else if (ops->ndo_get_stats) {
10369                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10370         } else {
10371                 netdev_stats_to_stats64(storage, &dev->stats);
10372         }
10373         storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
10374         storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
10375         storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
10376         return storage;
10377 }
10378 EXPORT_SYMBOL(dev_get_stats);
10379
10380 /**
10381  *      dev_fetch_sw_netstats - get per-cpu network device statistics
10382  *      @s: place to store stats
10383  *      @netstats: per-cpu network stats to read from
10384  *
10385  *      Read per-cpu network statistics and populate the related fields in @s.
10386  */
10387 void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10388                            const struct pcpu_sw_netstats __percpu *netstats)
10389 {
10390         int cpu;
10391
10392         for_each_possible_cpu(cpu) {
10393                 const struct pcpu_sw_netstats *stats;
10394                 struct pcpu_sw_netstats tmp;
10395                 unsigned int start;
10396
10397                 stats = per_cpu_ptr(netstats, cpu);
10398                 do {
10399                         start = u64_stats_fetch_begin_irq(&stats->syncp);
10400                         tmp.rx_packets = stats->rx_packets;
10401                         tmp.rx_bytes   = stats->rx_bytes;
10402                         tmp.tx_packets = stats->tx_packets;
10403                         tmp.tx_bytes   = stats->tx_bytes;
10404                 } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
10405
10406                 s->rx_packets += tmp.rx_packets;
10407                 s->rx_bytes   += tmp.rx_bytes;
10408                 s->tx_packets += tmp.tx_packets;
10409                 s->tx_bytes   += tmp.tx_bytes;
10410         }
10411 }
10412 EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10413
/* Return @dev's ingress queue.  With CONFIG_NET_CLS_ACT a missing queue is
 * allocated, initialised with noop_qdisc and published on @dev; NULL is
 * returned if that allocation fails.  Without CONFIG_NET_CLS_ACT the
 * result of dev_ingress_queue() is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!ingress) {
                ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
                if (ingress) {
                        netdev_init_one_queue(dev, ingress, NULL);
                        RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
                        ingress->qdisc_sleeping = &noop_qdisc;
                        rcu_assign_pointer(dev->ingress_queue, ingress);
                }
        }
#endif
        return ingress;
}
10431
10432 static const struct ethtool_ops default_ethtool_ops;
10433
10434 void netdev_set_default_ethtool_ops(struct net_device *dev,
10435                                     const struct ethtool_ops *ops)
10436 {
10437         if (dev->ethtool_ops == &default_ethtool_ops)
10438                 dev->ethtool_ops = ops;
10439 }
10440 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10441
10442 void netdev_freemem(struct net_device *dev)
10443 {
10444         char *addr = (char *)dev - dev->padded;
10445
10446         kvfree(addr);
10447 }
10448
10449 /**
10450  * alloc_netdev_mqs - allocate network device
10451  * @sizeof_priv: size of private data to allocate space for
10452  * @name: device name format string
10453  * @name_assign_type: origin of device name
10454  * @setup: callback to initialize device
10455  * @txqs: the number of TX subqueues to allocate
10456  * @rxqs: the number of RX subqueues to allocate
10457  *
10458  * Allocates a struct net_device with private data area for driver use
10459  * and performs basic initialization.  Also allocates subqueue structs
10460  * for each queue on the device.
10461  */
10462 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10463                 unsigned char name_assign_type,
10464                 void (*setup)(struct net_device *),
10465                 unsigned int txqs, unsigned int rxqs)
10466 {
10467         struct net_device *dev;
10468         unsigned int alloc_size;
10469         struct net_device *p;
10470
10471         BUG_ON(strlen(name) >= sizeof(dev->name));
10472
10473         if (txqs < 1) {
10474                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10475                 return NULL;
10476         }
10477
10478         if (rxqs < 1) {
10479                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10480                 return NULL;
10481         }
10482
10483         alloc_size = sizeof(struct net_device);
10484         if (sizeof_priv) {
10485                 /* ensure 32-byte alignment of private area */
10486                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10487                 alloc_size += sizeof_priv;
10488         }
10489         /* ensure 32-byte alignment of whole construct */
10490         alloc_size += NETDEV_ALIGN - 1;
10491
10492         p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
10493         if (!p)
10494                 return NULL;
10495
10496         dev = PTR_ALIGN(p, NETDEV_ALIGN);
10497         dev->padded = (char *)dev - (char *)p;
10498
10499         dev->pcpu_refcnt = alloc_percpu(int);
10500         if (!dev->pcpu_refcnt)
10501                 goto free_dev;
10502
10503         if (dev_addr_init(dev))
10504                 goto free_pcpu;
10505
10506         dev_mc_init(dev);
10507         dev_uc_init(dev);
10508
10509         dev_net_set(dev, &init_net);
10510
10511         dev->gso_max_size = GSO_MAX_SIZE;
10512         dev->gso_max_segs = GSO_MAX_SEGS;
10513         dev->upper_level = 1;
10514         dev->lower_level = 1;
10515 #ifdef CONFIG_LOCKDEP
10516         dev->nested_level = 0;
10517         INIT_LIST_HEAD(&dev->unlink_list);
10518 #endif
10519
10520         INIT_LIST_HEAD(&dev->napi_list);
10521         INIT_LIST_HEAD(&dev->unreg_list);
10522         INIT_LIST_HEAD(&dev->close_list);
10523         INIT_LIST_HEAD(&dev->link_watch_list);
10524         INIT_LIST_HEAD(&dev->adj_list.upper);
10525         INIT_LIST_HEAD(&dev->adj_list.lower);
10526         INIT_LIST_HEAD(&dev->ptype_all);
10527         INIT_LIST_HEAD(&dev->ptype_specific);
10528         INIT_LIST_HEAD(&dev->net_notifier_list);
10529 #ifdef CONFIG_NET_SCHED
10530         hash_init(dev->qdisc_hash);
10531 #endif
10532         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10533         setup(dev);
10534
10535         if (!dev->tx_queue_len) {
10536                 dev->priv_flags |= IFF_NO_QUEUE;
10537                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10538         }
10539
10540         dev->num_tx_queues = txqs;
10541         dev->real_num_tx_queues = txqs;
10542         if (netif_alloc_netdev_queues(dev))
10543                 goto free_all;
10544
10545         dev->num_rx_queues = rxqs;
10546         dev->real_num_rx_queues = rxqs;
10547         if (netif_alloc_rx_queues(dev))
10548                 goto free_all;
10549
10550         strcpy(dev->name, name);
10551         dev->name_assign_type = name_assign_type;
10552         dev->group = INIT_NETDEV_GROUP;
10553         if (!dev->ethtool_ops)
10554                 dev->ethtool_ops = &default_ethtool_ops;
10555
10556         nf_hook_ingress_init(dev);
10557
10558         return dev;
10559
10560 free_all:
10561         free_netdev(dev);
10562         return NULL;
10563
10564 free_pcpu:
10565         free_percpu(dev->pcpu_refcnt);
10566 free_dev:
10567         netdev_freemem(dev);
10568         return NULL;
10569 }
10570 EXPORT_SYMBOL(alloc_netdev_mqs);
10571
10572 /**
10573  * free_netdev - free network device
10574  * @dev: device
10575  *
10576  * This function does the last stage of destroying an allocated device
10577  * interface. The reference to the device object is released. If this
10578  * is the last reference then it will be freed.Must be called in process
10579  * context.
10580  */
10581 void free_netdev(struct net_device *dev)
10582 {
10583         struct napi_struct *p, *n;
10584
10585         might_sleep();
10586
10587         /* When called immediately after register_netdevice() failed the unwind
10588          * handling may still be dismantling the device. Handle that case by
10589          * deferring the free.
10590          */
10591         if (dev->reg_state == NETREG_UNREGISTERING) {
10592                 ASSERT_RTNL();
10593                 dev->needs_free_netdev = true;
10594                 return;
10595         }
10596
10597         netif_free_tx_queues(dev);
10598         netif_free_rx_queues(dev);
10599
10600         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10601
10602         /* Flush device addresses */
10603         dev_addr_flush(dev);
10604
10605         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10606                 netif_napi_del(p);
10607
10608         free_percpu(dev->pcpu_refcnt);
10609         dev->pcpu_refcnt = NULL;
10610         free_percpu(dev->xdp_bulkq);
10611         dev->xdp_bulkq = NULL;
10612
10613         /*  Compatibility with error handling in drivers */
10614         if (dev->reg_state == NETREG_UNINITIALIZED) {
10615                 netdev_freemem(dev);
10616                 return;
10617         }
10618
10619         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10620         dev->reg_state = NETREG_RELEASED;
10621
10622         /* will free via device release */
10623         put_device(&dev->dev);
10624 }
10625 EXPORT_SYMBOL(free_netdev);
10626
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Wait until packets that are currently being received are done.
 *      Packets that start later are not held back.  The expedited RCU
 *      variant is used when the RTNL lock is held.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!rtnl_is_locked()) {
                synchronize_rcu();
                return;
        }

        synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
10642
10643 /**
10644  *      unregister_netdevice_queue - remove device from the kernel
10645  *      @dev: device
10646  *      @head: list
10647  *
10648  *      This function shuts down a device interface and removes it
10649  *      from the kernel tables.
10650  *      If head not NULL, device is queued to be unregistered later.
10651  *
10652  *      Callers must hold the rtnl semaphore.  You may want
10653  *      unregister_netdev() instead of this.
10654  */
10655
10656 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10657 {
10658         ASSERT_RTNL();
10659
10660         if (head) {
10661                 list_move_tail(&dev->unreg_list, head);
10662         } else {
10663                 LIST_HEAD(single);
10664
10665                 list_add(&dev->unreg_list, &single);
10666                 unregister_netdevice_many(&single);
10667         }
10668 }
10669 EXPORT_SYMBOL(unregister_netdevice_queue);
10670
10671 /**
10672  *      unregister_netdevice_many - unregister many devices
10673  *      @head: list of devices
10674  *
10675  *  Note: As most callers use a stack allocated list_head,
10676  *  we force a list_del() to make sure stack wont be corrupted later.
10677  */
10678 void unregister_netdevice_many(struct list_head *head)
10679 {
10680         struct net_device *dev, *tmp;
10681         LIST_HEAD(close_head);
10682
10683         BUG_ON(dev_boot_phase);
10684         ASSERT_RTNL();
10685
10686         if (list_empty(head))
10687                 return;
10688
10689         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
10690                 /* Some devices call without registering
10691                  * for initialization unwind. Remove those
10692                  * devices and proceed with the remaining.
10693                  */
10694                 if (dev->reg_state == NETREG_UNINITIALIZED) {
10695                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
10696                                  dev->name, dev);
10697
10698                         WARN_ON(1);
10699                         list_del(&dev->unreg_list);
10700                         continue;
10701                 }
10702                 dev->dismantle = true;
10703                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
10704         }
10705
10706         /* If device is running, close it first. */
10707         list_for_each_entry(dev, head, unreg_list)
10708                 list_add_tail(&dev->close_list, &close_head);
10709         dev_close_many(&close_head, true);
10710
10711         list_for_each_entry(dev, head, unreg_list) {
10712                 /* And unlink it from device chain. */
10713                 unlist_netdevice(dev);
10714
10715                 dev->reg_state = NETREG_UNREGISTERING;
10716         }
10717         flush_all_backlogs();
10718
10719         synchronize_net();
10720
10721         list_for_each_entry(dev, head, unreg_list) {
10722                 struct sk_buff *skb = NULL;
10723
10724                 /* Shutdown queueing discipline. */
10725                 dev_shutdown(dev);
10726
10727                 dev_xdp_uninstall(dev);
10728
10729                 /* Notify protocols, that we are about to destroy
10730                  * this device. They should clean all the things.
10731                  */
10732                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10733
10734                 if (!dev->rtnl_link_ops ||
10735                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10736                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
10737                                                      GFP_KERNEL, NULL, 0);
10738
10739                 /*
10740                  *      Flush the unicast and multicast chains
10741                  */
10742                 dev_uc_flush(dev);
10743                 dev_mc_flush(dev);
10744
10745                 netdev_name_node_alt_flush(dev);
10746                 netdev_name_node_free(dev->name_node);
10747
10748                 if (dev->netdev_ops->ndo_uninit)
10749                         dev->netdev_ops->ndo_uninit(dev);
10750
10751                 if (skb)
10752                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
10753
10754                 /* Notifier chain MUST detach us all upper devices. */
10755                 WARN_ON(netdev_has_any_upper_dev(dev));
10756                 WARN_ON(netdev_has_any_lower_dev(dev));
10757
10758                 /* Remove entries from kobject tree */
10759                 netdev_unregister_kobject(dev);
10760 #ifdef CONFIG_XPS
10761                 /* Remove XPS queueing entries */
10762                 netif_reset_xps_queues_gt(dev, 0);
10763 #endif
10764         }
10765
10766         synchronize_net();
10767
10768         list_for_each_entry(dev, head, unreg_list) {
10769                 dev_put(dev);
10770                 net_set_todo(dev);
10771         }
10772
10773         list_del(head);
10774 }
10775 EXPORT_SYMBOL(unregister_netdevice_many);
10776
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      Shut down a device interface and remove it from the kernel tables.
 *
 *      This simply wraps unregister_netdevice() with the rtnl semaphore
 *      taken around the call.  In general this is the function to use,
 *      rather than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
        /* unregister_netdevice() must run under RTNL */
        rtnl_lock();

        unregister_netdevice(dev);

        rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
10795
10796 /**
10797  *      dev_change_net_namespace - move device to different nethost namespace
10798  *      @dev: device
10799  *      @net: network namespace
10800  *      @pat: If not NULL name pattern to try if the current device name
10801  *            is already taken in the destination network namespace.
10802  *
10803  *      This function shuts down a device interface and moves it
10804  *      to a new network namespace. On success 0 is returned, on
10805  *      a failure a netagive errno code is returned.
10806  *
10807  *      Callers must hold the rtnl semaphore.
10808  */
10809
10810 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
10811 {
10812         struct net *net_old = dev_net(dev);
10813         int err, new_nsid, new_ifindex;
10814
10815         ASSERT_RTNL();
10816
10817         /* Don't allow namespace local devices to be moved. */
10818         err = -EINVAL;
10819         if (dev->features & NETIF_F_NETNS_LOCAL)
10820                 goto out;
10821
10822         /* Ensure the device has been registrered */
10823         if (dev->reg_state != NETREG_REGISTERED)
10824                 goto out;
10825
10826         /* Get out if there is nothing todo */
10827         err = 0;
10828         if (net_eq(net_old, net))
10829                 goto out;
10830
10831         /* Pick the destination device name, and ensure
10832          * we can use it in the destination network namespace.
10833          */
10834         err = -EEXIST;
10835         if (__dev_get_by_name(net, dev->name)) {
10836                 /* We get here if we can't use the current device name */
10837                 if (!pat)
10838                         goto out;
10839                 err = dev_get_valid_name(net, dev, pat);
10840                 if (err < 0)
10841                         goto out;
10842         }
10843
10844         /*
10845          * And now a mini version of register_netdevice unregister_netdevice.
10846          */
10847
10848         /* If device is running close it first. */
10849         dev_close(dev);
10850
10851         /* And unlink it from device chain */
10852         unlist_netdevice(dev);
10853
10854         synchronize_net();
10855
10856         /* Shutdown queueing discipline. */
10857         dev_shutdown(dev);
10858
10859         /* Notify protocols, that we are about to destroy
10860          * this device. They should clean all the things.
10861          *
10862          * Note that dev->reg_state stays at NETREG_REGISTERED.
10863          * This is wanted because this way 8021q and macvlan know
10864          * the device is just moving and can keep their slaves up.
10865          */
10866         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10867         rcu_barrier();
10868
10869         new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10870         /* If there is an ifindex conflict assign a new one */
10871         if (__dev_get_by_index(net, dev->ifindex))
10872                 new_ifindex = dev_new_index(net);
10873         else
10874                 new_ifindex = dev->ifindex;
10875
10876         rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
10877                             new_ifindex);
10878
10879         /*
10880          *      Flush the unicast and multicast chains
10881          */
10882         dev_uc_flush(dev);
10883         dev_mc_flush(dev);
10884
10885         /* Send a netdev-removed uevent to the old namespace */
10886         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
10887         netdev_adjacent_del_links(dev);
10888
10889         /* Move per-net netdevice notifiers that are following the netdevice */
10890         move_netdevice_notifiers_dev_net(dev, net);
10891
10892         /* Actually switch the network namespace */
10893         dev_net_set(dev, net);
10894         dev->ifindex = new_ifindex;
10895
10896         /* Send a netdev-add uevent to the new namespace */
10897         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
10898         netdev_adjacent_add_links(dev);
10899
10900         /* Fixup kobjects */
10901         err = device_rename(&dev->dev, dev->name);
10902         WARN_ON(err);
10903
10904         /* Adapt owner in case owning user namespace of target network
10905          * namespace is different from the original one.
10906          */
10907         err = netdev_change_owner(dev, net_old, net);
10908         WARN_ON(err);
10909
10910         /* Add the device back in the hashes */
10911         list_netdevice(dev);
10912
10913         /* Notify protocols, that a new device appeared. */
10914         call_netdevice_notifiers(NETDEV_REGISTER, dev);
10915
10916         /*
10917          *      Prevent userspace races by waiting until the network
10918          *      device is fully setup before sending notifications.
10919          */
10920         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10921
10922         synchronize_net();
10923         err = 0;
10924 out:
10925         return err;
10926 }
10927 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
10928
10929 static int dev_cpu_dead(unsigned int oldcpu)
10930 {
10931         struct sk_buff **list_skb;
10932         struct sk_buff *skb;
10933         unsigned int cpu;
10934         struct softnet_data *sd, *oldsd, *remsd = NULL;
10935
10936         local_irq_disable();
10937         cpu = smp_processor_id();
10938         sd = &per_cpu(softnet_data, cpu);
10939         oldsd = &per_cpu(softnet_data, oldcpu);
10940
10941         /* Find end of our completion_queue. */
10942         list_skb = &sd->completion_queue;
10943         while (*list_skb)
10944                 list_skb = &(*list_skb)->next;
10945         /* Append completion queue from offline CPU. */
10946         *list_skb = oldsd->completion_queue;
10947         oldsd->completion_queue = NULL;
10948
10949         /* Append output queue from offline CPU. */
10950         if (oldsd->output_queue) {
10951                 *sd->output_queue_tailp = oldsd->output_queue;
10952                 sd->output_queue_tailp = oldsd->output_queue_tailp;
10953                 oldsd->output_queue = NULL;
10954                 oldsd->output_queue_tailp = &oldsd->output_queue;
10955         }
10956         /* Append NAPI poll list from offline CPU, with one exception :
10957          * process_backlog() must be called by cpu owning percpu backlog.
10958          * We properly handle process_queue & input_pkt_queue later.
10959          */
10960         while (!list_empty(&oldsd->poll_list)) {
10961                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
10962                                                             struct napi_struct,
10963                                                             poll_list);
10964
10965                 list_del_init(&napi->poll_list);
10966                 if (napi->poll == process_backlog)
10967                         napi->state = 0;
10968                 else
10969                         ____napi_schedule(sd, napi);
10970         }
10971
10972         raise_softirq_irqoff(NET_TX_SOFTIRQ);
10973         local_irq_enable();
10974
10975 #ifdef CONFIG_RPS
10976         remsd = oldsd->rps_ipi_list;
10977         oldsd->rps_ipi_list = NULL;
10978 #endif
10979         /* send out pending IPI's on offline CPU */
10980         net_rps_send_ipi(remsd);
10981
10982         /* Process offline CPU's input_pkt_queue */
10983         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
10984                 netif_rx_ni(skb);
10985                 input_queue_head_incr(oldsd);
10986         }
10987         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
10988                 netif_rx_ni(skb);
10989                 input_queue_head_incr(oldsd);
10990         }
10991
10992         return 0;
10993 }
10994
10995 /**
10996  *      netdev_increment_features - increment feature set by one
10997  *      @all: current feature set
10998  *      @one: new feature set
10999  *      @mask: mask feature set
11000  *
11001  *      Computes a new feature set after adding a device with feature set
11002  *      @one to the master device with current feature set @all.  Will not
11003  *      enable anything that is off in @mask. Returns the new feature set.
11004  */
11005 netdev_features_t netdev_increment_features(netdev_features_t all,
11006         netdev_features_t one, netdev_features_t mask)
11007 {
11008         if (mask & NETIF_F_HW_CSUM)
11009                 mask |= NETIF_F_CSUM_MASK;
11010         mask |= NETIF_F_VLAN_CHALLENGED;
11011
11012         all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11013         all &= one | ~NETIF_F_ALL_FOR_ALL;
11014
11015         /* If one device supports hw checksumming, set for all. */
11016         if (all & NETIF_F_HW_CSUM)
11017                 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11018
11019         return all;
11020 }
11021 EXPORT_SYMBOL(netdev_increment_features);
11022
11023 static struct hlist_head * __net_init netdev_create_hash(void)
11024 {
11025         int i;
11026         struct hlist_head *hash;
11027
11028         hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11029         if (hash != NULL)
11030                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
11031                         INIT_HLIST_HEAD(&hash[i]);
11032
11033         return hash;
11034 }
11035
11036 /* Initialize per network namespace state */
11037 static int __net_init netdev_init(struct net *net)
11038 {
11039         BUILD_BUG_ON(GRO_HASH_BUCKETS >
11040                      8 * sizeof_field(struct napi_struct, gro_bitmask));
11041
11042         if (net != &init_net)
11043                 INIT_LIST_HEAD(&net->dev_base_head);
11044
11045         net->dev_name_head = netdev_create_hash();
11046         if (net->dev_name_head == NULL)
11047                 goto err_name;
11048
11049         net->dev_index_head = netdev_create_hash();
11050         if (net->dev_index_head == NULL)
11051                 goto err_idx;
11052
11053         RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11054
11055         return 0;
11056
11057 err_idx:
11058         kfree(net->dev_name_head);
11059 err_name:
11060         return -ENOMEM;
11061 }
11062
11063 /**
11064  *      netdev_drivername - network driver for the device
11065  *      @dev: network device
11066  *
11067  *      Determine network driver for device.
11068  */
11069 const char *netdev_drivername(const struct net_device *dev)
11070 {
11071         const struct device_driver *driver;
11072         const struct device *parent;
11073         const char *empty = "";
11074
11075         parent = dev->dev.parent;
11076         if (!parent)
11077                 return empty;
11078
11079         driver = parent->driver;
11080         if (driver && driver->name)
11081                 return driver->name;
11082         return empty;
11083 }
11084
11085 static void __netdev_printk(const char *level, const struct net_device *dev,
11086                             struct va_format *vaf)
11087 {
11088         if (dev && dev->dev.parent) {
11089                 dev_printk_emit(level[1] - '0',
11090                                 dev->dev.parent,
11091                                 "%s %s %s%s: %pV",
11092                                 dev_driver_string(dev->dev.parent),
11093                                 dev_name(dev->dev.parent),
11094                                 netdev_name(dev), netdev_reg_state(dev),
11095                                 vaf);
11096         } else if (dev) {
11097                 printk("%s%s%s: %pV",
11098                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
11099         } else {
11100                 printk("%s(NULL net_device): %pV", level, vaf);
11101         }
11102 }
11103
11104 void netdev_printk(const char *level, const struct net_device *dev,
11105                    const char *format, ...)
11106 {
11107         struct va_format vaf;
11108         va_list args;
11109
11110         va_start(args, format);
11111
11112         vaf.fmt = format;
11113         vaf.va = &args;
11114
11115         __netdev_printk(level, dev, &vaf);
11116
11117         va_end(args);
11118 }
11119 EXPORT_SYMBOL(netdev_printk);
11120
11121 #define define_netdev_printk_level(func, level)                 \
11122 void func(const struct net_device *dev, const char *fmt, ...)   \
11123 {                                                               \
11124         struct va_format vaf;                                   \
11125         va_list args;                                           \
11126                                                                 \
11127         va_start(args, fmt);                                    \
11128                                                                 \
11129         vaf.fmt = fmt;                                          \
11130         vaf.va = &args;                                         \
11131                                                                 \
11132         __netdev_printk(level, dev, &vaf);                      \
11133                                                                 \
11134         va_end(args);                                           \
11135 }                                                               \
11136 EXPORT_SYMBOL(func);
11137
11138 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11139 define_netdev_printk_level(netdev_alert, KERN_ALERT);
11140 define_netdev_printk_level(netdev_crit, KERN_CRIT);
11141 define_netdev_printk_level(netdev_err, KERN_ERR);
11142 define_netdev_printk_level(netdev_warn, KERN_WARNING);
11143 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11144 define_netdev_printk_level(netdev_info, KERN_INFO);
11145
11146 static void __net_exit netdev_exit(struct net *net)
11147 {
11148         kfree(net->dev_name_head);
11149         kfree(net->dev_index_head);
11150         if (net != &init_net)
11151                 WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11152 }
11153
11154 static struct pernet_operations __net_initdata netdev_net_ops = {
11155         .init = netdev_init,
11156         .exit = netdev_exit,
11157 };
11158
11159 static void __net_exit default_device_exit(struct net *net)
11160 {
11161         struct net_device *dev, *aux;
11162         /*
11163          * Push all migratable network devices back to the
11164          * initial network namespace
11165          */
11166         rtnl_lock();
11167         for_each_netdev_safe(net, dev, aux) {
11168                 int err;
11169                 char fb_name[IFNAMSIZ];
11170
11171                 /* Ignore unmoveable devices (i.e. loopback) */
11172                 if (dev->features & NETIF_F_NETNS_LOCAL)
11173                         continue;
11174
11175                 /* Leave virtual devices for the generic cleanup */
11176                 if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11177                         continue;
11178
11179                 /* Push remaining network devices to init_net */
11180                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11181                 if (__dev_get_by_name(&init_net, fb_name))
11182                         snprintf(fb_name, IFNAMSIZ, "dev%%d");
11183                 err = dev_change_net_namespace(dev, &init_net, fb_name);
11184                 if (err) {
11185                         pr_emerg("%s: failed to move %s to init_net: %d\n",
11186                                  __func__, dev->name, err);
11187                         BUG();
11188                 }
11189         }
11190         rtnl_unlock();
11191 }
11192
11193 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
11194 {
11195         /* Return with the rtnl_lock held when there are no network
11196          * devices unregistering in any network namespace in net_list.
11197          */
11198         struct net *net;
11199         bool unregistering;
11200         DEFINE_WAIT_FUNC(wait, woken_wake_function);
11201
11202         add_wait_queue(&netdev_unregistering_wq, &wait);
11203         for (;;) {
11204                 unregistering = false;
11205                 rtnl_lock();
11206                 list_for_each_entry(net, net_list, exit_list) {
11207                         if (net->dev_unreg_count > 0) {
11208                                 unregistering = true;
11209                                 break;
11210                         }
11211                 }
11212                 if (!unregistering)
11213                         break;
11214                 __rtnl_unlock();
11215
11216                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
11217         }
11218         remove_wait_queue(&netdev_unregistering_wq, &wait);
11219 }
11220
11221 static void __net_exit default_device_exit_batch(struct list_head *net_list)
11222 {
11223         /* At exit all network devices most be removed from a network
11224          * namespace.  Do this in the reverse order of registration.
11225          * Do this across as many network namespaces as possible to
11226          * improve batching efficiency.
11227          */
11228         struct net_device *dev;
11229         struct net *net;
11230         LIST_HEAD(dev_kill_list);
11231
11232         /* To prevent network device cleanup code from dereferencing
11233          * loopback devices or network devices that have been freed
11234          * wait here for all pending unregistrations to complete,
11235          * before unregistring the loopback device and allowing the
11236          * network namespace be freed.
11237          *
11238          * The netdev todo list containing all network devices
11239          * unregistrations that happen in default_device_exit_batch
11240          * will run in the rtnl_unlock() at the end of
11241          * default_device_exit_batch.
11242          */
11243         rtnl_lock_unregistering(net_list);
11244         list_for_each_entry(net, net_list, exit_list) {
11245                 for_each_netdev_reverse(net, dev) {
11246                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11247                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11248                         else
11249                                 unregister_netdevice_queue(dev, &dev_kill_list);
11250                 }
11251         }
11252         unregister_netdevice_many(&dev_kill_list);
11253         rtnl_unlock();
11254 }
11255
11256 static struct pernet_operations __net_initdata default_device_ops = {
11257         .exit = default_device_exit,
11258         .exit_batch = default_device_exit_batch,
11259 };
11260
11261 /*
11262  *      Initialize the DEV module. At boot time this walks the device list and
11263  *      unhooks any devices that fail to initialise (normally hardware not
11264  *      present) and leaves us with a valid list of present and active devices.
11265  *
11266  */
11267
11268 /*
11269  *       This is called single threaded during boot, so no need
11270  *       to take the rtnl semaphore.
11271  */
11272 static int __init net_dev_init(void)
11273 {
11274         int i, rc = -ENOMEM;
11275
11276         BUG_ON(!dev_boot_phase);
11277
11278         if (dev_proc_init())
11279                 goto out;
11280
11281         if (netdev_kobject_init())
11282                 goto out;
11283
11284         INIT_LIST_HEAD(&ptype_all);
11285         for (i = 0; i < PTYPE_HASH_SIZE; i++)
11286                 INIT_LIST_HEAD(&ptype_base[i]);
11287
11288         INIT_LIST_HEAD(&offload_base);
11289
11290         if (register_pernet_subsys(&netdev_net_ops))
11291                 goto out;
11292
11293         /*
11294          *      Initialise the packet receive queues.
11295          */
11296
11297         for_each_possible_cpu(i) {
11298                 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11299                 struct softnet_data *sd = &per_cpu(softnet_data, i);
11300
11301                 INIT_WORK(flush, flush_backlog);
11302
11303                 skb_queue_head_init(&sd->input_pkt_queue);
11304                 skb_queue_head_init(&sd->process_queue);
11305 #ifdef CONFIG_XFRM_OFFLOAD
11306                 skb_queue_head_init(&sd->xfrm_backlog);
11307 #endif
11308                 INIT_LIST_HEAD(&sd->poll_list);
11309                 sd->output_queue_tailp = &sd->output_queue;
11310 #ifdef CONFIG_RPS
11311                 sd->csd.func = rps_trigger_softirq;
11312                 sd->csd.info = sd;
11313                 sd->cpu = i;
11314 #endif
11315
11316                 init_gro_hash(&sd->backlog);
11317                 sd->backlog.poll = process_backlog;
11318                 sd->backlog.weight = weight_p;
11319         }
11320
11321         dev_boot_phase = 0;
11322
11323         /* The loopback device is special if any other network devices
11324          * is present in a network namespace the loopback device must
11325          * be present. Since we now dynamically allocate and free the
11326          * loopback device ensure this invariant is maintained by
11327          * keeping the loopback device as the first device on the
11328          * list of network devices.  Ensuring the loopback devices
11329          * is the first device that appears and the last network device
11330          * that disappears.
11331          */
11332         if (register_pernet_device(&loopback_net_ops))
11333                 goto out;
11334
11335         if (register_pernet_device(&default_device_ops))
11336                 goto out;
11337
11338         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11339         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11340
11341         rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11342                                        NULL, dev_cpu_dead);
11343         WARN_ON(rc < 0);
11344         rc = 0;
11345 out:
11346         return rc;
11347 }
11348
11349 subsys_initcall(net_dev_init);