net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <asm/unaligned.h>
  95 #include <linux/capability.h>
  96 #include <linux/errno.h>
  97 #include <linux/errqueue.h>
  98 #include <linux/types.h>
  99 #include <linux/socket.h>
 100 #include <linux/in.h>
 101 #include <linux/kernel.h>
 102 #include <linux/module.h>
 103 #include <linux/proc_fs.h>
 104 #include <linux/seq_file.h>
 105 #include <linux/sched.h>
 106 #include <linux/sched/mm.h>
 107 #include <linux/timer.h>
 108 #include <linux/string.h>
 109 #include <linux/sockios.h>
 110 #include <linux/net.h>
 111 #include <linux/mm.h>
 112 #include <linux/slab.h>
 113 #include <linux/interrupt.h>
 114 #include <linux/poll.h>
 115 #include <linux/tcp.h>
 116 #include <linux/init.h>
 117 #include <linux/highmem.h>
 118 #include <linux/user_namespace.h>
 119 #include <linux/static_key.h>
 120 #include <linux/memcontrol.h>
 121 #include <linux/prefetch.h>
 122
 123 #include <linux/uaccess.h>
 124
 125 #include <linux/netdevice.h>
 126 #include <net/protocol.h>
 127 #include <linux/skbuff.h>
 128 #include <net/net_namespace.h>
 129 #include <net/request_sock.h>
 130 #include <net/sock.h>
 131 #include <linux/net_tstamp.h>
 132 #include <net/xfrm.h>
 133 #include <linux/ipsec.h>
 134 #include <net/cls_cgroup.h>
 135 #include <net/netprio_cgroup.h>
 136 #include <linux/sock_diag.h>
 137
 138 #include <linux/filter.h>
 139 #include <net/sock_reuseport.h>
 140
 141 #include <trace/events/sock.h>
 142
 143 #include <net/tcp.h>
 144 #include <net/busy_poll.h>
 145
 146 static DEFINE_MUTEX(proto_list_mutex);
 147 static LIST_HEAD(proto_list);
 148
 149 static void sock_inuse_add(struct net *net, int val);
 150
 151 /**
 152  * sk_ns_capable - General socket capability test
 153  * @sk: Socket to use a capability on or through
 154  * @user_ns: The user namespace of the capability to use
 155  * @cap: The capability to use
 156  *
 157  * Test to see if the opener of the socket had when the socket was
 158  * created and the current process has the capability @cap in the user
 159  * namespace @user_ns.
 160  */
 161 bool sk_ns_capable(const struct sock *sk,
 162                    struct user_namespace *user_ns, int cap)
 163 {
 164         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 165                 ns_capable(user_ns, cap);
 166 }
 167 EXPORT_SYMBOL(sk_ns_capable);
 168
 169 /**
 170  * sk_capable - Socket global capability test
 171  * @sk: Socket to use a capability on or through
 172  * @cap: The global capability to use
 173  *
 174  * Test to see if the opener of the socket had when the socket was
 175  * created and the current process has the capability @cap in all user
 176  * namespaces.
 177  */
 178 bool sk_capable(const struct sock *sk, int cap)
 179 {
 180         return sk_ns_capable(sk, &init_user_ns, cap);
 181 }
 182 EXPORT_SYMBOL(sk_capable);
 183
 184 /**
 185  * sk_net_capable - Network namespace socket capability test
 186  * @sk: Socket to use a capability on or through
 187  * @cap: The capability to use
 188  *
 189  * Test to see if the opener of the socket had when the socket was created
 190  * and the current process has the capability @cap over the network namespace
 191  * the socket is a member of.
 192  */
 193 bool sk_net_capable(const struct sock *sk, int cap)
 194 {
 195         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 196 }
 197 EXPORT_SYMBOL(sk_net_capable);
 198
 199 /*
 200  * Each address family might have different locking rules, so we have
 201  * one slock key per address family and separate keys for internal and
 202  * userspace sockets.
 203  */
 204 static struct lock_class_key af_family_keys[AF_MAX];
 205 static struct lock_class_key af_family_kern_keys[AF_MAX];
 206 static struct lock_class_key af_family_slock_keys[AF_MAX];
 207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 208
 209 /*
 210  * Make lock validator output more readable. (we pre-construct these
 211  * strings build-time, so that runtime initialization of socket
 212  * locks is fast):
 213  */
 214
 215 #define _sock_locks(x)                                            \
 216   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 217   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 218   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 219   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 220   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 221   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 222   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 223   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 224   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 225   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 226   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 227   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 228   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 229   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 230   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 231   x "AF_MAX"
 232
 233 static const char *const af_family_key_strings[AF_MAX+1] = {
 234         _sock_locks("sk_lock-")
 235 };
 236 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 237         _sock_locks("slock-")
 238 };
 239 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 240         _sock_locks("clock-")
 241 };
 242
 243 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 244         _sock_locks("k-sk_lock-")
 245 };
 246 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 247         _sock_locks("k-slock-")
 248 };
 249 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 250         _sock_locks("k-clock-")
 251 };
 252 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 253         _sock_locks("rlock-")
 254 };
 255 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 256         _sock_locks("wlock-")
 257 };
 258 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 259         _sock_locks("elock-")
 260 };
 261
 262 /*
 263  * sk_callback_lock and sk queues locking rules are per-address-family,
 264  * so split the lock classes by using a per-AF key:
 265  */
 266 static struct lock_class_key af_callback_keys[AF_MAX];
 267 static struct lock_class_key af_rlock_keys[AF_MAX];
 268 static struct lock_class_key af_wlock_keys[AF_MAX];
 269 static struct lock_class_key af_elock_keys[AF_MAX];
 270 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 271
 272 /* Run time adjustable parameters. */
 273 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 274 EXPORT_SYMBOL(sysctl_wmem_max);
 275 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 276 EXPORT_SYMBOL(sysctl_rmem_max);
 277 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 278 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 279
 280 /* Maximal space eaten by iovec or ancillary data plus some space */
 281 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 282 EXPORT_SYMBOL(sysctl_optmem_max);
 283
 284 int sysctl_tstamp_allow_data __read_mostly = 1;
 285
 286 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 287 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 288
 289 /**
 290  * sk_set_memalloc - sets %SOCK_MEMALLOC
 291  * @sk: socket to set it on
 292  *
 293  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 294  * It's the responsibility of the admin to adjust min_free_kbytes
 295  * to meet the requirements
 296  */
 297 void sk_set_memalloc(struct sock *sk)
 298 {
 299         sock_set_flag(sk, SOCK_MEMALLOC);
 300         sk->sk_allocation |= __GFP_MEMALLOC;
 301         static_branch_inc(&memalloc_socks_key);
 302 }
 303 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 304
 305 void sk_clear_memalloc(struct sock *sk)
 306 {
 307         sock_reset_flag(sk, SOCK_MEMALLOC);
 308         sk->sk_allocation &= ~__GFP_MEMALLOC;
 309         static_branch_dec(&memalloc_socks_key);
 310
 311         /*
 312          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 313          * progress of swapping. SOCK_MEMALLOC may be cleared while
 314          * it has rmem allocations due to the last swapfile being deactivated
 315          * but there is a risk that the socket is unusable due to exceeding
 316          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 317          */
 318         sk_mem_reclaim(sk);
 319 }
 320 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 321
 322 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 323 {
 324         int ret;
 325         unsigned int noreclaim_flag;
 326
 327         /* these should have been dropped before queueing */
 328         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 329
 330         noreclaim_flag = memalloc_noreclaim_save();
 331         ret = sk->sk_backlog_rcv(sk, skb);
 332         memalloc_noreclaim_restore(noreclaim_flag);
 333
 334         return ret;
 335 }
 336 EXPORT_SYMBOL(__sk_backlog_rcv);
 337
 338 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 339 {
 340         struct timeval tv;
 341
 342         if (optlen < sizeof(tv))
 343                 return -EINVAL;
 344         if (copy_from_user(&tv, optval, sizeof(tv)))
 345                 return -EFAULT;
 346         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 347                 return -EDOM;
 348
 349         if (tv.tv_sec < 0) {
 350                 static int warned __read_mostly;
 351
 352                 *timeo_p = 0;
 353                 if (warned < 10 && net_ratelimit()) {
 354                         warned++;
 355                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 356                                 __func__, current->comm, task_pid_nr(current));
 357                 }
 358                 return 0;
 359         }
 360         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 361         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 362                 return 0;
 363         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 364                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 365         return 0;
 366 }
 367
 368 static void sock_warn_obsolete_bsdism(const char *name)
 369 {
 370         static int warned;
 371         static char warncomm[TASK_COMM_LEN];
 372         if (strcmp(warncomm, current->comm) && warned < 5) {
 373                 strcpy(warncomm,  current->comm);
 374                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 375                         warncomm, name);
 376                 warned++;
 377         }
 378 }
 379
 380 static bool sock_needs_netstamp(const struct sock *sk)
 381 {
 382         switch (sk->sk_family) {
 383         case AF_UNSPEC:
 384         case AF_UNIX:
 385                 return false;
 386         default:
 387                 return true;
 388         }
 389 }
 390
 391 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 392 {
 393         if (sk->sk_flags & flags) {
 394                 sk->sk_flags &= ~flags;
 395                 if (sock_needs_netstamp(sk) &&
 396                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 397                         net_disable_timestamp();
 398         }
 399 }
 400
 401
 402 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 403 {
 404         unsigned long flags;
 405         struct sk_buff_head *list = &sk->sk_receive_queue;
 406
 407         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 408                 atomic_inc(&sk->sk_drops);
 409                 trace_sock_rcvqueue_full(sk, skb);
 410                 return -ENOMEM;
 411         }
 412
 413         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 414                 atomic_inc(&sk->sk_drops);
 415                 return -ENOBUFS;
 416         }
 417
 418         skb->dev = NULL;
 419         skb_set_owner_r(skb, sk);
 420
 421         /* we escape from rcu protected region, make sure we dont leak
 422          * a norefcounted dst
 423          */
 424         skb_dst_force(skb);
 425
 426         spin_lock_irqsave(&list->lock, flags);
 427         sock_skb_set_dropcount(sk, skb);
 428         __skb_queue_tail(list, skb);
 429         spin_unlock_irqrestore(&list->lock, flags);
 430
 431         if (!sock_flag(sk, SOCK_DEAD))
 432                 sk->sk_data_ready(sk);
 433         return 0;
 434 }
 435 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 436
 437 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 438 {
 439         int err;
 440
 441         err = sk_filter(sk, skb);
 442         if (err)
 443                 return err;
 444
 445         return __sock_queue_rcv_skb(sk, skb);
 446 }
 447 EXPORT_SYMBOL(sock_queue_rcv_skb);
 448
 449 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 450                      const int nested, unsigned int trim_cap, bool refcounted)
 451 {
 452         int rc = NET_RX_SUCCESS;
 453
 454         if (sk_filter_trim_cap(sk, skb, trim_cap))
 455                 goto discard_and_relse;
 456
 457         skb->dev = NULL;
 458
 459         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 460                 atomic_inc(&sk->sk_drops);
 461                 goto discard_and_relse;
 462         }
 463         if (nested)
 464                 bh_lock_sock_nested(sk);
 465         else
 466                 bh_lock_sock(sk);
 467         if (!sock_owned_by_user(sk)) {
 468                 /*
 469                  * trylock + unlock semantics:
 470                  */
 471                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 472
 473                 rc = sk_backlog_rcv(sk, skb);
 474
 475                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 476         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 477                 bh_unlock_sock(sk);
 478                 atomic_inc(&sk->sk_drops);
 479                 goto discard_and_relse;
 480         }
 481
 482         bh_unlock_sock(sk);
 483 out:
 484         if (refcounted)
 485                 sock_put(sk);
 486         return rc;
 487 discard_and_relse:
 488         kfree_skb(skb);
 489         goto out;
 490 }
 491 EXPORT_SYMBOL(__sk_receive_skb);
 492
 493 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 494 {
 495         struct dst_entry *dst = __sk_dst_get(sk);
 496
 497         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 498                 sk_tx_queue_clear(sk);
 499                 sk->sk_dst_pending_confirm = 0;
 500                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 501                 dst_release(dst);
 502                 return NULL;
 503         }
 504
 505         return dst;
 506 }
 507 EXPORT_SYMBOL(__sk_dst_check);
 508
 509 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 510 {
 511         struct dst_entry *dst = sk_dst_get(sk);
 512
 513         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 514                 sk_dst_reset(sk);
 515                 dst_release(dst);
 516                 return NULL;
 517         }
 518
 519         return dst;
 520 }
 521 EXPORT_SYMBOL(sk_dst_check);
 522
 523 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 524                                 int optlen)
 525 {
 526         int ret = -ENOPROTOOPT;
 527 #ifdef CONFIG_NETDEVICES
 528         struct net *net = sock_net(sk);
 529         char devname[IFNAMSIZ];
 530         int index;
 531
 532         /* Sorry... */
 533         ret = -EPERM;
 534         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 535                 goto out;
 536
 537         ret = -EINVAL;
 538         if (optlen < 0)
 539                 goto out;
 540
 541         /* Bind this socket to a particular device like "eth0",
 542          * as specified in the passed interface name. If the
 543          * name is "" or the option length is zero the socket
 544          * is not bound.
 545          */
 546         if (optlen > IFNAMSIZ - 1)
 547                 optlen = IFNAMSIZ - 1;
 548         memset(devname, 0, sizeof(devname));
 549
 550         ret = -EFAULT;
 551         if (copy_from_user(devname, optval, optlen))
 552                 goto out;
 553
 554         index = 0;
 555         if (devname[0] != '\0') {
 556                 struct net_device *dev;
 557
 558                 rcu_read_lock();
 559                 dev = dev_get_by_name_rcu(net, devname);
 560                 if (dev)
 561                         index = dev->ifindex;
 562                 rcu_read_unlock();
 563                 ret = -ENODEV;
 564                 if (!dev)
 565                         goto out;
 566         }
 567
 568         lock_sock(sk);
 569         sk->sk_bound_dev_if = index;
 570         sk_dst_reset(sk);
 571         release_sock(sk);
 572
 573         ret = 0;
 574
 575 out:
 576 #endif
 577
 578         return ret;
 579 }
 580
 581 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 582                                 int __user *optlen, int len)
 583 {
 584         int ret = -ENOPROTOOPT;
 585 #ifdef CONFIG_NETDEVICES
 586         struct net *net = sock_net(sk);
 587         char devname[IFNAMSIZ];
 588
 589         if (sk->sk_bound_dev_if == 0) {
 590                 len = 0;
 591                 goto zero;
 592         }
 593
 594         ret = -EINVAL;
 595         if (len < IFNAMSIZ)
 596                 goto out;
 597
 598         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 599         if (ret)
 600                 goto out;
 601
 602         len = strlen(devname) + 1;
 603
 604         ret = -EFAULT;
 605         if (copy_to_user(optval, devname, len))
 606                 goto out;
 607
 608 zero:
 609         ret = -EFAULT;
 610         if (put_user(len, optlen))
 611                 goto out;
 612
 613         ret = 0;
 614
 615 out:
 616 #endif
 617
 618         return ret;
 619 }
 620
 621 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 622 {
 623         if (valbool)
 624                 sock_set_flag(sk, bit);
 625         else
 626                 sock_reset_flag(sk, bit);
 627 }
 628
 629 bool sk_mc_loop(struct sock *sk)
 630 {
 631         if (dev_recursion_level())
 632                 return false;
 633         if (!sk)
 634                 return true;
 635         switch (sk->sk_family) {
 636         case AF_INET:
 637                 return inet_sk(sk)->mc_loop;
 638 #if IS_ENABLED(CONFIG_IPV6)
 639         case AF_INET6:
 640                 return inet6_sk(sk)->mc_loop;
 641 #endif
 642         }
 643         WARN_ON_ONCE(1);
 644         return true;
 645 }
 646 EXPORT_SYMBOL(sk_mc_loop);
 647
 648 /*
 649  *      This is meant for all protocols to use and covers goings on
 650  *      at the socket level. Everything here is generic.
 651  */
 652
 653 int sock_setsockopt(struct socket *sock, int level, int optname,
 654                     char __user *optval, unsigned int optlen)
 655 {
 656         struct sock_txtime sk_txtime;
 657         struct sock *sk = sock->sk;
 658         int val;
 659         int valbool;
 660         struct linger ling;
 661         int ret = 0;
 662
 663         /*
 664          *      Options without arguments
 665          */
 666
 667         if (optname == SO_BINDTODEVICE)
 668                 return sock_setbindtodevice(sk, optval, optlen);
 669
 670         if (optlen < sizeof(int))
 671                 return -EINVAL;
 672
 673         if (get_user(val, (int __user *)optval))
 674                 return -EFAULT;
 675
 676         valbool = val ? 1 : 0;
 677
 678         lock_sock(sk);
 679
 680         switch (optname) {
 681         case SO_DEBUG:
 682                 if (val && !capable(CAP_NET_ADMIN))
 683                         ret = -EACCES;
 684                 else
 685                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 686                 break;
 687         case SO_REUSEADDR:
 688                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 689                 break;
 690         case SO_REUSEPORT:
 691                 sk->sk_reuseport = valbool;
 692                 break;
 693         case SO_TYPE:
 694         case SO_PROTOCOL:
 695         case SO_DOMAIN:
 696         case SO_ERROR:
 697                 ret = -ENOPROTOOPT;
 698                 break;
 699         case SO_DONTROUTE:
 700                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 701                 sk_dst_reset(sk);
 702                 break;
 703         case SO_BROADCAST:
 704                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 705                 break;
 706         case SO_SNDBUF:
 707                 /* Don't error on this BSD doesn't and if you think
 708                  * about it this is right. Otherwise apps have to
 709                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 710                  * are treated in BSD as hints
 711                  */
 712                 val = min_t(u32, val, sysctl_wmem_max);
 713 set_sndbuf:
 714                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 715                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 716                 /* Wake up sending tasks if we upped the value. */
 717                 sk->sk_write_space(sk);
 718                 break;
 719
 720         case SO_SNDBUFFORCE:
 721                 if (!capable(CAP_NET_ADMIN)) {
 722                         ret = -EPERM;
 723                         break;
 724                 }
 725                 goto set_sndbuf;
 726
 727         case SO_RCVBUF:
 728                 /* Don't error on this BSD doesn't and if you think
 729                  * about it this is right. Otherwise apps have to
 730                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 731                  * are treated in BSD as hints
 732                  */
 733                 val = min_t(u32, val, sysctl_rmem_max);
 734 set_rcvbuf:
 735                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 736                 /*
 737                  * We double it on the way in to account for
 738                  * "struct sk_buff" etc. overhead.   Applications
 739                  * assume that the SO_RCVBUF setting they make will
 740                  * allow that much actual data to be received on that
 741                  * socket.
 742                  *
 743                  * Applications are unaware that "struct sk_buff" and
 744                  * other overheads allocate from the receive buffer
 745                  * during socket buffer allocation.
 746                  *
 747                  * And after considering the possible alternatives,
 748                  * returning the value we actually used in getsockopt
 749                  * is the most desirable behavior.
 750                  */
 751                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 752                 break;
 753
 754         case SO_RCVBUFFORCE:
 755                 if (!capable(CAP_NET_ADMIN)) {
 756                         ret = -EPERM;
 757                         break;
 758                 }
 759                 goto set_rcvbuf;
 760
 761         case SO_KEEPALIVE:
 762                 if (sk->sk_prot->keepalive)
 763                         sk->sk_prot->keepalive(sk, valbool);
 764                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 765                 break;
 766
 767         case SO_OOBINLINE:
 768                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 769                 break;
 770
 771         case SO_NO_CHECK:
 772                 sk->sk_no_check_tx = valbool;
 773                 break;
 774
 775         case SO_PRIORITY:
 776                 if ((val >= 0 && val <= 6) ||
 777                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 778                         sk->sk_priority = val;
 779                 else
 780                         ret = -EPERM;
 781                 break;
 782
 783         case SO_LINGER:
 784                 if (optlen < sizeof(ling)) {
 785                         ret = -EINVAL;  /* 1003.1g */
 786                         break;
 787                 }
 788                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 789                         ret = -EFAULT;
 790                         break;
 791                 }
 792                 if (!ling.l_onoff)
 793                         sock_reset_flag(sk, SOCK_LINGER);
 794                 else {
 795 #if (BITS_PER_LONG == 32)
 796                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 797                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 798                         else
 799 #endif
 800                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 801                         sock_set_flag(sk, SOCK_LINGER);
 802                 }
 803                 break;
 804
 805         case SO_BSDCOMPAT:
 806                 sock_warn_obsolete_bsdism("setsockopt");
 807                 break;
 808
 809         case SO_PASSCRED:
 810                 if (valbool)
 811                         set_bit(SOCK_PASSCRED, &sock->flags);
 812                 else
 813                         clear_bit(SOCK_PASSCRED, &sock->flags);
 814                 break;
 815
 816         case SO_TIMESTAMP:
 817         case SO_TIMESTAMPNS:
 818                 if (valbool)  {
 819                         if (optname == SO_TIMESTAMP)
 820                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 821                         else
 822                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 823                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 824                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 825                 } else {
 826                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 827                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 828                 }
 829                 break;
 830
 831         case SO_TIMESTAMPING:
 832                 if (val & ~SOF_TIMESTAMPING_MASK) {
 833                         ret = -EINVAL;
 834                         break;
 835                 }
 836
 837                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 838                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 839                         if (sk->sk_protocol == IPPROTO_TCP &&
 840                             sk->sk_type == SOCK_STREAM) {
 841                                 if ((1 << sk->sk_state) &
 842                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 843                                         ret = -EINVAL;
 844                                         break;
 845                                 }
 846                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 847                         } else {
 848                                 sk->sk_tskey = 0;
 849                         }
 850                 }
 851
 852                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 853                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 854                         ret = -EINVAL;
 855                         break;
 856                 }
 857
 858                 sk->sk_tsflags = val;
 859                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 860                         sock_enable_timestamp(sk,
 861                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 862                 else
 863                         sock_disable_timestamp(sk,
 864                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 865                 break;
 866
 867         case SO_RCVLOWAT:
 868                 if (val < 0)
 869                         val = INT_MAX;
 870                 if (sock->ops->set_rcvlowat)
 871                         ret = sock->ops->set_rcvlowat(sk, val);
 872                 else
 873                         sk->sk_rcvlowat = val ? : 1;
 874                 break;
 875
 876         case SO_RCVTIMEO:
 877                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 878                 break;
 879
 880         case SO_SNDTIMEO:
 881                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 882                 break;
 883
 884         case SO_ATTACH_FILTER:
 885                 ret = -EINVAL;
 886                 if (optlen == sizeof(struct sock_fprog)) {
 887                         struct sock_fprog fprog;
 888
 889                         ret = -EFAULT;
 890                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 891                                 break;
 892
 893                         ret = sk_attach_filter(&fprog, sk);
 894                 }
 895                 break;
 896
 897         case SO_ATTACH_BPF:
 898                 ret = -EINVAL;
 899                 if (optlen == sizeof(u32)) {
 900                         u32 ufd;
 901
 902                         ret = -EFAULT;
 903                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 904                                 break;
 905
 906                         ret = sk_attach_bpf(ufd, sk);
 907                 }
 908                 break;
 909
 910         case SO_ATTACH_REUSEPORT_CBPF:
 911                 ret = -EINVAL;
 912                 if (optlen == sizeof(struct sock_fprog)) {
 913                         struct sock_fprog fprog;
 914
 915                         ret = -EFAULT;
 916                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 917                                 break;
 918
 919                         ret = sk_reuseport_attach_filter(&fprog, sk);
 920                 }
 921                 break;
 922
 923         case SO_ATTACH_REUSEPORT_EBPF:
 924                 ret = -EINVAL;
 925                 if (optlen == sizeof(u32)) {
 926                         u32 ufd;
 927
 928                         ret = -EFAULT;
 929                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 930                                 break;
 931
 932                         ret = sk_reuseport_attach_bpf(ufd, sk);
 933                 }
 934                 break;
 935
 936         case SO_DETACH_FILTER:
 937                 ret = sk_detach_filter(sk);
 938                 break;
 939
 940         case SO_LOCK_FILTER:
 941                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 942                         ret = -EPERM;
 943                 else
 944                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 945                 break;
 946
 947         case SO_PASSSEC:
 948                 if (valbool)
 949                         set_bit(SOCK_PASSSEC, &sock->flags);
 950                 else
 951                         clear_bit(SOCK_PASSSEC, &sock->flags);
 952                 break;
 953         case SO_MARK:
 954                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 955                         ret = -EPERM;
 956                 else
 957                         sk->sk_mark = val;
 958                 break;
 959
 960         case SO_RXQ_OVFL:
 961                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 962                 break;
 963
 964         case SO_WIFI_STATUS:
 965                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 966                 break;
 967
 968         case SO_PEEK_OFF:
 969                 if (sock->ops->set_peek_off)
 970                         ret = sock->ops->set_peek_off(sk, val);
 971                 else
 972                         ret = -EOPNOTSUPP;
 973                 break;
 974
 975         case SO_NOFCS:
 976                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 977                 break;
 978
 979         case SO_SELECT_ERR_QUEUE:
 980                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 981                 break;
 982
 983 #ifdef CONFIG_NET_RX_BUSY_POLL
 984         case SO_BUSY_POLL:
 985                 /* allow unprivileged users to decrease the value */
 986                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 987                         ret = -EPERM;
 988                 else {
 989                         if (val < 0)
 990                                 ret = -EINVAL;
 991                         else
 992                                 WRITE_ONCE(sk->sk_ll_usec, val);
 993                 }
 994                 break;
 995 #endif
 996
 997         case SO_MAX_PACING_RATE:
 998                 if (val != ~0U)
 999                         cmpxchg(&sk->sk_pacing_status,
1000                                 SK_PACING_NONE,
1001                                 SK_PACING_NEEDED);
1002                 sk->sk_max_pacing_rate = val;
1003                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1004                                          sk->sk_max_pacing_rate);
1005                 break;
1006
1007         case SO_INCOMING_CPU:
1008                 WRITE_ONCE(sk->sk_incoming_cpu, val);
1009                 break;
1010
1011         case SO_CNX_ADVICE:
1012                 if (val == 1)
1013                         dst_negative_advice(sk);
1014                 break;
1015
1016         case SO_ZEROCOPY:
1017                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1018                         if (sk->sk_protocol != IPPROTO_TCP)
1019                                 ret = -ENOTSUPP;
1020                 } else if (sk->sk_family != PF_RDS) {
1021                         ret = -ENOTSUPP;
1022                 }
1023                 if (!ret) {
1024                         if (val < 0 || val > 1)
1025                                 ret = -EINVAL;
1026                         else
1027                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1028                 }
1029                 break;
1030
1031         case SO_TXTIME:
1032                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1033                         ret = -EPERM;
1034                 } else if (optlen != sizeof(struct sock_txtime)) {
1035                         ret = -EINVAL;
1036                 } else if (copy_from_user(&sk_txtime, optval,
1037                            sizeof(struct sock_txtime))) {
1038                         ret = -EFAULT;
1039                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1040                         ret = -EINVAL;
1041                 } else {
1042                         sock_valbool_flag(sk, SOCK_TXTIME, true);
1043                         sk->sk_clockid = sk_txtime.clockid;
1044                         sk->sk_txtime_deadline_mode =
1045                                 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1046                         sk->sk_txtime_report_errors =
1047                                 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1048                 }
1049                 break;
1050
1051         default:
1052                 ret = -ENOPROTOOPT;
1053                 break;
1054         }
1055         release_sock(sk);
1056         return ret;
1057 }
1058 EXPORT_SYMBOL(sock_setsockopt);
1059
1060 static const struct cred *sk_get_peer_cred(struct sock *sk)
1061 {
1062         const struct cred *cred;
1063
1064         spin_lock(&sk->sk_peer_lock);
1065         cred = get_cred(sk->sk_peer_cred);
1066         spin_unlock(&sk->sk_peer_lock);
1067
1068         return cred;
1069 }
1070
1071 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1072                           struct ucred *ucred)
1073 {
1074         ucred->pid = pid_vnr(pid);
1075         ucred->uid = ucred->gid = -1;
1076         if (cred) {
1077                 struct user_namespace *current_ns = current_user_ns();
1078
1079                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1080                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1081         }
1082 }
1083
1084 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1085 {
1086         struct user_namespace *user_ns = current_user_ns();
1087         int i;
1088
1089         for (i = 0; i < src->ngroups; i++)
1090                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1091                         return -EFAULT;
1092
1093         return 0;
1094 }
1095
1096 int sock_getsockopt(struct socket *sock, int level, int optname,
1097                     char __user *optval, int __user *optlen)
1098 {
1099         struct sock *sk = sock->sk;
1100
1101         union {
1102                 int val;
1103                 u64 val64;
1104                 struct linger ling;
1105                 struct timeval tm;
1106                 struct sock_txtime txtime;
1107         } v;
1108
1109         int lv = sizeof(int);
1110         int len;
1111
1112         if (get_user(len, optlen))
1113                 return -EFAULT;
1114         if (len < 0)
1115                 return -EINVAL;
1116
1117         memset(&v, 0, sizeof(v));
1118
1119         switch (optname) {
1120         case SO_DEBUG:
1121                 v.val = sock_flag(sk, SOCK_DBG);
1122                 break;
1123
1124         case SO_DONTROUTE:
1125                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1126                 break;
1127
1128         case SO_BROADCAST:
1129                 v.val = sock_flag(sk, SOCK_BROADCAST);
1130                 break;
1131
1132         case SO_SNDBUF:
1133                 v.val = sk->sk_sndbuf;
1134                 break;
1135
1136         case SO_RCVBUF:
1137                 v.val = sk->sk_rcvbuf;
1138                 break;
1139
1140         case SO_REUSEADDR:
1141                 v.val = sk->sk_reuse;
1142                 break;
1143
1144         case SO_REUSEPORT:
1145                 v.val = sk->sk_reuseport;
1146                 break;
1147
1148         case SO_KEEPALIVE:
1149                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1150                 break;
1151
1152         case SO_TYPE:
1153                 v.val = sk->sk_type;
1154                 break;
1155
1156         case SO_PROTOCOL:
1157                 v.val = sk->sk_protocol;
1158                 break;
1159
1160         case SO_DOMAIN:
1161                 v.val = sk->sk_family;
1162                 break;
1163
1164         case SO_ERROR:
1165                 v.val = -sock_error(sk);
1166                 if (v.val == 0)
1167                         v.val = xchg(&sk->sk_err_soft, 0);
1168                 break;
1169
1170         case SO_OOBINLINE:
1171                 v.val = sock_flag(sk, SOCK_URGINLINE);
1172                 break;
1173
1174         case SO_NO_CHECK:
1175                 v.val = sk->sk_no_check_tx;
1176                 break;
1177
1178         case SO_PRIORITY:
1179                 v.val = sk->sk_priority;
1180                 break;
1181
1182         case SO_LINGER:
1183                 lv              = sizeof(v.ling);
1184                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1185                 v.ling.l_linger = sk->sk_lingertime / HZ;
1186                 break;
1187
1188         case SO_BSDCOMPAT:
1189                 sock_warn_obsolete_bsdism("getsockopt");
1190                 break;
1191
1192         case SO_TIMESTAMP:
1193                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1194                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1195                 break;
1196
1197         case SO_TIMESTAMPNS:
1198                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1199                 break;
1200
1201         case SO_TIMESTAMPING:
1202                 v.val = sk->sk_tsflags;
1203                 break;
1204
1205         case SO_RCVTIMEO:
1206                 lv = sizeof(struct timeval);
1207                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1208                         v.tm.tv_sec = 0;
1209                         v.tm.tv_usec = 0;
1210                 } else {
1211                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1212                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1213                 }
1214                 break;
1215
1216         case SO_SNDTIMEO:
1217                 lv = sizeof(struct timeval);
1218                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1219                         v.tm.tv_sec = 0;
1220                         v.tm.tv_usec = 0;
1221                 } else {
1222                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1223                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1224                 }
1225                 break;
1226
1227         case SO_RCVLOWAT:
1228                 v.val = sk->sk_rcvlowat;
1229                 break;
1230
1231         case SO_SNDLOWAT:
1232                 v.val = 1;
1233                 break;
1234
1235         case SO_PASSCRED:
1236                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1237                 break;
1238
1239         case SO_PEERCRED:
1240         {
1241                 struct ucred peercred;
1242                 if (len > sizeof(peercred))
1243                         len = sizeof(peercred);
1244
1245                 spin_lock(&sk->sk_peer_lock);
1246                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1247                 spin_unlock(&sk->sk_peer_lock);
1248
1249                 if (copy_to_user(optval, &peercred, len))
1250                         return -EFAULT;
1251                 goto lenout;
1252         }
1253
1254         case SO_PEERGROUPS:
1255         {
1256                 const struct cred *cred;
1257                 int ret, n;
1258
1259                 cred = sk_get_peer_cred(sk);
1260                 if (!cred)
1261                         return -ENODATA;
1262
1263                 n = cred->group_info->ngroups;
1264                 if (len < n * sizeof(gid_t)) {
1265                         len = n * sizeof(gid_t);
1266                         put_cred(cred);
1267                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1268                 }
1269                 len = n * sizeof(gid_t);
1270
1271                 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1272                 put_cred(cred);
1273                 if (ret)
1274                         return ret;
1275                 goto lenout;
1276         }
1277
1278         case SO_PEERNAME:
1279         {
1280                 char address[128];
1281
1282                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1283                 if (lv < 0)
1284                         return -ENOTCONN;
1285                 if (lv < len)
1286                         return -EINVAL;
1287                 if (copy_to_user(optval, address, len))
1288                         return -EFAULT;
1289                 goto lenout;
1290         }
1291
1292         /* Dubious BSD thing... Probably nobody even uses it, but
1293          * the UNIX standard wants it for whatever reason... -DaveM
1294          */
1295         case SO_ACCEPTCONN:
1296                 v.val = sk->sk_state == TCP_LISTEN;
1297                 break;
1298
1299         case SO_PASSSEC:
1300                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1301                 break;
1302
1303         case SO_PEERSEC:
1304                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1305
1306         case SO_MARK:
1307                 v.val = sk->sk_mark;
1308                 break;
1309
1310         case SO_RXQ_OVFL:
1311                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1312                 break;
1313
1314         case SO_WIFI_STATUS:
1315                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1316                 break;
1317
1318         case SO_PEEK_OFF:
1319                 if (!sock->ops->set_peek_off)
1320                         return -EOPNOTSUPP;
1321
1322                 v.val = sk->sk_peek_off;
1323                 break;
1324         case SO_NOFCS:
1325                 v.val = sock_flag(sk, SOCK_NOFCS);
1326                 break;
1327
1328         case SO_BINDTODEVICE:
1329                 return sock_getbindtodevice(sk, optval, optlen, len);
1330
1331         case SO_GET_FILTER:
1332                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1333                 if (len < 0)
1334                         return len;
1335
1336                 goto lenout;
1337
1338         case SO_LOCK_FILTER:
1339                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1340                 break;
1341
1342         case SO_BPF_EXTENSIONS:
1343                 v.val = bpf_tell_extensions();
1344                 break;
1345
1346         case SO_SELECT_ERR_QUEUE:
1347                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1348                 break;
1349
1350 #ifdef CONFIG_NET_RX_BUSY_POLL
1351         case SO_BUSY_POLL:
1352                 v.val = sk->sk_ll_usec;
1353                 break;
1354 #endif
1355
1356         case SO_MAX_PACING_RATE:
1357                 v.val = sk->sk_max_pacing_rate;
1358                 break;
1359
1360         case SO_INCOMING_CPU:
1361                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1362                 break;
1363
1364         case SO_MEMINFO:
1365         {
1366                 u32 meminfo[SK_MEMINFO_VARS];
1367
1368                 sk_get_meminfo(sk, meminfo);
1369
1370                 len = min_t(unsigned int, len, sizeof(meminfo));
1371                 if (copy_to_user(optval, &meminfo, len))
1372                         return -EFAULT;
1373
1374                 goto lenout;
1375         }
1376
1377 #ifdef CONFIG_NET_RX_BUSY_POLL
1378         case SO_INCOMING_NAPI_ID:
1379                 v.val = READ_ONCE(sk->sk_napi_id);
1380
1381                 /* aggregate non-NAPI IDs down to 0 */
1382                 if (v.val < MIN_NAPI_ID)
1383                         v.val = 0;
1384
1385                 break;
1386 #endif
1387
1388         case SO_COOKIE:
1389                 lv = sizeof(u64);
1390                 if (len < lv)
1391                         return -EINVAL;
1392                 v.val64 = sock_gen_cookie(sk);
1393                 break;
1394
1395         case SO_ZEROCOPY:
1396                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1397                 break;
1398
1399         case SO_TXTIME:
1400                 lv = sizeof(v.txtime);
1401                 v.txtime.clockid = sk->sk_clockid;
1402                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1403                                   SOF_TXTIME_DEADLINE_MODE : 0;
1404                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1405                                   SOF_TXTIME_REPORT_ERRORS : 0;
1406                 break;
1407
1408         default:
1409                 /* We implement the SO_SNDLOWAT etc to not be settable
1410                  * (1003.1g 7).
1411                  */
1412                 return -ENOPROTOOPT;
1413         }
1414
1415         if (len > lv)
1416                 len = lv;
1417         if (copy_to_user(optval, &v, len))
1418                 return -EFAULT;
1419 lenout:
1420         if (put_user(len, optlen))
1421                 return -EFAULT;
1422         return 0;
1423 }
1424
1425 /*
1426  * Initialize an sk_lock.
1427  *
1428  * (We also register the sk_lock with the lock validator.)
1429  */
1430 static inline void sock_lock_init(struct sock *sk)
1431 {
1432         if (sk->sk_kern_sock)
1433                 sock_lock_init_class_and_name(
1434                         sk,
1435                         af_family_kern_slock_key_strings[sk->sk_family],
1436                         af_family_kern_slock_keys + sk->sk_family,
1437                         af_family_kern_key_strings[sk->sk_family],
1438                         af_family_kern_keys + sk->sk_family);
1439         else
1440                 sock_lock_init_class_and_name(
1441                         sk,
1442                         af_family_slock_key_strings[sk->sk_family],
1443                         af_family_slock_keys + sk->sk_family,
1444                         af_family_key_strings[sk->sk_family],
1445                         af_family_keys + sk->sk_family);
1446 }
1447
1448 /*
1449  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1450  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1451  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1452  */
1453 static void sock_copy(struct sock *nsk, const struct sock *osk)
1454 {
1455 #ifdef CONFIG_SECURITY_NETWORK
1456         void *sptr = nsk->sk_security;
1457 #endif
1458         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1459
1460         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1461                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1462
1463 #ifdef CONFIG_SECURITY_NETWORK
1464         nsk->sk_security = sptr;
1465         security_sk_clone(osk, nsk);
1466 #endif
1467 }
1468
1469 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1470                 int family)
1471 {
1472         struct sock *sk;
1473         struct kmem_cache *slab;
1474
1475         slab = prot->slab;
1476         if (slab != NULL) {
1477                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1478                 if (!sk)
1479                         return sk;
1480                 if (priority & __GFP_ZERO)
1481                         sk_prot_clear_nulls(sk, prot->obj_size);
1482         } else
1483                 sk = kmalloc(prot->obj_size, priority);
1484
1485         if (sk != NULL) {
1486                 if (security_sk_alloc(sk, family, priority))
1487                         goto out_free;
1488
1489                 if (!try_module_get(prot->owner))
1490                         goto out_free_sec;
1491                 sk_tx_queue_clear(sk);
1492         }
1493
1494         return sk;
1495
1496 out_free_sec:
1497         security_sk_free(sk);
1498 out_free:
1499         if (slab != NULL)
1500                 kmem_cache_free(slab, sk);
1501         else
1502                 kfree(sk);
1503         return NULL;
1504 }
1505
1506 static void sk_prot_free(struct proto *prot, struct sock *sk)
1507 {
1508         struct kmem_cache *slab;
1509         struct module *owner;
1510
1511         owner = prot->owner;
1512         slab = prot->slab;
1513
1514         cgroup_sk_free(&sk->sk_cgrp_data);
1515         mem_cgroup_sk_free(sk);
1516         security_sk_free(sk);
1517         if (slab != NULL)
1518                 kmem_cache_free(slab, sk);
1519         else
1520                 kfree(sk);
1521         module_put(owner);
1522 }
1523
1524 /**
1525  *      sk_alloc - All socket objects are allocated here
1526  *      @net: the applicable net namespace
1527  *      @family: protocol family
1528  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1529  *      @prot: struct proto associated with this new sock instance
1530  *      @kern: is this to be a kernel socket?
1531  */
1532 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1533                       struct proto *prot, int kern)
1534 {
1535         struct sock *sk;
1536
1537         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1538         if (sk) {
1539                 sk->sk_family = family;
1540                 /*
1541                  * See comment in struct sock definition to understand
1542                  * why we need sk_prot_creator -acme
1543                  */
1544                 sk->sk_prot = sk->sk_prot_creator = prot;
1545                 sk->sk_kern_sock = kern;
1546                 sock_lock_init(sk);
1547                 sk->sk_net_refcnt = kern ? 0 : 1;
1548                 if (likely(sk->sk_net_refcnt)) {
1549                         get_net(net);
1550                         sock_inuse_add(net, 1);
1551                 }
1552
1553                 sock_net_set(sk, net);
1554                 refcount_set(&sk->sk_wmem_alloc, 1);
1555
1556                 mem_cgroup_sk_alloc(sk);
1557                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1558                 sock_update_classid(&sk->sk_cgrp_data);
1559                 sock_update_netprioidx(&sk->sk_cgrp_data);
1560                 sk_tx_queue_clear(sk);
1561         }
1562
1563         return sk;
1564 }
1565 EXPORT_SYMBOL(sk_alloc);
1566
1567 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1568  * grace period. This is the case for UDP sockets and TCP listeners.
1569  */
1570 static void __sk_destruct(struct rcu_head *head)
1571 {
1572         struct sock *sk = container_of(head, struct sock, sk_rcu);
1573         struct sk_filter *filter;
1574
1575         if (sk->sk_destruct)
1576                 sk->sk_destruct(sk);
1577
1578         filter = rcu_dereference_check(sk->sk_filter,
1579                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1580         if (filter) {
1581                 sk_filter_uncharge(sk, filter);
1582                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1583         }
1584
1585         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1586
1587         if (atomic_read(&sk->sk_omem_alloc))
1588                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1589                          __func__, atomic_read(&sk->sk_omem_alloc));
1590
1591         if (sk->sk_frag.page) {
1592                 put_page(sk->sk_frag.page);
1593                 sk->sk_frag.page = NULL;
1594         }
1595
1596         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1597         put_cred(sk->sk_peer_cred);
1598         put_pid(sk->sk_peer_pid);
1599
1600         if (likely(sk->sk_net_refcnt))
1601                 put_net(sock_net(sk));
1602         sk_prot_free(sk->sk_prot_creator, sk);
1603 }
1604
1605 void sk_destruct(struct sock *sk)
1606 {
1607         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1608
1609         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1610                 reuseport_detach_sock(sk);
1611                 use_call_rcu = true;
1612         }
1613
1614         if (use_call_rcu)
1615                 call_rcu(&sk->sk_rcu, __sk_destruct);
1616         else
1617                 __sk_destruct(&sk->sk_rcu);
1618 }
1619
1620 static void __sk_free(struct sock *sk)
1621 {
1622         if (likely(sk->sk_net_refcnt))
1623                 sock_inuse_add(sock_net(sk), -1);
1624
1625         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1626                 sock_diag_broadcast_destroy(sk);
1627         else
1628                 sk_destruct(sk);
1629 }
1630
1631 void sk_free(struct sock *sk)
1632 {
1633         /*
1634          * We subtract one from sk_wmem_alloc and can know if
1635          * some packets are still in some tx queue.
1636          * If not null, sock_wfree() will call __sk_free(sk) later
1637          */
1638         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1639                 __sk_free(sk);
1640 }
1641 EXPORT_SYMBOL(sk_free);
1642
1643 static void sk_init_common(struct sock *sk)
1644 {
1645         skb_queue_head_init(&sk->sk_receive_queue);
1646         skb_queue_head_init(&sk->sk_write_queue);
1647         skb_queue_head_init(&sk->sk_error_queue);
1648
1649         rwlock_init(&sk->sk_callback_lock);
1650         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1651                         af_rlock_keys + sk->sk_family,
1652                         af_family_rlock_key_strings[sk->sk_family]);
1653         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1654                         af_wlock_keys + sk->sk_family,
1655                         af_family_wlock_key_strings[sk->sk_family]);
1656         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1657                         af_elock_keys + sk->sk_family,
1658                         af_family_elock_key_strings[sk->sk_family]);
1659         lockdep_set_class_and_name(&sk->sk_callback_lock,
1660                         af_callback_keys + sk->sk_family,
1661                         af_family_clock_key_strings[sk->sk_family]);
1662 }
1663
1664 /**
1665  *      sk_clone_lock - clone a socket, and lock its clone
1666  *      @sk: the socket to clone
1667  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1668  *
1669  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1670  */
1671 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1672 {
1673         struct sock *newsk;
1674         bool is_charged = true;
1675
1676         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1677         if (newsk != NULL) {
1678                 struct sk_filter *filter;
1679
1680                 sock_copy(newsk, sk);
1681
1682                 newsk->sk_prot_creator = sk->sk_prot;
1683
1684                 /* SANITY */
1685                 if (likely(newsk->sk_net_refcnt))
1686                         get_net(sock_net(newsk));
1687                 sk_node_init(&newsk->sk_node);
1688                 sock_lock_init(newsk);
1689                 bh_lock_sock(newsk);
1690                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1691                 newsk->sk_backlog.len = 0;
1692
1693                 atomic_set(&newsk->sk_rmem_alloc, 0);
1694                 /*
1695                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1696                  */
1697                 refcount_set(&newsk->sk_wmem_alloc, 1);
1698                 atomic_set(&newsk->sk_omem_alloc, 0);
1699                 sk_init_common(newsk);
1700
1701                 newsk->sk_dst_cache     = NULL;
1702                 newsk->sk_dst_pending_confirm = 0;
1703                 newsk->sk_wmem_queued   = 0;
1704                 newsk->sk_forward_alloc = 0;
1705                 atomic_set(&newsk->sk_drops, 0);
1706                 newsk->sk_send_head     = NULL;
1707                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1708                 atomic_set(&newsk->sk_zckey, 0);
1709
1710                 sock_reset_flag(newsk, SOCK_DONE);
1711
1712                 /* sk->sk_memcg will be populated at accept() time */
1713                 newsk->sk_memcg = NULL;
1714
1715                 cgroup_sk_clone(&newsk->sk_cgrp_data);
1716
1717                 rcu_read_lock();
1718                 filter = rcu_dereference(sk->sk_filter);
1719                 if (filter != NULL)
1720                         /* though it's an empty new sock, the charging may fail
1721                          * if sysctl_optmem_max was changed between creation of
1722                          * original socket and cloning
1723                          */
1724                         is_charged = sk_filter_charge(newsk, filter);
1725                 RCU_INIT_POINTER(newsk->sk_filter, filter);
1726                 rcu_read_unlock();
1727
1728                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1729                         /* We need to make sure that we don't uncharge the new
1730                          * socket if we couldn't charge it in the first place
1731                          * as otherwise we uncharge the parent's filter.
1732                          */
1733                         if (!is_charged)
1734                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1735                         sk_free_unlock_clone(newsk);
1736                         newsk = NULL;
1737                         goto out;
1738                 }
1739                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1740
1741                 newsk->sk_err      = 0;
1742                 newsk->sk_err_soft = 0;
1743                 newsk->sk_priority = 0;
1744                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1745                 atomic64_set(&newsk->sk_cookie, 0);
1746                 if (likely(newsk->sk_net_refcnt))
1747                         sock_inuse_add(sock_net(newsk), 1);
1748
1749                 /*
1750                  * Before updating sk_refcnt, we must commit prior changes to memory
1751                  * (Documentation/RCU/rculist_nulls.txt for details)
1752                  */
1753                 smp_wmb();
1754                 refcount_set(&newsk->sk_refcnt, 2);
1755
1756                 /*
1757                  * Increment the counter in the same struct proto as the master
1758                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1759                  * is the same as sk->sk_prot->socks, as this field was copied
1760                  * with memcpy).
1761                  *
1762                  * This _changes_ the previous behaviour, where
1763                  * tcp_create_openreq_child always was incrementing the
1764                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1765                  * to be taken into account in all callers. -acme
1766                  */
1767                 sk_refcnt_debug_inc(newsk);
1768                 sk_set_socket(newsk, NULL);
1769                 sk_tx_queue_clear(newsk);
1770                 newsk->sk_wq = NULL;
1771
1772                 if (newsk->sk_prot->sockets_allocated)
1773                         sk_sockets_allocated_inc(newsk);
1774
1775                 if (sock_needs_netstamp(sk) &&
1776                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1777                         net_enable_timestamp();
1778         }
1779 out:
1780         return newsk;
1781 }
1782 EXPORT_SYMBOL_GPL(sk_clone_lock);
1783
1784 void sk_free_unlock_clone(struct sock *sk)
1785 {
1786         /* It is still raw copy of parent, so invalidate
1787          * destructor and make plain sk_free() */
1788         sk->sk_destruct = NULL;
1789         bh_unlock_sock(sk);
1790         sk_free(sk);
1791 }
1792 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1793
1794 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1795 {
1796         u32 max_segs = 1;
1797
1798         sk_dst_set(sk, dst);
1799         sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1800         if (sk->sk_route_caps & NETIF_F_GSO)
1801                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1802         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1803         if (sk_can_gso(sk)) {
1804                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1805                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1806                 } else {
1807                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1808                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1809                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1810                 }
1811         }
1812         sk->sk_gso_max_segs = max_segs;
1813 }
1814 EXPORT_SYMBOL_GPL(sk_setup_caps);
1815
1816 /*
1817  *      Simple resource managers for sockets.
1818  */
1819
1820
1821 /*
1822  * Write buffer destructor automatically called from kfree_skb.
1823  */
1824 void sock_wfree(struct sk_buff *skb)
1825 {
1826         struct sock *sk = skb->sk;
1827         unsigned int len = skb->truesize;
1828
1829         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1830                 /*
1831                  * Keep a reference on sk_wmem_alloc, this will be released
1832                  * after sk_write_space() call
1833                  */
1834                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1835                 sk->sk_write_space(sk);
1836                 len = 1;
1837         }
1838         /*
1839          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1840          * could not do because of in-flight packets
1841          */
1842         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1843                 __sk_free(sk);
1844 }
1845 EXPORT_SYMBOL(sock_wfree);
1846
1847 /* This variant of sock_wfree() is used by TCP,
1848  * since it sets SOCK_USE_WRITE_QUEUE.
1849  */
1850 void __sock_wfree(struct sk_buff *skb)
1851 {
1852         struct sock *sk = skb->sk;
1853
1854         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1855                 __sk_free(sk);
1856 }
1857
1858 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1859 {
1860         skb_orphan(skb);
1861         skb->sk = sk;
1862 #ifdef CONFIG_INET
1863         if (unlikely(!sk_fullsock(sk))) {
1864                 skb->destructor = sock_edemux;
1865                 sock_hold(sk);
1866                 return;
1867         }
1868 #endif
1869         skb->destructor = sock_wfree;
1870         skb_set_hash_from_sk(skb, sk);
1871         /*
1872          * We used to take a refcount on sk, but following operation
1873          * is enough to guarantee sk_free() wont free this sock until
1874          * all in-flight packets are completed
1875          */
1876         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1877 }
1878 EXPORT_SYMBOL(skb_set_owner_w);
1879
1880 /* This helper is used by netem, as it can hold packets in its
1881  * delay queue. We want to allow the owner socket to send more
1882  * packets, as if they were already TX completed by a typical driver.
1883  * But we also want to keep skb->sk set because some packet schedulers
1884  * rely on it (sch_fq for example).
1885  */
1886 void skb_orphan_partial(struct sk_buff *skb)
1887 {
1888         if (skb_is_tcp_pure_ack(skb))
1889                 return;
1890
1891         if (skb->destructor == sock_wfree
1892 #ifdef CONFIG_INET
1893             || skb->destructor == tcp_wfree
1894 #endif
1895                 ) {
1896                 struct sock *sk = skb->sk;
1897
1898                 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1899                         WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1900                         skb->destructor = sock_efree;
1901                 }
1902         } else {
1903                 skb_orphan(skb);
1904         }
1905 }
1906 EXPORT_SYMBOL(skb_orphan_partial);
1907
1908 /*
1909  * Read buffer destructor automatically called from kfree_skb.
1910  */
1911 void sock_rfree(struct sk_buff *skb)
1912 {
1913         struct sock *sk = skb->sk;
1914         unsigned int len = skb->truesize;
1915
1916         atomic_sub(len, &sk->sk_rmem_alloc);
1917         sk_mem_uncharge(sk, len);
1918 }
1919 EXPORT_SYMBOL(sock_rfree);
1920
1921 /*
1922  * Buffer destructor for skbs that are not used directly in read or write
1923  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1924  */
1925 void sock_efree(struct sk_buff *skb)
1926 {
1927         sock_put(skb->sk);
1928 }
1929 EXPORT_SYMBOL(sock_efree);
1930
1931 kuid_t sock_i_uid(struct sock *sk)
1932 {
1933         kuid_t uid;
1934
1935         read_lock_bh(&sk->sk_callback_lock);
1936         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1937         read_unlock_bh(&sk->sk_callback_lock);
1938         return uid;
1939 }
1940 EXPORT_SYMBOL(sock_i_uid);
1941
1942 unsigned long sock_i_ino(struct sock *sk)
1943 {
1944         unsigned long ino;
1945
1946         read_lock_bh(&sk->sk_callback_lock);
1947         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1948         read_unlock_bh(&sk->sk_callback_lock);
1949         return ino;
1950 }
1951 EXPORT_SYMBOL(sock_i_ino);
1952
1953 /*
1954  * Allocate a skb from the socket's send buffer.
1955  */
1956 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1957                              gfp_t priority)
1958 {
1959         if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1960                 struct sk_buff *skb = alloc_skb(size, priority);
1961                 if (skb) {
1962                         skb_set_owner_w(skb, sk);
1963                         return skb;
1964                 }
1965         }
1966         return NULL;
1967 }
1968 EXPORT_SYMBOL(sock_wmalloc);
1969
1970 static void sock_ofree(struct sk_buff *skb)
1971 {
1972         struct sock *sk = skb->sk;
1973
1974         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1975 }
1976
1977 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1978                              gfp_t priority)
1979 {
1980         struct sk_buff *skb;
1981
1982         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1983         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1984             sysctl_optmem_max)
1985                 return NULL;
1986
1987         skb = alloc_skb(size, priority);
1988         if (!skb)
1989                 return NULL;
1990
1991         atomic_add(skb->truesize, &sk->sk_omem_alloc);
1992         skb->sk = sk;
1993         skb->destructor = sock_ofree;
1994         return skb;
1995 }
1996
1997 /*
1998  * Allocate a memory block from the socket's option memory buffer.
1999  */
2000 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2001 {
2002         if ((unsigned int)size <= sysctl_optmem_max &&
2003             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2004                 void *mem;
2005                 /* First do the add, to avoid the race if kmalloc
2006                  * might sleep.
2007                  */
2008                 atomic_add(size, &sk->sk_omem_alloc);
2009                 mem = kmalloc(size, priority);
2010                 if (mem)
2011                         return mem;
2012                 atomic_sub(size, &sk->sk_omem_alloc);
2013         }
2014         return NULL;
2015 }
2016 EXPORT_SYMBOL(sock_kmalloc);
2017
2018 /* Free an option memory block. Note, we actually want the inline
2019  * here as this allows gcc to detect the nullify and fold away the
2020  * condition entirely.
2021  */
2022 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2023                                   const bool nullify)
2024 {
2025         if (WARN_ON_ONCE(!mem))
2026                 return;
2027         if (nullify)
2028                 kzfree(mem);
2029         else
2030                 kfree(mem);
2031         atomic_sub(size, &sk->sk_omem_alloc);
2032 }
2033
2034 void sock_kfree_s(struct sock *sk, void *mem, int size)
2035 {
2036         __sock_kfree_s(sk, mem, size, false);
2037 }
2038 EXPORT_SYMBOL(sock_kfree_s);
2039
2040 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2041 {
2042         __sock_kfree_s(sk, mem, size, true);
2043 }
2044 EXPORT_SYMBOL(sock_kzfree_s);
2045
2046 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2047    I think, these locks should be removed for datagram sockets.
2048  */
2049 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2050 {
2051         DEFINE_WAIT(wait);
2052
2053         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2054         for (;;) {
2055                 if (!timeo)
2056                         break;
2057                 if (signal_pending(current))
2058                         break;
2059                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2060                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2061                 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2062                         break;
2063                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2064                         break;
2065                 if (sk->sk_err)
2066                         break;
2067                 timeo = schedule_timeout(timeo);
2068         }
2069         finish_wait(sk_sleep(sk), &wait);
2070         return timeo;
2071 }
2072
2073
2074 /*
2075  *      Generic send/receive buffer handlers
2076  */
2077
2078 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2079                                      unsigned long data_len, int noblock,
2080                                      int *errcode, int max_page_order)
2081 {
2082         struct sk_buff *skb;
2083         long timeo;
2084         int err;
2085
2086         timeo = sock_sndtimeo(sk, noblock);
2087         for (;;) {
2088                 err = sock_error(sk);
2089                 if (err != 0)
2090                         goto failure;
2091
2092                 err = -EPIPE;
2093                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2094                         goto failure;
2095
2096                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2097                         break;
2098
2099                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2100                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2101                 err = -EAGAIN;
2102                 if (!timeo)
2103                         goto failure;
2104                 if (signal_pending(current))
2105                         goto interrupted;
2106                 timeo = sock_wait_for_wmem(sk, timeo);
2107         }
2108         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2109                                    errcode, sk->sk_allocation);
2110         if (skb)
2111                 skb_set_owner_w(skb, sk);
2112         return skb;
2113
2114 interrupted:
2115         err = sock_intr_errno(timeo);
2116 failure:
2117         *errcode = err;
2118         return NULL;
2119 }
2120 EXPORT_SYMBOL(sock_alloc_send_pskb);
2121
/*
 * Linear-only convenience wrapper around sock_alloc_send_pskb(): @size
 * is used as the header length, with no paged data and page order 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2128
2129 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2130                      struct sockcm_cookie *sockc)
2131 {
2132         u32 tsflags;
2133
2134         switch (cmsg->cmsg_type) {
2135         case SO_MARK:
2136                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2137                         return -EPERM;
2138                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2139                         return -EINVAL;
2140                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2141                 break;
2142         case SO_TIMESTAMPING:
2143                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2144                         return -EINVAL;
2145
2146                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2147                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2148                         return -EINVAL;
2149
2150                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2151                 sockc->tsflags |= tsflags;
2152                 break;
2153         case SCM_TXTIME:
2154                 if (!sock_flag(sk, SOCK_TXTIME))
2155                         return -EINVAL;
2156                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2157                         return -EINVAL;
2158                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2159                 break;
2160         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2161         case SCM_RIGHTS:
2162         case SCM_CREDENTIALS:
2163                 break;
2164         default:
2165                 return -EINVAL;
2166         }
2167         return 0;
2168 }
2169 EXPORT_SYMBOL(__sock_cmsg_send);
2170
2171 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2172                    struct sockcm_cookie *sockc)
2173 {
2174         struct cmsghdr *cmsg;
2175         int ret;
2176
2177         for_each_cmsghdr(cmsg, msg) {
2178                 if (!CMSG_OK(msg, cmsg))
2179                         return -EINVAL;
2180                 if (cmsg->cmsg_level != SOL_SOCKET)
2181                         continue;
2182                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2183                 if (ret)
2184                         return ret;
2185         }
2186         return 0;
2187 }
2188 EXPORT_SYMBOL(sock_cmsg_send);
2189
2190 static void sk_enter_memory_pressure(struct sock *sk)
2191 {
2192         if (!sk->sk_prot->enter_memory_pressure)
2193                 return;
2194
2195         sk->sk_prot->enter_memory_pressure(sk);
2196 }
2197
2198 static void sk_leave_memory_pressure(struct sock *sk)
2199 {
2200         if (sk->sk_prot->leave_memory_pressure) {
2201                 sk->sk_prot->leave_memory_pressure(sk);
2202         } else {
2203                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2204
2205                 if (memory_pressure && READ_ONCE(*memory_pressure))
2206                         WRITE_ONCE(*memory_pressure, 0);
2207         }
2208 }
2209
2210 /* On 32bit arches, an skb frag is limited to 2^15 */
2211 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2212
2213 /**
2214  * skb_page_frag_refill - check that a page_frag contains enough room
2215  * @sz: minimum size of the fragment we want to get
2216  * @pfrag: pointer to page_frag
2217  * @gfp: priority for memory allocation
2218  *
2219  * Note: While this allocator tries to use high order pages, there is
2220  * no guarantee that allocations succeed. Therefore, @sz MUST be
2221  * less or equal than PAGE_SIZE.
2222  */
2223 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2224 {
2225         if (pfrag->page) {
2226                 if (page_ref_count(pfrag->page) == 1) {
2227                         pfrag->offset = 0;
2228                         return true;
2229                 }
2230                 if (pfrag->offset + sz <= pfrag->size)
2231                         return true;
2232                 put_page(pfrag->page);
2233         }
2234
2235         pfrag->offset = 0;
2236         if (SKB_FRAG_PAGE_ORDER) {
2237                 /* Avoid direct reclaim but allow kswapd to wake */
2238                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2239                                           __GFP_COMP | __GFP_NOWARN |
2240                                           __GFP_NORETRY,
2241                                           SKB_FRAG_PAGE_ORDER);
2242                 if (likely(pfrag->page)) {
2243                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2244                         return true;
2245                 }
2246         }
2247         pfrag->page = alloc_page(gfp);
2248         if (likely(pfrag->page)) {
2249                 pfrag->size = PAGE_SIZE;
2250                 return true;
2251         }
2252         return false;
2253 }
2254 EXPORT_SYMBOL(skb_page_frag_refill);
2255
2256 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2257 {
2258         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2259                 return true;
2260
2261         sk_enter_memory_pressure(sk);
2262         sk_stream_moderate_sndbuf(sk);
2263         return false;
2264 }
2265 EXPORT_SYMBOL(sk_page_frag_refill);
2266
2267 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2268                 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2269                 int first_coalesce)
2270 {
2271         int sg_curr = *sg_curr_index, use = 0, rc = 0;
2272         unsigned int size = *sg_curr_size;
2273         struct page_frag *pfrag;
2274         struct scatterlist *sge;
2275
2276         len -= size;
2277         pfrag = sk_page_frag(sk);
2278
2279         while (len > 0) {
2280                 unsigned int orig_offset;
2281
2282                 if (!sk_page_frag_refill(sk, pfrag)) {
2283                         rc = -ENOMEM;
2284                         goto out;
2285                 }
2286
2287                 use = min_t(int, len, pfrag->size - pfrag->offset);
2288
2289                 if (!sk_wmem_schedule(sk, use)) {
2290                         rc = -ENOMEM;
2291                         goto out;
2292                 }
2293
2294                 sk_mem_charge(sk, use);
2295                 size += use;
2296                 orig_offset = pfrag->offset;
2297                 pfrag->offset += use;
2298
2299                 sge = sg + sg_curr - 1;
2300                 if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
2301                     sge->offset + sge->length == orig_offset) {
2302                         sge->length += use;
2303                 } else {
2304                         sge = sg + sg_curr;
2305                         sg_unmark_end(sge);
2306                         sg_set_page(sge, pfrag->page, use, orig_offset);
2307                         get_page(pfrag->page);
2308                         sg_curr++;
2309
2310                         if (sg_curr == MAX_SKB_FRAGS)
2311                                 sg_curr = 0;
2312
2313                         if (sg_curr == sg_start) {
2314                                 rc = -ENOSPC;
2315                                 break;
2316                         }
2317                 }
2318
2319                 len -= use;
2320         }
2321 out:
2322         *sg_curr_size = size;
2323         *sg_curr_index = sg_curr;
2324         return rc;
2325 }
2326 EXPORT_SYMBOL(sk_alloc_sg);
2327
2328 static void __lock_sock(struct sock *sk)
2329         __releases(&sk->sk_lock.slock)
2330         __acquires(&sk->sk_lock.slock)
2331 {
2332         DEFINE_WAIT(wait);
2333
2334         for (;;) {
2335                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2336                                         TASK_UNINTERRUPTIBLE);
2337                 spin_unlock_bh(&sk->sk_lock.slock);
2338                 schedule();
2339                 spin_lock_bh(&sk->sk_lock.slock);
2340                 if (!sock_owned_by_user(sk))
2341                         break;
2342         }
2343         finish_wait(&sk->sk_lock.wq, &wait);
2344 }
2345
2346 void __release_sock(struct sock *sk)
2347         __releases(&sk->sk_lock.slock)
2348         __acquires(&sk->sk_lock.slock)
2349 {
2350         struct sk_buff *skb, *next;
2351
2352         while ((skb = sk->sk_backlog.head) != NULL) {
2353                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2354
2355                 spin_unlock_bh(&sk->sk_lock.slock);
2356
2357                 do {
2358                         next = skb->next;
2359                         prefetch(next);
2360                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2361                         skb->next = NULL;
2362                         sk_backlog_rcv(sk, skb);
2363
2364                         cond_resched();
2365
2366                         skb = next;
2367                 } while (skb != NULL);
2368
2369                 spin_lock_bh(&sk->sk_lock.slock);
2370         }
2371
2372         /*
2373          * Doing the zeroing here guarantee we can not loop forever
2374          * while a wild producer attempts to flood us.
2375          */
2376         sk->sk_backlog.len = 0;
2377 }
2378
2379 void __sk_flush_backlog(struct sock *sk)
2380 {
2381         spin_lock_bh(&sk->sk_lock.slock);
2382         __release_sock(sk);
2383         spin_unlock_bh(&sk->sk_lock.slock);
2384 }
2385
2386 /**
2387  * sk_wait_data - wait for data to arrive at sk_receive_queue
2388  * @sk:    sock to wait on
2389  * @timeo: for how long
2390  * @skb:   last skb seen on sk_receive_queue
2391  *
2392  * Now socket state including sk->sk_err is changed only under lock,
2393  * hence we may omit checks after joining wait queue.
2394  * We check receive queue before schedule() only as optimization;
2395  * it is very likely that release_sock() added new data.
2396  */
2397 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2398 {
2399         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2400         int rc;
2401
2402         add_wait_queue(sk_sleep(sk), &wait);
2403         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2404         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2405         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2406         remove_wait_queue(sk_sleep(sk), &wait);
2407         return rc;
2408 }
2409 EXPORT_SYMBOL(sk_wait_data);
2410
2411 /**
2412  *      __sk_mem_raise_allocated - increase memory_allocated
2413  *      @sk: socket
2414  *      @size: memory size to allocate
2415  *      @amt: pages to allocate
2416  *      @kind: allocation type
2417  *
2418  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2419  */
2420 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2421 {
2422         struct proto *prot = sk->sk_prot;
2423         long allocated = sk_memory_allocated_add(sk, amt);
2424         bool charged = true;
2425
2426         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2427             !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2428                 goto suppress_allocation;
2429
2430         /* Under limit. */
2431         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2432                 sk_leave_memory_pressure(sk);
2433                 return 1;
2434         }
2435
2436         /* Under pressure. */
2437         if (allocated > sk_prot_mem_limits(sk, 1))
2438                 sk_enter_memory_pressure(sk);
2439
2440         /* Over hard limit. */
2441         if (allocated > sk_prot_mem_limits(sk, 2))
2442                 goto suppress_allocation;
2443
2444         /* guarantee minimum buffer size under pressure */
2445         if (kind == SK_MEM_RECV) {
2446                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2447                         return 1;
2448
2449         } else { /* SK_MEM_SEND */
2450                 int wmem0 = sk_get_wmem0(sk, prot);
2451
2452                 if (sk->sk_type == SOCK_STREAM) {
2453                         if (sk->sk_wmem_queued < wmem0)
2454                                 return 1;
2455                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2456                                 return 1;
2457                 }
2458         }
2459
2460         if (sk_has_memory_pressure(sk)) {
2461                 u64 alloc;
2462
2463                 if (!sk_under_memory_pressure(sk))
2464                         return 1;
2465                 alloc = sk_sockets_allocated_read_positive(sk);
2466                 if (sk_prot_mem_limits(sk, 2) > alloc *
2467                     sk_mem_pages(sk->sk_wmem_queued +
2468                                  atomic_read(&sk->sk_rmem_alloc) +
2469                                  sk->sk_forward_alloc))
2470                         return 1;
2471         }
2472
2473 suppress_allocation:
2474
2475         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2476                 sk_stream_moderate_sndbuf(sk);
2477
2478                 /* Fail only if socket is _under_ its sndbuf.
2479                  * In this case we cannot block, so that we have to fail.
2480                  */
2481                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2482                         return 1;
2483         }
2484
2485         if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2486                 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2487
2488         sk_memory_allocated_sub(sk, amt);
2489
2490         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2491                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2492
2493         return 0;
2494 }
2495 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2496
2497 /**
2498  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2499  *      @sk: socket
2500  *      @size: memory size to allocate
2501  *      @kind: allocation type
2502  *
2503  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2504  *      rmem allocation. This function assumes that protocols which have
2505  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2506  */
2507 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2508 {
2509         int ret, amt = sk_mem_pages(size);
2510
2511         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2512         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2513         if (!ret)
2514                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2515         return ret;
2516 }
2517 EXPORT_SYMBOL(__sk_mem_schedule);
2518
2519 /**
2520  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2521  *      @sk: socket
2522  *      @amount: number of quanta
2523  *
2524  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2525  */
2526 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2527 {
2528         sk_memory_allocated_sub(sk, amount);
2529
2530         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2531                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2532
2533         if (sk_under_memory_pressure(sk) &&
2534             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2535                 sk_leave_memory_pressure(sk);
2536 }
2537 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2538
2539 /**
2540  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2541  *      @sk: socket
2542  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2543  */
2544 void __sk_mem_reclaim(struct sock *sk, int amount)
2545 {
2546         amount >>= SK_MEM_QUANTUM_SHIFT;
2547         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2548         __sk_mem_reduce_allocated(sk, amount);
2549 }
2550 EXPORT_SYMBOL(__sk_mem_reclaim);
2551
2552 int sk_set_peek_off(struct sock *sk, int val)
2553 {
2554         sk->sk_peek_off = val;
2555         return 0;
2556 }
2557 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2558
2559 /*
2560  * Set of default routines for initialising struct proto_ops when
2561  * the protocol does not support a particular function. In certain
2562  * cases where it makes no sense for a protocol to have a "do nothing"
2563  * function, some default processing is provided.
2564  */
2565
2566 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2567 {
2568         return -EOPNOTSUPP;
2569 }
2570 EXPORT_SYMBOL(sock_no_bind);
2571
2572 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2573                     int len, int flags)
2574 {
2575         return -EOPNOTSUPP;
2576 }
2577 EXPORT_SYMBOL(sock_no_connect);
2578
2579 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2580 {
2581         return -EOPNOTSUPP;
2582 }
2583 EXPORT_SYMBOL(sock_no_socketpair);
2584
2585 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2586                    bool kern)
2587 {
2588         return -EOPNOTSUPP;
2589 }
2590 EXPORT_SYMBOL(sock_no_accept);
2591
2592 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2593                     int peer)
2594 {
2595         return -EOPNOTSUPP;
2596 }
2597 EXPORT_SYMBOL(sock_no_getname);
2598
2599 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2600 {
2601         return -EOPNOTSUPP;
2602 }
2603 EXPORT_SYMBOL(sock_no_ioctl);
2604
2605 int sock_no_listen(struct socket *sock, int backlog)
2606 {
2607         return -EOPNOTSUPP;
2608 }
2609 EXPORT_SYMBOL(sock_no_listen);
2610
2611 int sock_no_shutdown(struct socket *sock, int how)
2612 {
2613         return -EOPNOTSUPP;
2614 }
2615 EXPORT_SYMBOL(sock_no_shutdown);
2616
2617 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2618                     char __user *optval, unsigned int optlen)
2619 {
2620         return -EOPNOTSUPP;
2621 }
2622 EXPORT_SYMBOL(sock_no_setsockopt);
2623
2624 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2625                     char __user *optval, int __user *optlen)
2626 {
2627         return -EOPNOTSUPP;
2628 }
2629 EXPORT_SYMBOL(sock_no_getsockopt);
2630
2631 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2632 {
2633         return -EOPNOTSUPP;
2634 }
2635 EXPORT_SYMBOL(sock_no_sendmsg);
2636
2637 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2638 {
2639         return -EOPNOTSUPP;
2640 }
2641 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2642
2643 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2644                     int flags)
2645 {
2646         return -EOPNOTSUPP;
2647 }
2648 EXPORT_SYMBOL(sock_no_recvmsg);
2649
2650 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2651 {
2652         /* Mirror missing mmap method error code */
2653         return -ENODEV;
2654 }
2655 EXPORT_SYMBOL(sock_no_mmap);
2656
2657 /*
2658  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2659  * various sock-based usage counts.
2660  */
2661 void __receive_sock(struct file *file)
2662 {
2663         struct socket *sock;
2664         int error;
2665
2666         /*
2667          * The resulting value of "error" is ignored here since we only
2668          * need to take action when the file is a socket and testing
2669          * "sock" for NULL is sufficient.
2670          */
2671         sock = sock_from_file(file, &error);
2672         if (sock) {
2673                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2674                 sock_update_classid(&sock->sk->sk_cgrp_data);
2675         }
2676 }
2677
2678 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2679 {
2680         ssize_t res;
2681         struct msghdr msg = {.msg_flags = flags};
2682         struct kvec iov;
2683         char *kaddr = kmap(page);
2684         iov.iov_base = kaddr + offset;
2685         iov.iov_len = size;
2686         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2687         kunmap(page);
2688         return res;
2689 }
2690 EXPORT_SYMBOL(sock_no_sendpage);
2691
2692 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2693                                 int offset, size_t size, int flags)
2694 {
2695         ssize_t res;
2696         struct msghdr msg = {.msg_flags = flags};
2697         struct kvec iov;
2698         char *kaddr = kmap(page);
2699
2700         iov.iov_base = kaddr + offset;
2701         iov.iov_len = size;
2702         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2703         kunmap(page);
2704         return res;
2705 }
2706 EXPORT_SYMBOL(sock_no_sendpage_locked);
2707
2708 /*
2709  *      Default Socket Callbacks
2710  */
2711
2712 static void sock_def_wakeup(struct sock *sk)
2713 {
2714         struct socket_wq *wq;
2715
2716         rcu_read_lock();
2717         wq = rcu_dereference(sk->sk_wq);
2718         if (skwq_has_sleeper(wq))
2719                 wake_up_interruptible_all(&wq->wait);
2720         rcu_read_unlock();
2721 }
2722
2723 static void sock_def_error_report(struct sock *sk)
2724 {
2725         struct socket_wq *wq;
2726
2727         rcu_read_lock();
2728         wq = rcu_dereference(sk->sk_wq);
2729         if (skwq_has_sleeper(wq))
2730                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2731         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2732         rcu_read_unlock();
2733 }
2734
2735 static void sock_def_readable(struct sock *sk)
2736 {
2737         struct socket_wq *wq;
2738
2739         rcu_read_lock();
2740         wq = rcu_dereference(sk->sk_wq);
2741         if (skwq_has_sleeper(wq))
2742                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2743                                                 EPOLLRDNORM | EPOLLRDBAND);
2744         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2745         rcu_read_unlock();
2746 }
2747
2748 static void sock_def_write_space(struct sock *sk)
2749 {
2750         struct socket_wq *wq;
2751
2752         rcu_read_lock();
2753
2754         /* Do not wake up a writer until he can make "significant"
2755          * progress.  --DaveM
2756          */
2757         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2758                 wq = rcu_dereference(sk->sk_wq);
2759                 if (skwq_has_sleeper(wq))
2760                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2761                                                 EPOLLWRNORM | EPOLLWRBAND);
2762
2763                 /* Should agree with poll, otherwise some programs break */
2764                 if (sock_writeable(sk))
2765                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2766         }
2767
2768         rcu_read_unlock();
2769 }
2770
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2774
2775 void sk_send_sigurg(struct sock *sk)
2776 {
2777         if (sk->sk_socket && sk->sk_socket->file)
2778                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2779                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2780 }
2781 EXPORT_SYMBOL(sk_send_sigurg);
2782
/*
 * (Re)arm @timer to fire at @expires. A socket reference is taken when
 * mod_timer() returns zero (taken here to mean the timer was not already
 * pending).
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2790
/*
 * Cancel @timer; when del_timer() returns non-zero, drop a socket
 * reference with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2797
2798 void sock_init_data(struct socket *sock, struct sock *sk)
2799 {
2800         sk_init_common(sk);
2801         sk->sk_send_head        =       NULL;
2802
2803         timer_setup(&sk->sk_timer, NULL, 0);
2804
2805         sk->sk_allocation       =       GFP_KERNEL;
2806         sk->sk_rcvbuf           =       sysctl_rmem_default;
2807         sk->sk_sndbuf           =       sysctl_wmem_default;
2808         sk->sk_state            =       TCP_CLOSE;
2809         sk_set_socket(sk, sock);
2810
2811         sock_set_flag(sk, SOCK_ZAPPED);
2812
2813         if (sock) {
2814                 sk->sk_type     =       sock->type;
2815                 sk->sk_wq       =       sock->wq;
2816                 sock->sk        =       sk;
2817                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2818         } else {
2819                 sk->sk_wq       =       NULL;
2820                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2821         }
2822
2823         rwlock_init(&sk->sk_callback_lock);
2824         if (sk->sk_kern_sock)
2825                 lockdep_set_class_and_name(
2826                         &sk->sk_callback_lock,
2827                         af_kern_callback_keys + sk->sk_family,
2828                         af_family_kern_clock_key_strings[sk->sk_family]);
2829         else
2830                 lockdep_set_class_and_name(
2831                         &sk->sk_callback_lock,
2832                         af_callback_keys + sk->sk_family,
2833                         af_family_clock_key_strings[sk->sk_family]);
2834
2835         sk->sk_state_change     =       sock_def_wakeup;
2836         sk->sk_data_ready       =       sock_def_readable;
2837         sk->sk_write_space      =       sock_def_write_space;
2838         sk->sk_error_report     =       sock_def_error_report;
2839         sk->sk_destruct         =       sock_def_destruct;
2840
2841         sk->sk_frag.page        =       NULL;
2842         sk->sk_frag.offset      =       0;
2843         sk->sk_peek_off         =       -1;
2844
2845         sk->sk_peer_pid         =       NULL;
2846         sk->sk_peer_cred        =       NULL;
2847         spin_lock_init(&sk->sk_peer_lock);
2848
2849         sk->sk_write_pending    =       0;
2850         sk->sk_rcvlowat         =       1;
2851         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2852         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2853
2854         sk->sk_stamp = SK_DEFAULT_STAMP;
2855 #if BITS_PER_LONG==32
2856         seqlock_init(&sk->sk_stamp_seq);
2857 #endif
2858         atomic_set(&sk->sk_zckey, 0);
2859
2860 #ifdef CONFIG_NET_RX_BUSY_POLL
2861         sk->sk_napi_id          =       0;
2862         sk->sk_ll_usec          =       sysctl_net_busy_read;
2863 #endif
2864
2865         sk->sk_max_pacing_rate = ~0U;
2866         sk->sk_pacing_rate = ~0U;
2867         sk->sk_pacing_shift = 10;
2868         sk->sk_incoming_cpu = -1;
2869
2870         sk_rx_queue_clear(sk);
2871         /*
2872          * Before updating sk_refcnt, we must commit prior changes to memory
2873          * (Documentation/RCU/rculist_nulls.txt for details)
2874          */
2875         smp_wmb();
2876         refcount_set(&sk->sk_refcnt, 1);
2877         atomic_set(&sk->sk_drops, 0);
2878 }
2879 EXPORT_SYMBOL(sock_init_data);
2880
2881 void lock_sock_nested(struct sock *sk, int subclass)
2882 {
2883         might_sleep();
2884         spin_lock_bh(&sk->sk_lock.slock);
2885         if (sk->sk_lock.owned)
2886                 __lock_sock(sk);
2887         sk->sk_lock.owned = 1;
2888         spin_unlock(&sk->sk_lock.slock);
2889         /*
2890          * The sk_lock has mutex_lock() semantics here:
2891          */
2892         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2893         local_bh_enable();
2894 }
2895 EXPORT_SYMBOL(lock_sock_nested);
2896
2897 void release_sock(struct sock *sk)
2898 {
2899         spin_lock_bh(&sk->sk_lock.slock);
2900         if (sk->sk_backlog.tail)
2901                 __release_sock(sk);
2902
2903         /* Warning : release_cb() might need to release sk ownership,
2904          * ie call sock_release_ownership(sk) before us.
2905          */
2906         if (sk->sk_prot->release_cb)
2907                 sk->sk_prot->release_cb(sk);
2908
2909         sock_release_ownership(sk);
2910         if (waitqueue_active(&sk->sk_lock.wq))
2911                 wake_up(&sk->sk_lock.wq);
2912         spin_unlock_bh(&sk->sk_lock.slock);
2913 }
2914 EXPORT_SYMBOL(release_sock);
2915
2916 /**
2917  * lock_sock_fast - fast version of lock_sock
2918  * @sk: socket
2919  *
2920  * This version should be used for very small section, where process wont block
2921  * return false if fast path is taken:
2922  *
2923  *   sk_lock.slock locked, owned = 0, BH disabled
2924  *
2925  * return true if slow path is taken:
2926  *
2927  *   sk_lock.slock unlocked, owned = 1, BH enabled
2928  */
2929 bool lock_sock_fast(struct sock *sk)
2930 {
2931         might_sleep();
2932         spin_lock_bh(&sk->sk_lock.slock);
2933
2934         if (!sk->sk_lock.owned)
2935                 /*
2936                  * Note : We must disable BH
2937                  */
2938                 return false;
2939
2940         __lock_sock(sk);
2941         sk->sk_lock.owned = 1;
2942         spin_unlock(&sk->sk_lock.slock);
2943         /*
2944          * The sk_lock has mutex_lock() semantics here:
2945          */
2946         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2947         local_bh_enable();
2948         return true;
2949 }
2950 EXPORT_SYMBOL(lock_sock_fast);
2951
2952 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2953 {
2954         struct timeval tv;
2955
2956         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2957         tv = ktime_to_timeval(sock_read_timestamp(sk));
2958         if (tv.tv_sec == -1)
2959                 return -ENOENT;
2960         if (tv.tv_sec == 0) {
2961                 ktime_t kt = ktime_get_real();
2962                 sock_write_timestamp(sk, kt);
2963                 tv = ktime_to_timeval(kt);
2964         }
2965         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2966 }
2967 EXPORT_SYMBOL(sock_get_timestamp);
2968
2969 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2970 {
2971         struct timespec ts;
2972
2973         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2974         ts = ktime_to_timespec(sock_read_timestamp(sk));
2975         if (ts.tv_sec == -1)
2976                 return -ENOENT;
2977         if (ts.tv_sec == 0) {
2978                 ktime_t kt = ktime_get_real();
2979                 sock_write_timestamp(sk, kt);
2980                 ts = ktime_to_timespec(sk->sk_stamp);
2981         }
2982         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2983 }
2984 EXPORT_SYMBOL(sock_get_timestampns);
2985
2986 void sock_enable_timestamp(struct sock *sk, int flag)
2987 {
2988         if (!sock_flag(sk, flag)) {
2989                 unsigned long previous_flags = sk->sk_flags;
2990
2991                 sock_set_flag(sk, flag);
2992                 /*
2993                  * we just set one of the two flags which require net
2994                  * time stamping, but time stamping might have been on
2995                  * already because of the other one
2996                  */
2997                 if (sock_needs_netstamp(sk) &&
2998                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2999                         net_enable_timestamp();
3000         }
3001 }
3002
3003 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3004                        int level, int type)
3005 {
3006         struct sock_exterr_skb *serr;
3007         struct sk_buff *skb;
3008         int copied, err;
3009
3010         err = -EAGAIN;
3011         skb = sock_dequeue_err_skb(sk);
3012         if (skb == NULL)
3013                 goto out;
3014
3015         copied = skb->len;
3016         if (copied > len) {
3017                 msg->msg_flags |= MSG_TRUNC;
3018                 copied = len;
3019         }
3020         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3021         if (err)
3022                 goto out_free_skb;
3023
3024         sock_recv_timestamp(msg, sk, skb);
3025
3026         serr = SKB_EXT_ERR(skb);
3027         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3028
3029         msg->msg_flags |= MSG_ERRQUEUE;
3030         err = copied;
3031
3032 out_free_skb:
3033         kfree_skb(skb);
3034 out:
3035         return err;
3036 }
3037 EXPORT_SYMBOL(sock_recv_errqueue);
3038
3039 /*
3040  *      Get a socket option on an socket.
3041  *
3042  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3043  *      asynchronous errors should be reported by getsockopt. We assume
3044  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3045  */
3046 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3047                            char __user *optval, int __user *optlen)
3048 {
3049         struct sock *sk = sock->sk;
3050
3051         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3052 }
3053 EXPORT_SYMBOL(sock_common_getsockopt);
3054
3055 #ifdef CONFIG_COMPAT
3056 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3057                                   char __user *optval, int __user *optlen)
3058 {
3059         struct sock *sk = sock->sk;
3060
3061         if (sk->sk_prot->compat_getsockopt != NULL)
3062                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3063                                                       optval, optlen);
3064         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3065 }
3066 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3067 #endif
3068
3069 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3070                         int flags)
3071 {
3072         struct sock *sk = sock->sk;
3073         int addr_len = 0;
3074         int err;
3075
3076         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3077                                    flags & ~MSG_DONTWAIT, &addr_len);
3078         if (err >= 0)
3079                 msg->msg_namelen = addr_len;
3080         return err;
3081 }
3082 EXPORT_SYMBOL(sock_common_recvmsg);
3083
3084 /*
3085  *      Set socket options on an inet socket.
3086  */
3087 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3088                            char __user *optval, unsigned int optlen)
3089 {
3090         struct sock *sk = sock->sk;
3091
3092         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3093 }
3094 EXPORT_SYMBOL(sock_common_setsockopt);
3095
3096 #ifdef CONFIG_COMPAT
3097 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3098                                   char __user *optval, unsigned int optlen)
3099 {
3100         struct sock *sk = sock->sk;
3101
3102         if (sk->sk_prot->compat_setsockopt != NULL)
3103                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3104                                                       optval, optlen);
3105         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3106 }
3107 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3108 #endif
3109
3110 void sk_common_release(struct sock *sk)
3111 {
3112         if (sk->sk_prot->destroy)
3113                 sk->sk_prot->destroy(sk);
3114
3115         /*
3116          * Observation: when sock_common_release is called, processes have
3117          * no access to socket. But net still has.
3118          * Step one, detach it from networking:
3119          *
3120          * A. Remove from hash tables.
3121          */
3122
3123         sk->sk_prot->unhash(sk);
3124
3125         /*
3126          * In this point socket cannot receive new packets, but it is possible
3127          * that some packets are in flight because some CPU runs receiver and
3128          * did hash table lookup before we unhashed socket. They will achieve
3129          * receive queue and will be purged by socket destructor.
3130          *
3131          * Also we still have packets pending on receive queue and probably,
3132          * our own packets waiting in device queues. sock_destroy will drain
3133          * receive queue, but transmitted packets will delay socket destruction
3134          * until the last reference will be released.
3135          */
3136
3137         sock_orphan(sk);
3138
3139         xfrm_sk_free_policy(sk);
3140
3141         sk_refcnt_debug_release(sk);
3142
3143         sock_put(sk);
3144 }
3145 EXPORT_SYMBOL(sk_common_release);
3146
3147 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3148 {
3149         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3150
3151         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3152         mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3153         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3154         mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3155         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3156         mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3157         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3158         mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3159         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3160 }
3161
3162 #ifdef CONFIG_PROC_FS
3163 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
3164 struct prot_inuse {
3165         int val[PROTO_INUSE_NR];
3166 };
3167
3168 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3169
3170 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3171 {
3172         __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3173 }
3174 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3175
3176 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3177 {
3178         int cpu, idx = prot->inuse_idx;
3179         int res = 0;
3180
3181         for_each_possible_cpu(cpu)
3182                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3183
3184         return res >= 0 ? res : 0;
3185 }
3186 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3187
3188 static void sock_inuse_add(struct net *net, int val)
3189 {
3190         this_cpu_add(*net->core.sock_inuse, val);
3191 }
3192
3193 int sock_inuse_get(struct net *net)
3194 {
3195         int cpu, res = 0;
3196
3197         for_each_possible_cpu(cpu)
3198                 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3199
3200         return res;
3201 }
3202
3203 EXPORT_SYMBOL_GPL(sock_inuse_get);
3204
3205 static int __net_init sock_inuse_init_net(struct net *net)
3206 {
3207         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3208         if (net->core.prot_inuse == NULL)
3209                 return -ENOMEM;
3210
3211         net->core.sock_inuse = alloc_percpu(int);
3212         if (net->core.sock_inuse == NULL)
3213                 goto out;
3214
3215         return 0;
3216
3217 out:
3218         free_percpu(net->core.prot_inuse);
3219         return -ENOMEM;
3220 }
3221
3222 static void __net_exit sock_inuse_exit_net(struct net *net)
3223 {
3224         free_percpu(net->core.prot_inuse);
3225         free_percpu(net->core.sock_inuse);
3226 }
3227
3228 static struct pernet_operations net_inuse_ops = {
3229         .init = sock_inuse_init_net,
3230         .exit = sock_inuse_exit_net,
3231 };
3232
3233 static __init int net_inuse_init(void)
3234 {
3235         if (register_pernet_subsys(&net_inuse_ops))
3236                 panic("Cannot initialize net inuse counters");
3237
3238         return 0;
3239 }
3240
3241 core_initcall(net_inuse_init);
3242
3243 static void assign_proto_idx(struct proto *prot)
3244 {
3245         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3246
3247         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3248                 pr_err("PROTO_INUSE_NR exhausted\n");
3249                 return;
3250         }
3251
3252         set_bit(prot->inuse_idx, proto_inuse_idx);
3253 }
3254
3255 static void release_proto_idx(struct proto *prot)
3256 {
3257         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3258                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3259 }
3260 #else
/* No-op variant used when CONFIG_PROC_FS is not set. */
static inline void assign_proto_idx(struct proto *prot)
{
}
3264
/* No-op variant used when CONFIG_PROC_FS is not set. */
static inline void release_proto_idx(struct proto *prot)
{
}
3268
/* No-op variant used when CONFIG_PROC_FS is not set. */
static void sock_inuse_add(struct net *net, int val)
{
}
3272 #endif
3273
3274 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3275 {
3276         if (!rsk_prot)
3277                 return;
3278         kfree(rsk_prot->slab_name);
3279         rsk_prot->slab_name = NULL;
3280         kmem_cache_destroy(rsk_prot->slab);
3281         rsk_prot->slab = NULL;
3282 }
3283
3284 static int req_prot_init(const struct proto *prot)
3285 {
3286         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3287
3288         if (!rsk_prot)
3289                 return 0;
3290
3291         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3292                                         prot->name);
3293         if (!rsk_prot->slab_name)
3294                 return -ENOMEM;
3295
3296         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3297                                            rsk_prot->obj_size, 0,
3298                                            SLAB_ACCOUNT | prot->slab_flags,
3299                                            NULL);
3300
3301         if (!rsk_prot->slab) {
3302                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3303                         prot->name);
3304                 return -ENOMEM;
3305         }
3306         return 0;
3307 }
3308
3309 int proto_register(struct proto *prot, int alloc_slab)
3310 {
3311         if (alloc_slab) {
3312                 prot->slab = kmem_cache_create_usercopy(prot->name,
3313                                         prot->obj_size, 0,
3314                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3315                                         prot->slab_flags,
3316                                         prot->useroffset, prot->usersize,
3317                                         NULL);
3318
3319                 if (prot->slab == NULL) {
3320                         pr_crit("%s: Can't create sock SLAB cache!\n",
3321                                 prot->name);
3322                         goto out;
3323                 }
3324
3325                 if (req_prot_init(prot))
3326                         goto out_free_request_sock_slab;
3327
3328                 if (prot->twsk_prot != NULL) {
3329                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3330
3331                         if (prot->twsk_prot->twsk_slab_name == NULL)
3332                                 goto out_free_request_sock_slab;
3333
3334                         prot->twsk_prot->twsk_slab =
3335                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3336                                                   prot->twsk_prot->twsk_obj_size,
3337                                                   0,
3338                                                   SLAB_ACCOUNT |
3339                                                   prot->slab_flags,
3340                                                   NULL);
3341                         if (prot->twsk_prot->twsk_slab == NULL)
3342                                 goto out_free_timewait_sock_slab_name;
3343                 }
3344         }
3345
3346         mutex_lock(&proto_list_mutex);
3347         list_add(&prot->node, &proto_list);
3348         assign_proto_idx(prot);
3349         mutex_unlock(&proto_list_mutex);
3350         return 0;
3351
3352 out_free_timewait_sock_slab_name:
3353         kfree(prot->twsk_prot->twsk_slab_name);
3354 out_free_request_sock_slab:
3355         req_prot_cleanup(prot->rsk_prot);
3356
3357         kmem_cache_destroy(prot->slab);
3358         prot->slab = NULL;
3359 out:
3360         return -ENOBUFS;
3361 }
3362 EXPORT_SYMBOL(proto_register);
3363
3364 void proto_unregister(struct proto *prot)
3365 {
3366         mutex_lock(&proto_list_mutex);
3367         release_proto_idx(prot);
3368         list_del(&prot->node);
3369         mutex_unlock(&proto_list_mutex);
3370
3371         kmem_cache_destroy(prot->slab);
3372         prot->slab = NULL;
3373
3374         req_prot_cleanup(prot->rsk_prot);
3375
3376         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3377                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3378                 kfree(prot->twsk_prot->twsk_slab_name);
3379                 prot->twsk_prot->twsk_slab = NULL;
3380         }
3381 }
3382 EXPORT_SYMBOL(proto_unregister);
3383
3384 int sock_load_diag_module(int family, int protocol)
3385 {
3386         if (!protocol) {
3387                 if (!sock_is_registered(family))
3388                         return -ENOENT;
3389
3390                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3391                                       NETLINK_SOCK_DIAG, family);
3392         }
3393
3394 #ifdef CONFIG_INET
3395         if (family == AF_INET &&
3396             protocol != IPPROTO_RAW &&
3397             !rcu_access_pointer(inet_protos[protocol]))
3398                 return -ENOENT;
3399 #endif
3400
3401         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3402                               NETLINK_SOCK_DIAG, family, protocol);
3403 }
3404 EXPORT_SYMBOL(sock_load_diag_module);
3405
3406 #ifdef CONFIG_PROC_FS
3407 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3408         __acquires(proto_list_mutex)
3409 {
3410         mutex_lock(&proto_list_mutex);
3411         return seq_list_start_head(&proto_list, *pos);
3412 }
3413
3414 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3415 {
3416         return seq_list_next(v, &proto_list, pos);
3417 }
3418
3419 static void proto_seq_stop(struct seq_file *seq, void *v)
3420         __releases(proto_list_mutex)
3421 {
3422         mutex_unlock(&proto_list_mutex);
3423 }
3424
/* Return 'y' when @method is a non-NULL pointer, 'n' when it is NULL. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
3429 static long sock_prot_memory_allocated(struct proto *proto)
3430 {
3431         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3432 }
3433
3434 static char *sock_prot_memory_pressure(struct proto *proto)
3435 {
3436         return proto->memory_pressure != NULL ?
3437         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3438 }
3439
3440 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3441 {
3442
3443         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3444                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3445                    proto->name,
3446                    proto->obj_size,
3447                    sock_prot_inuse_get(seq_file_net(seq), proto),
3448                    sock_prot_memory_allocated(proto),
3449                    sock_prot_memory_pressure(proto),
3450                    proto->max_header,
3451                    proto->slab == NULL ? "no" : "yes",
3452                    module_name(proto->owner),
3453                    proto_method_implemented(proto->close),
3454                    proto_method_implemented(proto->connect),
3455                    proto_method_implemented(proto->disconnect),
3456                    proto_method_implemented(proto->accept),
3457                    proto_method_implemented(proto->ioctl),
3458                    proto_method_implemented(proto->init),
3459                    proto_method_implemented(proto->destroy),
3460                    proto_method_implemented(proto->shutdown),
3461                    proto_method_implemented(proto->setsockopt),
3462                    proto_method_implemented(proto->getsockopt),
3463                    proto_method_implemented(proto->sendmsg),
3464                    proto_method_implemented(proto->recvmsg),
3465                    proto_method_implemented(proto->sendpage),
3466                    proto_method_implemented(proto->bind),
3467                    proto_method_implemented(proto->backlog_rcv),
3468                    proto_method_implemented(proto->hash),
3469                    proto_method_implemented(proto->unhash),
3470                    proto_method_implemented(proto->get_port),
3471                    proto_method_implemented(proto->enter_memory_pressure));
3472 }
3473
3474 static int proto_seq_show(struct seq_file *seq, void *v)
3475 {
3476         if (v == &proto_list)
3477                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3478                            "protocol",
3479                            "size",
3480                            "sockets",
3481                            "memory",
3482                            "press",
3483                            "maxhdr",
3484                            "slab",
3485                            "module",
3486                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3487         else
3488                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3489         return 0;
3490 }
3491
3492 static const struct seq_operations proto_seq_ops = {
3493         .start  = proto_seq_start,
3494         .next   = proto_seq_next,
3495         .stop   = proto_seq_stop,
3496         .show   = proto_seq_show,
3497 };
3498
3499 static __net_init int proto_init_net(struct net *net)
3500 {
3501         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3502                         sizeof(struct seq_net_private)))
3503                 return -ENOMEM;
3504
3505         return 0;
3506 }
3507
3508 static __net_exit void proto_exit_net(struct net *net)
3509 {
3510         remove_proc_entry("protocols", net->proc_net);
3511 }
3512
3513
3514 static __net_initdata struct pernet_operations proto_net_ops = {
3515         .init = proto_init_net,
3516         .exit = proto_exit_net,
3517 };
3518
3519 static int __init proto_init(void)
3520 {
3521         return register_pernet_subsys(&proto_net_ops);
3522 }
3523
3524 subsys_initcall(proto_init);
3525
3526 #endif /* PROC_FS */
3527
3528 #ifdef CONFIG_NET_RX_BUSY_POLL
3529 bool sk_busy_loop_end(void *p, unsigned long start_time)
3530 {
3531         struct sock *sk = p;
3532
3533         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3534                sk_busy_loop_timeout(sk, start_time);
3535 }
3536 EXPORT_SYMBOL(sk_busy_loop_end);
3537 #endif /* CONFIG_NET_RX_BUSY_POLL */