net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <asm/unaligned.h>
  95 #include <linux/capability.h>
  96 #include <linux/errno.h>
  97 #include <linux/errqueue.h>
  98 #include <linux/types.h>
  99 #include <linux/socket.h>
 100 #include <linux/in.h>
 101 #include <linux/kernel.h>
 102 #include <linux/module.h>
 103 #include <linux/proc_fs.h>
 104 #include <linux/seq_file.h>
 105 #include <linux/sched.h>
 106 #include <linux/sched/mm.h>
 107 #include <linux/timer.h>
 108 #include <linux/string.h>
 109 #include <linux/sockios.h>
 110 #include <linux/net.h>
 111 #include <linux/mm.h>
 112 #include <linux/slab.h>
 113 #include <linux/interrupt.h>
 114 #include <linux/poll.h>
 115 #include <linux/tcp.h>
 116 #include <linux/init.h>
 117 #include <linux/highmem.h>
 118 #include <linux/user_namespace.h>
 119 #include <linux/static_key.h>
 120 #include <linux/memcontrol.h>
 121 #include <linux/prefetch.h>
 122
 123 #include <linux/uaccess.h>
 124
 125 #include <linux/netdevice.h>
 126 #include <net/protocol.h>
 127 #include <linux/skbuff.h>
 128 #include <net/net_namespace.h>
 129 #include <net/request_sock.h>
 130 #include <net/sock.h>
 131 #include <linux/net_tstamp.h>
 132 #include <net/xfrm.h>
 133 #include <linux/ipsec.h>
 134 #include <net/cls_cgroup.h>
 135 #include <net/netprio_cgroup.h>
 136 #include <linux/sock_diag.h>
 137
 138 #include <linux/filter.h>
 139 #include <net/sock_reuseport.h>
 140
 141 #include <trace/events/sock.h>
 142
 143 #include <net/tcp.h>
 144 #include <net/busy_poll.h>
 145
 /* File-local protocol list and the mutex declared alongside it. */
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);

 /* Forward declaration; the definition is not in this part of the file. */
 static void sock_inuse_add(struct net *net, int val);
 150
 151 /**
 152  * sk_ns_capable - General socket capability test
 153  * @sk: Socket to use a capability on or through
 154  * @user_ns: The user namespace of the capability to use
 155  * @cap: The capability to use
 156  *
 157  * Test to see if the opener of the socket had when the socket was
 158  * created and the current process has the capability @cap in the user
 159  * namespace @user_ns.
 160  */
 161 bool sk_ns_capable(const struct sock *sk,
 162                    struct user_namespace *user_ns, int cap)
 163 {
 164         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 165                 ns_capable(user_ns, cap);
 166 }
 167 EXPORT_SYMBOL(sk_ns_capable);
 168
 169 /**
 170  * sk_capable - Socket global capability test
 171  * @sk: Socket to use a capability on or through
 172  * @cap: The global capability to use
 173  *
 174  * Test to see if the opener of the socket had when the socket was
 175  * created and the current process has the capability @cap in all user
 176  * namespaces.
 177  */
 178 bool sk_capable(const struct sock *sk, int cap)
 179 {
 180         return sk_ns_capable(sk, &init_user_ns, cap);
 181 }
 182 EXPORT_SYMBOL(sk_capable);
 183
 184 /**
 185  * sk_net_capable - Network namespace socket capability test
 186  * @sk: Socket to use a capability on or through
 187  * @cap: The capability to use
 188  *
 189  * Test to see if the opener of the socket had when the socket was created
 190  * and the current process has the capability @cap over the network namespace
 191  * the socket is a member of.
 192  */
 193 bool sk_net_capable(const struct sock *sk, int cap)
 194 {
 195         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 196 }
 197 EXPORT_SYMBOL(sk_net_capable);
 198
 199 /*
 200  * Each address family might have different locking rules, so we have
 201  * one slock key per address family and separate keys for internal and
 202  * userspace sockets.
 203  */
 204 static struct lock_class_key af_family_keys[AF_MAX];
 205 static struct lock_class_key af_family_kern_keys[AF_MAX];
 206 static struct lock_class_key af_family_slock_keys[AF_MAX];
 207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 208
 209 /*
 210  * Make lock validator output more readable. (we pre-construct these
 211  * strings build-time, so that runtime initialization of socket
 212  * locks is fast):
 213  */
 214
 215 #define _sock_locks(x)                                            \
 216   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 217   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 218   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 219   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 220   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 221   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 222   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 223   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 224   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 225   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 226   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 227   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 228   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 229   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 230   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 231   x "AF_MAX"
 232
 233 static const char *const af_family_key_strings[AF_MAX+1] = {
 234         _sock_locks("sk_lock-")
 235 };
 236 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 237         _sock_locks("slock-")
 238 };
 239 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 240         _sock_locks("clock-")
 241 };
 242
 243 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 244         _sock_locks("k-sk_lock-")
 245 };
 246 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 247         _sock_locks("k-slock-")
 248 };
 249 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 250         _sock_locks("k-clock-")
 251 };
 252 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 253         _sock_locks("rlock-")
 254 };
 255 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 256         _sock_locks("wlock-")
 257 };
 258 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 259         _sock_locks("elock-")
 260 };
 261
 262 /*
 263  * sk_callback_lock and sk queues locking rules are per-address-family,
 264  * so split the lock classes by using a per-AF key:
 265  */
 266 static struct lock_class_key af_callback_keys[AF_MAX];
 267 static struct lock_class_key af_rlock_keys[AF_MAX];
 268 static struct lock_class_key af_wlock_keys[AF_MAX];
 269 static struct lock_class_key af_elock_keys[AF_MAX];
 270 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 271
 272 /* Run time adjustable parameters. */
 273 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 274 EXPORT_SYMBOL(sysctl_wmem_max);
 275 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 276 EXPORT_SYMBOL(sysctl_rmem_max);
 277 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 278 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 279
 280 /* Maximal space eaten by iovec or ancillary data plus some space */
 281 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 282 EXPORT_SYMBOL(sysctl_optmem_max);
 283
 284 int sysctl_tstamp_allow_data __read_mostly = 1;
 285
 /*
  * Static key, initially false; incremented by sk_set_memalloc() and
  * decremented by sk_clear_memalloc().
  */
 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 288
 289 /**
 290  * sk_set_memalloc - sets %SOCK_MEMALLOC
 291  * @sk: socket to set it on
 292  *
 293  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 294  * It's the responsibility of the admin to adjust min_free_kbytes
 295  * to meet the requirements
 296  */
 297 void sk_set_memalloc(struct sock *sk)
 298 {
 299         sock_set_flag(sk, SOCK_MEMALLOC);
 300         sk->sk_allocation |= __GFP_MEMALLOC;
 301         static_branch_inc(&memalloc_socks_key);
 302 }
 303 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 304
 305 void sk_clear_memalloc(struct sock *sk)
 306 {
 307         sock_reset_flag(sk, SOCK_MEMALLOC);
 308         sk->sk_allocation &= ~__GFP_MEMALLOC;
 309         static_branch_dec(&memalloc_socks_key);
 310
 311         /*
 312          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 313          * progress of swapping. SOCK_MEMALLOC may be cleared while
 314          * it has rmem allocations due to the last swapfile being deactivated
 315          * but there is a risk that the socket is unusable due to exceeding
 316          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 317          */
 318         sk_mem_reclaim(sk);
 319 }
 320 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 321
 322 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 323 {
 324         int ret;
 325         unsigned int noreclaim_flag;
 326
 327         /* these should have been dropped before queueing */
 328         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 329
 330         noreclaim_flag = memalloc_noreclaim_save();
 331         ret = sk->sk_backlog_rcv(sk, skb);
 332         memalloc_noreclaim_restore(noreclaim_flag);
 333
 334         return ret;
 335 }
 336 EXPORT_SYMBOL(__sk_backlog_rcv);
 337
 338 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 339 {
 340         struct timeval tv;
 341
 342         if (optlen < sizeof(tv))
 343                 return -EINVAL;
 344         if (copy_from_user(&tv, optval, sizeof(tv)))
 345                 return -EFAULT;
 346         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 347                 return -EDOM;
 348
 349         if (tv.tv_sec < 0) {
 350                 static int warned __read_mostly;
 351
 352                 *timeo_p = 0;
 353                 if (warned < 10 && net_ratelimit()) {
 354                         warned++;
 355                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 356                                 __func__, current->comm, task_pid_nr(current));
 357                 }
 358                 return 0;
 359         }
 360         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 361         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 362                 return 0;
 363         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 364                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 365         return 0;
 366 }
 367
 368 static void sock_warn_obsolete_bsdism(const char *name)
 369 {
 370         static int warned;
 371         static char warncomm[TASK_COMM_LEN];
 372         if (strcmp(warncomm, current->comm) && warned < 5) {
 373                 strcpy(warncomm,  current->comm);
 374                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 375                         warncomm, name);
 376                 warned++;
 377         }
 378 }
 379
 380 static bool sock_needs_netstamp(const struct sock *sk)
 381 {
 382         switch (sk->sk_family) {
 383         case AF_UNSPEC:
 384         case AF_UNIX:
 385                 return false;
 386         default:
 387                 return true;
 388         }
 389 }
 390
 391 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 392 {
 393         if (sk->sk_flags & flags) {
 394                 sk->sk_flags &= ~flags;
 395                 if (sock_needs_netstamp(sk) &&
 396                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 397                         net_disable_timestamp();
 398         }
 399 }
 400
 401
 402 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 403 {
 404         unsigned long flags;
 405         struct sk_buff_head *list = &sk->sk_receive_queue;
 406
 407         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 408                 atomic_inc(&sk->sk_drops);
 409                 trace_sock_rcvqueue_full(sk, skb);
 410                 return -ENOMEM;
 411         }
 412
 413         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 414                 atomic_inc(&sk->sk_drops);
 415                 return -ENOBUFS;
 416         }
 417
 418         skb->dev = NULL;
 419         skb_set_owner_r(skb, sk);
 420
 421         /* we escape from rcu protected region, make sure we dont leak
 422          * a norefcounted dst
 423          */
 424         skb_dst_force(skb);
 425
 426         spin_lock_irqsave(&list->lock, flags);
 427         sock_skb_set_dropcount(sk, skb);
 428         __skb_queue_tail(list, skb);
 429         spin_unlock_irqrestore(&list->lock, flags);
 430
 431         if (!sock_flag(sk, SOCK_DEAD))
 432                 sk->sk_data_ready(sk);
 433         return 0;
 434 }
 435 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 436
 /*
  * Run @skb through sk_filter() and, if that returns 0, queue it with
  * __sock_queue_rcv_skb().  A non-zero filter result is returned as is.
  */
 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
         int err = sk_filter(sk, skb);

         return err ? err : __sock_queue_rcv_skb(sk, skb);
 }
 EXPORT_SYMBOL(sock_queue_rcv_skb);
 448
 449 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 450                      const int nested, unsigned int trim_cap, bool refcounted)
 451 {
 452         int rc = NET_RX_SUCCESS;
 453
 454         if (sk_filter_trim_cap(sk, skb, trim_cap))
 455                 goto discard_and_relse;
 456
 457         skb->dev = NULL;
 458
 459         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 460                 atomic_inc(&sk->sk_drops);
 461                 goto discard_and_relse;
 462         }
 463         if (nested)
 464                 bh_lock_sock_nested(sk);
 465         else
 466                 bh_lock_sock(sk);
 467         if (!sock_owned_by_user(sk)) {
 468                 /*
 469                  * trylock + unlock semantics:
 470                  */
 471                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 472
 473                 rc = sk_backlog_rcv(sk, skb);
 474
 475                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 476         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 477                 bh_unlock_sock(sk);
 478                 atomic_inc(&sk->sk_drops);
 479                 goto discard_and_relse;
 480         }
 481
 482         bh_unlock_sock(sk);
 483 out:
 484         if (refcounted)
 485                 sock_put(sk);
 486         return rc;
 487 discard_and_relse:
 488         kfree_skb(skb);
 489         goto out;
 490 }
 491 EXPORT_SYMBOL(__sk_receive_skb);
 492
 493 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 494 {
 495         struct dst_entry *dst = __sk_dst_get(sk);
 496
 497         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 498                 sk_tx_queue_clear(sk);
 499                 sk->sk_dst_pending_confirm = 0;
 500                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 501                 dst_release(dst);
 502                 return NULL;
 503         }
 504
 505         return dst;
 506 }
 507 EXPORT_SYMBOL(__sk_dst_check);
 508
 509 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 510 {
 511         struct dst_entry *dst = sk_dst_get(sk);
 512
 513         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 514                 sk_dst_reset(sk);
 515                 dst_release(dst);
 516                 return NULL;
 517         }
 518
 519         return dst;
 520 }
 521 EXPORT_SYMBOL(sk_dst_check);
 522
 523 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 524                                 int optlen)
 525 {
 526         int ret = -ENOPROTOOPT;
 527 #ifdef CONFIG_NETDEVICES
 528         struct net *net = sock_net(sk);
 529         char devname[IFNAMSIZ];
 530         int index;
 531
 532         /* Sorry... */
 533         ret = -EPERM;
 534         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 535                 goto out;
 536
 537         ret = -EINVAL;
 538         if (optlen < 0)
 539                 goto out;
 540
 541         /* Bind this socket to a particular device like "eth0",
 542          * as specified in the passed interface name. If the
 543          * name is "" or the option length is zero the socket
 544          * is not bound.
 545          */
 546         if (optlen > IFNAMSIZ - 1)
 547                 optlen = IFNAMSIZ - 1;
 548         memset(devname, 0, sizeof(devname));
 549
 550         ret = -EFAULT;
 551         if (copy_from_user(devname, optval, optlen))
 552                 goto out;
 553
 554         index = 0;
 555         if (devname[0] != '\0') {
 556                 struct net_device *dev;
 557
 558                 rcu_read_lock();
 559                 dev = dev_get_by_name_rcu(net, devname);
 560                 if (dev)
 561                         index = dev->ifindex;
 562                 rcu_read_unlock();
 563                 ret = -ENODEV;
 564                 if (!dev)
 565                         goto out;
 566         }
 567
 568         lock_sock(sk);
 569         sk->sk_bound_dev_if = index;
 570         sk_dst_reset(sk);
 571         release_sock(sk);
 572
 573         ret = 0;
 574
 575 out:
 576 #endif
 577
 578         return ret;
 579 }
 580
 581 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 582                                 int __user *optlen, int len)
 583 {
 584         int ret = -ENOPROTOOPT;
 585 #ifdef CONFIG_NETDEVICES
 586         struct net *net = sock_net(sk);
 587         char devname[IFNAMSIZ];
 588
 589         if (sk->sk_bound_dev_if == 0) {
 590                 len = 0;
 591                 goto zero;
 592         }
 593
 594         ret = -EINVAL;
 595         if (len < IFNAMSIZ)
 596                 goto out;
 597
 598         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 599         if (ret)
 600                 goto out;
 601
 602         len = strlen(devname) + 1;
 603
 604         ret = -EFAULT;
 605         if (copy_to_user(optval, devname, len))
 606                 goto out;
 607
 608 zero:
 609         ret = -EFAULT;
 610         if (put_user(len, optlen))
 611                 goto out;
 612
 613         ret = 0;
 614
 615 out:
 616 #endif
 617
 618         return ret;
 619 }
 620
 /* Set flag @bit on @sk when @valbool is non-zero, clear it otherwise. */
 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 {
         if (!valbool) {
                 sock_reset_flag(sk, bit);
                 return;
         }

         sock_set_flag(sk, bit);
 }
 628
 629 bool sk_mc_loop(struct sock *sk)
 630 {
 631         if (dev_recursion_level())
 632                 return false;
 633         if (!sk)
 634                 return true;
 635         switch (sk->sk_family) {
 636         case AF_INET:
 637                 return inet_sk(sk)->mc_loop;
 638 #if IS_ENABLED(CONFIG_IPV6)
 639         case AF_INET6:
 640                 return inet6_sk(sk)->mc_loop;
 641 #endif
 642         }
 643         WARN_ON_ONCE(1);
 644         return true;
 645 }
 646 EXPORT_SYMBOL(sk_mc_loop);
 647
 648 /*
 649  *      This is meant for all protocols to use and covers goings on
 650  *      at the socket level. Everything here is generic.
 651  */
 652
 653 int sock_setsockopt(struct socket *sock, int level, int optname,
 654                     char __user *optval, unsigned int optlen)
 655 {
 656         struct sock_txtime sk_txtime;
 657         struct sock *sk = sock->sk;
 658         int val;
 659         int valbool;
 660         struct linger ling;
 661         int ret = 0;
 662
 663         /*
 664          *      Options without arguments
 665          */
 666
 667         if (optname == SO_BINDTODEVICE)
 668                 return sock_setbindtodevice(sk, optval, optlen);
 669
 670         if (optlen < sizeof(int))
 671                 return -EINVAL;
 672
 673         if (get_user(val, (int __user *)optval))
 674                 return -EFAULT;
 675
 676         valbool = val ? 1 : 0;
 677
 678         lock_sock(sk);
 679
 680         switch (optname) {
 681         case SO_DEBUG:
 682                 if (val && !capable(CAP_NET_ADMIN))
 683                         ret = -EACCES;
 684                 else
 685                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 686                 break;
 687         case SO_REUSEADDR:
 688                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 689                 break;
 690         case SO_REUSEPORT:
 691                 sk->sk_reuseport = valbool;
 692                 break;
 693         case SO_TYPE:
 694         case SO_PROTOCOL:
 695         case SO_DOMAIN:
 696         case SO_ERROR:
 697                 ret = -ENOPROTOOPT;
 698                 break;
 699         case SO_DONTROUTE:
 700                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 701                 sk_dst_reset(sk);
 702                 break;
 703         case SO_BROADCAST:
 704                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 705                 break;
 706         case SO_SNDBUF:
 707                 /* Don't error on this BSD doesn't and if you think
 708                  * about it this is right. Otherwise apps have to
 709                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 710                  * are treated in BSD as hints
 711                  */
 712                 val = min_t(u32, val, sysctl_wmem_max);
 713 set_sndbuf:
 714                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 715                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 716                 /* Wake up sending tasks if we upped the value. */
 717                 sk->sk_write_space(sk);
 718                 break;
 719
 720         case SO_SNDBUFFORCE:
 721                 if (!capable(CAP_NET_ADMIN)) {
 722                         ret = -EPERM;
 723                         break;
 724                 }
 725                 goto set_sndbuf;
 726
 727         case SO_RCVBUF:
 728                 /* Don't error on this BSD doesn't and if you think
 729                  * about it this is right. Otherwise apps have to
 730                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 731                  * are treated in BSD as hints
 732                  */
 733                 val = min_t(u32, val, sysctl_rmem_max);
 734 set_rcvbuf:
 735                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 736                 /*
 737                  * We double it on the way in to account for
 738                  * "struct sk_buff" etc. overhead.   Applications
 739                  * assume that the SO_RCVBUF setting they make will
 740                  * allow that much actual data to be received on that
 741                  * socket.
 742                  *
 743                  * Applications are unaware that "struct sk_buff" and
 744                  * other overheads allocate from the receive buffer
 745                  * during socket buffer allocation.
 746                  *
 747                  * And after considering the possible alternatives,
 748                  * returning the value we actually used in getsockopt
 749                  * is the most desirable behavior.
 750                  */
 751                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 752                 break;
 753
 754         case SO_RCVBUFFORCE:
 755                 if (!capable(CAP_NET_ADMIN)) {
 756                         ret = -EPERM;
 757                         break;
 758                 }
 759                 goto set_rcvbuf;
 760
 761         case SO_KEEPALIVE:
 762                 if (sk->sk_prot->keepalive)
 763                         sk->sk_prot->keepalive(sk, valbool);
 764                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 765                 break;
 766
 767         case SO_OOBINLINE:
 768                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 769                 break;
 770
 771         case SO_NO_CHECK:
 772                 sk->sk_no_check_tx = valbool;
 773                 break;
 774
 775         case SO_PRIORITY:
 776                 if ((val >= 0 && val <= 6) ||
 777                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 778                         sk->sk_priority = val;
 779                 else
 780                         ret = -EPERM;
 781                 break;
 782
 783         case SO_LINGER:
 784                 if (optlen < sizeof(ling)) {
 785                         ret = -EINVAL;  /* 1003.1g */
 786                         break;
 787                 }
 788                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 789                         ret = -EFAULT;
 790                         break;
 791                 }
 792                 if (!ling.l_onoff)
 793                         sock_reset_flag(sk, SOCK_LINGER);
 794                 else {
 795 #if (BITS_PER_LONG == 32)
 796                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 797                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 798                         else
 799 #endif
 800                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 801                         sock_set_flag(sk, SOCK_LINGER);
 802                 }
 803                 break;
 804
 805         case SO_BSDCOMPAT:
 806                 sock_warn_obsolete_bsdism("setsockopt");
 807                 break;
 808
 809         case SO_PASSCRED:
 810                 if (valbool)
 811                         set_bit(SOCK_PASSCRED, &sock->flags);
 812                 else
 813                         clear_bit(SOCK_PASSCRED, &sock->flags);
 814                 break;
 815
 816         case SO_TIMESTAMP:
 817         case SO_TIMESTAMPNS:
 818                 if (valbool)  {
 819                         if (optname == SO_TIMESTAMP)
 820                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 821                         else
 822                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 823                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 824                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 825                 } else {
 826                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 827                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 828                 }
 829                 break;
 830
 831         case SO_TIMESTAMPING:
 832                 if (val & ~SOF_TIMESTAMPING_MASK) {
 833                         ret = -EINVAL;
 834                         break;
 835                 }
 836
 837                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 838                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 839                         if (sk->sk_protocol == IPPROTO_TCP &&
 840                             sk->sk_type == SOCK_STREAM) {
 841                                 if ((1 << sk->sk_state) &
 842                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 843                                         ret = -EINVAL;
 844                                         break;
 845                                 }
 846                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 847                         } else {
 848                                 sk->sk_tskey = 0;
 849                         }
 850                 }
 851
 852                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 853                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 854                         ret = -EINVAL;
 855                         break;
 856                 }
 857
 858                 sk->sk_tsflags = val;
 859                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 860                         sock_enable_timestamp(sk,
 861                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 862                 else
 863                         sock_disable_timestamp(sk,
 864                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 865                 break;
 866
 867         case SO_RCVLOWAT:
 868                 if (val < 0)
 869                         val = INT_MAX;
 870                 if (sock->ops->set_rcvlowat)
 871                         ret = sock->ops->set_rcvlowat(sk, val);
 872                 else
 873                         sk->sk_rcvlowat = val ? : 1;
 874                 break;
 875
 876         case SO_RCVTIMEO:
 877                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 878                 break;
 879
 880         case SO_SNDTIMEO:
 881                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 882                 break;
 883
 884         case SO_ATTACH_FILTER:
 885                 ret = -EINVAL;
 886                 if (optlen == sizeof(struct sock_fprog)) {
 887                         struct sock_fprog fprog;
 888
 889                         ret = -EFAULT;
 890                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 891                                 break;
 892
 893                         ret = sk_attach_filter(&fprog, sk);
 894                 }
 895                 break;
 896
 897         case SO_ATTACH_BPF:
 898                 ret = -EINVAL;
 899                 if (optlen == sizeof(u32)) {
 900                         u32 ufd;
 901
 902                         ret = -EFAULT;
 903                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 904                                 break;
 905
 906                         ret = sk_attach_bpf(ufd, sk);
 907                 }
 908                 break;
 909
 910         case SO_ATTACH_REUSEPORT_CBPF:
 911                 ret = -EINVAL;
 912                 if (optlen == sizeof(struct sock_fprog)) {
 913                         struct sock_fprog fprog;
 914
 915                         ret = -EFAULT;
 916                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 917                                 break;
 918
 919                         ret = sk_reuseport_attach_filter(&fprog, sk);
 920                 }
 921                 break;
 922
 923         case SO_ATTACH_REUSEPORT_EBPF:
 924                 ret = -EINVAL;
 925                 if (optlen == sizeof(u32)) {
 926                         u32 ufd;
 927
 928                         ret = -EFAULT;
 929                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 930                                 break;
 931
 932                         ret = sk_reuseport_attach_bpf(ufd, sk);
 933                 }
 934                 break;
 935
 936         case SO_DETACH_FILTER:
 937                 ret = sk_detach_filter(sk);
 938                 break;
 939
 940         case SO_LOCK_FILTER:
 941                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 942                         ret = -EPERM;
 943                 else
 944                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 945                 break;
 946
 947         case SO_PASSSEC:
 948                 if (valbool)
 949                         set_bit(SOCK_PASSSEC, &sock->flags);
 950                 else
 951                         clear_bit(SOCK_PASSSEC, &sock->flags);
 952                 break;
 953         case SO_MARK:
 954                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 955                         ret = -EPERM;
 956                 else
 957                         sk->sk_mark = val;
 958                 break;
 959
 960         case SO_RXQ_OVFL:
 961                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 962                 break;
 963
 964         case SO_WIFI_STATUS:
 965                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 966                 break;
 967
 968         case SO_PEEK_OFF:
 969                 if (sock->ops->set_peek_off)
 970                         ret = sock->ops->set_peek_off(sk, val);
 971                 else
 972                         ret = -EOPNOTSUPP;
 973                 break;
 974
 975         case SO_NOFCS:
 976                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 977                 break;
 978
 979         case SO_SELECT_ERR_QUEUE:
 980                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 981                 break;
 982
 983 #ifdef CONFIG_NET_RX_BUSY_POLL
 984         case SO_BUSY_POLL:
 985                 /* allow unprivileged users to decrease the value */
 986                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 987                         ret = -EPERM;
 988                 else {
 989                         if (val < 0)
 990                                 ret = -EINVAL;
 991                         else
 992                                 WRITE_ONCE(sk->sk_ll_usec, val);
 993                 }
 994                 break;
 995 #endif
 996
 997         case SO_MAX_PACING_RATE:
 998                 if (val != ~0U)
 999                         cmpxchg(&sk->sk_pacing_status,
1000                                 SK_PACING_NONE,
1001                                 SK_PACING_NEEDED);
1002                 sk->sk_max_pacing_rate = val;
1003                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1004                                          sk->sk_max_pacing_rate);
1005                 break;
1006
1007         case SO_INCOMING_CPU:
1008                 WRITE_ONCE(sk->sk_incoming_cpu, val);
1009                 break;
1010
1011         case SO_CNX_ADVICE:
1012                 if (val == 1)
1013                         dst_negative_advice(sk);
1014                 break;
1015
1016         case SO_ZEROCOPY:
1017                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1018                         if (sk->sk_protocol != IPPROTO_TCP)
1019                                 ret = -ENOTSUPP;
1020                 } else if (sk->sk_family != PF_RDS) {
1021                         ret = -ENOTSUPP;
1022                 }
1023                 if (!ret) {
1024                         if (val < 0 || val > 1)
1025                                 ret = -EINVAL;
1026                         else
1027                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1028                 }
1029                 break;
1030
1031         case SO_TXTIME:
1032                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1033                         ret = -EPERM;
1034                 } else if (optlen != sizeof(struct sock_txtime)) {
1035                         ret = -EINVAL;
1036                 } else if (copy_from_user(&sk_txtime, optval,
1037                            sizeof(struct sock_txtime))) {
1038                         ret = -EFAULT;
1039                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1040                         ret = -EINVAL;
1041                 } else {
1042                         sock_valbool_flag(sk, SOCK_TXTIME, true);
1043                         sk->sk_clockid = sk_txtime.clockid;
1044                         sk->sk_txtime_deadline_mode =
1045                                 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1046                         sk->sk_txtime_report_errors =
1047                                 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1048                 }
1049                 break;
1050
1051         default:
1052                 ret = -ENOPROTOOPT;
1053                 break;
1054         }
1055         release_sock(sk);
1056         return ret;
1057 }
1058 EXPORT_SYMBOL(sock_setsockopt);
1059
1060
1061 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1062                           struct ucred *ucred)
1063 {
1064         ucred->pid = pid_vnr(pid);
1065         ucred->uid = ucred->gid = -1;
1066         if (cred) {
1067                 struct user_namespace *current_ns = current_user_ns();
1068
1069                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1070                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1071         }
1072 }
1073
1074 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1075 {
1076         struct user_namespace *user_ns = current_user_ns();
1077         int i;
1078
1079         for (i = 0; i < src->ngroups; i++)
1080                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1081                         return -EFAULT;
1082
1083         return 0;
1084 }
1085
1086 int sock_getsockopt(struct socket *sock, int level, int optname,
1087                     char __user *optval, int __user *optlen)
1088 {
1089         struct sock *sk = sock->sk;
1090
1091         union {
1092                 int val;
1093                 u64 val64;
1094                 struct linger ling;
1095                 struct timeval tm;
1096                 struct sock_txtime txtime;
1097         } v;
1098
1099         int lv = sizeof(int);
1100         int len;
1101
1102         if (get_user(len, optlen))
1103                 return -EFAULT;
1104         if (len < 0)
1105                 return -EINVAL;
1106
1107         memset(&v, 0, sizeof(v));
1108
1109         switch (optname) {
1110         case SO_DEBUG:
1111                 v.val = sock_flag(sk, SOCK_DBG);
1112                 break;
1113
1114         case SO_DONTROUTE:
1115                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1116                 break;
1117
1118         case SO_BROADCAST:
1119                 v.val = sock_flag(sk, SOCK_BROADCAST);
1120                 break;
1121
1122         case SO_SNDBUF:
1123                 v.val = sk->sk_sndbuf;
1124                 break;
1125
1126         case SO_RCVBUF:
1127                 v.val = sk->sk_rcvbuf;
1128                 break;
1129
1130         case SO_REUSEADDR:
1131                 v.val = sk->sk_reuse;
1132                 break;
1133
1134         case SO_REUSEPORT:
1135                 v.val = sk->sk_reuseport;
1136                 break;
1137
1138         case SO_KEEPALIVE:
1139                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1140                 break;
1141
1142         case SO_TYPE:
1143                 v.val = sk->sk_type;
1144                 break;
1145
1146         case SO_PROTOCOL:
1147                 v.val = sk->sk_protocol;
1148                 break;
1149
1150         case SO_DOMAIN:
1151                 v.val = sk->sk_family;
1152                 break;
1153
1154         case SO_ERROR:
1155                 v.val = -sock_error(sk);
1156                 if (v.val == 0)
1157                         v.val = xchg(&sk->sk_err_soft, 0);
1158                 break;
1159
1160         case SO_OOBINLINE:
1161                 v.val = sock_flag(sk, SOCK_URGINLINE);
1162                 break;
1163
1164         case SO_NO_CHECK:
1165                 v.val = sk->sk_no_check_tx;
1166                 break;
1167
1168         case SO_PRIORITY:
1169                 v.val = sk->sk_priority;
1170                 break;
1171
1172         case SO_LINGER:
1173                 lv              = sizeof(v.ling);
1174                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1175                 v.ling.l_linger = sk->sk_lingertime / HZ;
1176                 break;
1177
1178         case SO_BSDCOMPAT:
1179                 sock_warn_obsolete_bsdism("getsockopt");
1180                 break;
1181
1182         case SO_TIMESTAMP:
1183                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1184                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1185                 break;
1186
1187         case SO_TIMESTAMPNS:
1188                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1189                 break;
1190
1191         case SO_TIMESTAMPING:
1192                 v.val = sk->sk_tsflags;
1193                 break;
1194
1195         case SO_RCVTIMEO:
1196                 lv = sizeof(struct timeval);
1197                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1198                         v.tm.tv_sec = 0;
1199                         v.tm.tv_usec = 0;
1200                 } else {
1201                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1202                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1203                 }
1204                 break;
1205
1206         case SO_SNDTIMEO:
1207                 lv = sizeof(struct timeval);
1208                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1209                         v.tm.tv_sec = 0;
1210                         v.tm.tv_usec = 0;
1211                 } else {
1212                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1213                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1214                 }
1215                 break;
1216
1217         case SO_RCVLOWAT:
1218                 v.val = sk->sk_rcvlowat;
1219                 break;
1220
1221         case SO_SNDLOWAT:
1222                 v.val = 1;
1223                 break;
1224
1225         case SO_PASSCRED:
1226                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1227                 break;
1228
1229         case SO_PEERCRED:
1230         {
1231                 struct ucred peercred;
1232                 if (len > sizeof(peercred))
1233                         len = sizeof(peercred);
1234                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1235                 if (copy_to_user(optval, &peercred, len))
1236                         return -EFAULT;
1237                 goto lenout;
1238         }
1239
1240         case SO_PEERGROUPS:
1241         {
1242                 int ret, n;
1243
1244                 if (!sk->sk_peer_cred)
1245                         return -ENODATA;
1246
1247                 n = sk->sk_peer_cred->group_info->ngroups;
1248                 if (len < n * sizeof(gid_t)) {
1249                         len = n * sizeof(gid_t);
1250                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1251                 }
1252                 len = n * sizeof(gid_t);
1253
1254                 ret = groups_to_user((gid_t __user *)optval,
1255                                      sk->sk_peer_cred->group_info);
1256                 if (ret)
1257                         return ret;
1258                 goto lenout;
1259         }
1260
1261         case SO_PEERNAME:
1262         {
1263                 char address[128];
1264
1265                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1266                 if (lv < 0)
1267                         return -ENOTCONN;
1268                 if (lv < len)
1269                         return -EINVAL;
1270                 if (copy_to_user(optval, address, len))
1271                         return -EFAULT;
1272                 goto lenout;
1273         }
1274
1275         /* Dubious BSD thing... Probably nobody even uses it, but
1276          * the UNIX standard wants it for whatever reason... -DaveM
1277          */
1278         case SO_ACCEPTCONN:
1279                 v.val = sk->sk_state == TCP_LISTEN;
1280                 break;
1281
1282         case SO_PASSSEC:
1283                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1284                 break;
1285
1286         case SO_PEERSEC:
1287                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1288
1289         case SO_MARK:
1290                 v.val = sk->sk_mark;
1291                 break;
1292
1293         case SO_RXQ_OVFL:
1294                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1295                 break;
1296
1297         case SO_WIFI_STATUS:
1298                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1299                 break;
1300
1301         case SO_PEEK_OFF:
1302                 if (!sock->ops->set_peek_off)
1303                         return -EOPNOTSUPP;
1304
1305                 v.val = sk->sk_peek_off;
1306                 break;
1307         case SO_NOFCS:
1308                 v.val = sock_flag(sk, SOCK_NOFCS);
1309                 break;
1310
1311         case SO_BINDTODEVICE:
1312                 return sock_getbindtodevice(sk, optval, optlen, len);
1313
1314         case SO_GET_FILTER:
1315                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1316                 if (len < 0)
1317                         return len;
1318
1319                 goto lenout;
1320
1321         case SO_LOCK_FILTER:
1322                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1323                 break;
1324
1325         case SO_BPF_EXTENSIONS:
1326                 v.val = bpf_tell_extensions();
1327                 break;
1328
1329         case SO_SELECT_ERR_QUEUE:
1330                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1331                 break;
1332
1333 #ifdef CONFIG_NET_RX_BUSY_POLL
1334         case SO_BUSY_POLL:
1335                 v.val = sk->sk_ll_usec;
1336                 break;
1337 #endif
1338
1339         case SO_MAX_PACING_RATE:
1340                 v.val = sk->sk_max_pacing_rate;
1341                 break;
1342
1343         case SO_INCOMING_CPU:
1344                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1345                 break;
1346
1347         case SO_MEMINFO:
1348         {
1349                 u32 meminfo[SK_MEMINFO_VARS];
1350
1351                 sk_get_meminfo(sk, meminfo);
1352
1353                 len = min_t(unsigned int, len, sizeof(meminfo));
1354                 if (copy_to_user(optval, &meminfo, len))
1355                         return -EFAULT;
1356
1357                 goto lenout;
1358         }
1359
1360 #ifdef CONFIG_NET_RX_BUSY_POLL
1361         case SO_INCOMING_NAPI_ID:
1362                 v.val = READ_ONCE(sk->sk_napi_id);
1363
1364                 /* aggregate non-NAPI IDs down to 0 */
1365                 if (v.val < MIN_NAPI_ID)
1366                         v.val = 0;
1367
1368                 break;
1369 #endif
1370
1371         case SO_COOKIE:
1372                 lv = sizeof(u64);
1373                 if (len < lv)
1374                         return -EINVAL;
1375                 v.val64 = sock_gen_cookie(sk);
1376                 break;
1377
1378         case SO_ZEROCOPY:
1379                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1380                 break;
1381
1382         case SO_TXTIME:
1383                 lv = sizeof(v.txtime);
1384                 v.txtime.clockid = sk->sk_clockid;
1385                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1386                                   SOF_TXTIME_DEADLINE_MODE : 0;
1387                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1388                                   SOF_TXTIME_REPORT_ERRORS : 0;
1389                 break;
1390
1391         default:
1392                 /* We implement the SO_SNDLOWAT etc to not be settable
1393                  * (1003.1g 7).
1394                  */
1395                 return -ENOPROTOOPT;
1396         }
1397
1398         if (len > lv)
1399                 len = lv;
1400         if (copy_to_user(optval, &v, len))
1401                 return -EFAULT;
1402 lenout:
1403         if (put_user(len, optlen))
1404                 return -EFAULT;
1405         return 0;
1406 }
1407
1408 /*
1409  * Initialize an sk_lock.
1410  *
1411  * (We also register the sk_lock with the lock validator.)
1412  */
1413 static inline void sock_lock_init(struct sock *sk)
1414 {
1415         if (sk->sk_kern_sock)
1416                 sock_lock_init_class_and_name(
1417                         sk,
1418                         af_family_kern_slock_key_strings[sk->sk_family],
1419                         af_family_kern_slock_keys + sk->sk_family,
1420                         af_family_kern_key_strings[sk->sk_family],
1421                         af_family_kern_keys + sk->sk_family);
1422         else
1423                 sock_lock_init_class_and_name(
1424                         sk,
1425                         af_family_slock_key_strings[sk->sk_family],
1426                         af_family_slock_keys + sk->sk_family,
1427                         af_family_key_strings[sk->sk_family],
1428                         af_family_keys + sk->sk_family);
1429 }
1430
1431 /*
1432  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1433  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1434  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1435  */
1436 static void sock_copy(struct sock *nsk, const struct sock *osk)
1437 {
1438 #ifdef CONFIG_SECURITY_NETWORK
1439         void *sptr = nsk->sk_security;
1440 #endif
1441         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1442
1443         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1444                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1445
1446 #ifdef CONFIG_SECURITY_NETWORK
1447         nsk->sk_security = sptr;
1448         security_sk_clone(osk, nsk);
1449 #endif
1450 }
1451
1452 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1453                 int family)
1454 {
1455         struct sock *sk;
1456         struct kmem_cache *slab;
1457
1458         slab = prot->slab;
1459         if (slab != NULL) {
1460                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1461                 if (!sk)
1462                         return sk;
1463                 if (priority & __GFP_ZERO)
1464                         sk_prot_clear_nulls(sk, prot->obj_size);
1465         } else
1466                 sk = kmalloc(prot->obj_size, priority);
1467
1468         if (sk != NULL) {
1469                 if (security_sk_alloc(sk, family, priority))
1470                         goto out_free;
1471
1472                 if (!try_module_get(prot->owner))
1473                         goto out_free_sec;
1474                 sk_tx_queue_clear(sk);
1475         }
1476
1477         return sk;
1478
1479 out_free_sec:
1480         security_sk_free(sk);
1481 out_free:
1482         if (slab != NULL)
1483                 kmem_cache_free(slab, sk);
1484         else
1485                 kfree(sk);
1486         return NULL;
1487 }
1488
1489 static void sk_prot_free(struct proto *prot, struct sock *sk)
1490 {
1491         struct kmem_cache *slab;
1492         struct module *owner;
1493
1494         owner = prot->owner;
1495         slab = prot->slab;
1496
1497         cgroup_sk_free(&sk->sk_cgrp_data);
1498         mem_cgroup_sk_free(sk);
1499         security_sk_free(sk);
1500         if (slab != NULL)
1501                 kmem_cache_free(slab, sk);
1502         else
1503                 kfree(sk);
1504         module_put(owner);
1505 }
1506
1507 /**
1508  *      sk_alloc - All socket objects are allocated here
1509  *      @net: the applicable net namespace
1510  *      @family: protocol family
1511  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1512  *      @prot: struct proto associated with this new sock instance
1513  *      @kern: is this to be a kernel socket?
1514  */
1515 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1516                       struct proto *prot, int kern)
1517 {
1518         struct sock *sk;
1519
1520         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1521         if (sk) {
1522                 sk->sk_family = family;
1523                 /*
1524                  * See comment in struct sock definition to understand
1525                  * why we need sk_prot_creator -acme
1526                  */
1527                 sk->sk_prot = sk->sk_prot_creator = prot;
1528                 sk->sk_kern_sock = kern;
1529                 sock_lock_init(sk);
1530                 sk->sk_net_refcnt = kern ? 0 : 1;
1531                 if (likely(sk->sk_net_refcnt)) {
1532                         get_net(net);
1533                         sock_inuse_add(net, 1);
1534                 }
1535
1536                 sock_net_set(sk, net);
1537                 refcount_set(&sk->sk_wmem_alloc, 1);
1538
1539                 mem_cgroup_sk_alloc(sk);
1540                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1541                 sock_update_classid(&sk->sk_cgrp_data);
1542                 sock_update_netprioidx(&sk->sk_cgrp_data);
1543                 sk_tx_queue_clear(sk);
1544         }
1545
1546         return sk;
1547 }
1548 EXPORT_SYMBOL(sk_alloc);
1549
1550 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1551  * grace period. This is the case for UDP sockets and TCP listeners.
1552  */
1553 static void __sk_destruct(struct rcu_head *head)
1554 {
1555         struct sock *sk = container_of(head, struct sock, sk_rcu);
1556         struct sk_filter *filter;
1557
1558         if (sk->sk_destruct)
1559                 sk->sk_destruct(sk);
1560
1561         filter = rcu_dereference_check(sk->sk_filter,
1562                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1563         if (filter) {
1564                 sk_filter_uncharge(sk, filter);
1565                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1566         }
1567
1568         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1569
1570         if (atomic_read(&sk->sk_omem_alloc))
1571                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1572                          __func__, atomic_read(&sk->sk_omem_alloc));
1573
1574         if (sk->sk_frag.page) {
1575                 put_page(sk->sk_frag.page);
1576                 sk->sk_frag.page = NULL;
1577         }
1578
1579         if (sk->sk_peer_cred)
1580                 put_cred(sk->sk_peer_cred);
1581         put_pid(sk->sk_peer_pid);
1582         if (likely(sk->sk_net_refcnt))
1583                 put_net(sock_net(sk));
1584         sk_prot_free(sk->sk_prot_creator, sk);
1585 }
1586
1587 void sk_destruct(struct sock *sk)
1588 {
1589         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1590
1591         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1592                 reuseport_detach_sock(sk);
1593                 use_call_rcu = true;
1594         }
1595
1596         if (use_call_rcu)
1597                 call_rcu(&sk->sk_rcu, __sk_destruct);
1598         else
1599                 __sk_destruct(&sk->sk_rcu);
1600 }
1601
1602 static void __sk_free(struct sock *sk)
1603 {
1604         if (likely(sk->sk_net_refcnt))
1605                 sock_inuse_add(sock_net(sk), -1);
1606
1607         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1608                 sock_diag_broadcast_destroy(sk);
1609         else
1610                 sk_destruct(sk);
1611 }
1612
1613 void sk_free(struct sock *sk)
1614 {
1615         /*
1616          * We subtract one from sk_wmem_alloc and can know if
1617          * some packets are still in some tx queue.
1618          * If not null, sock_wfree() will call __sk_free(sk) later
1619          */
1620         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1621                 __sk_free(sk);
1622 }
1623 EXPORT_SYMBOL(sk_free);
1624
1625 static void sk_init_common(struct sock *sk)
1626 {
1627         skb_queue_head_init(&sk->sk_receive_queue);
1628         skb_queue_head_init(&sk->sk_write_queue);
1629         skb_queue_head_init(&sk->sk_error_queue);
1630
1631         rwlock_init(&sk->sk_callback_lock);
1632         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1633                         af_rlock_keys + sk->sk_family,
1634                         af_family_rlock_key_strings[sk->sk_family]);
1635         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1636                         af_wlock_keys + sk->sk_family,
1637                         af_family_wlock_key_strings[sk->sk_family]);
1638         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1639                         af_elock_keys + sk->sk_family,
1640                         af_family_elock_key_strings[sk->sk_family]);
1641         lockdep_set_class_and_name(&sk->sk_callback_lock,
1642                         af_callback_keys + sk->sk_family,
1643                         af_family_clock_key_strings[sk->sk_family]);
1644 }
1645
1646 /**
1647  *      sk_clone_lock - clone a socket, and lock its clone
1648  *      @sk: the socket to clone
1649  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1650  *
1651  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1652  */
1653 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1654 {
1655         struct sock *newsk;
1656         bool is_charged = true;
1657
1658         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1659         if (newsk != NULL) {
1660                 struct sk_filter *filter;
1661
1662                 sock_copy(newsk, sk);
1663
1664                 newsk->sk_prot_creator = sk->sk_prot;
1665
1666                 /* SANITY */
1667                 if (likely(newsk->sk_net_refcnt))
1668                         get_net(sock_net(newsk));
1669                 sk_node_init(&newsk->sk_node);
1670                 sock_lock_init(newsk);
1671                 bh_lock_sock(newsk);
1672                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1673                 newsk->sk_backlog.len = 0;
1674
1675                 atomic_set(&newsk->sk_rmem_alloc, 0);
1676                 /*
1677                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1678                  */
1679                 refcount_set(&newsk->sk_wmem_alloc, 1);
1680                 atomic_set(&newsk->sk_omem_alloc, 0);
1681                 sk_init_common(newsk);
1682
1683                 newsk->sk_dst_cache     = NULL;
1684                 newsk->sk_dst_pending_confirm = 0;
1685                 newsk->sk_wmem_queued   = 0;
1686                 newsk->sk_forward_alloc = 0;
1687                 atomic_set(&newsk->sk_drops, 0);
1688                 newsk->sk_send_head     = NULL;
1689                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1690                 atomic_set(&newsk->sk_zckey, 0);
1691
1692                 sock_reset_flag(newsk, SOCK_DONE);
1693
1694                 /* sk->sk_memcg will be populated at accept() time */
1695                 newsk->sk_memcg = NULL;
1696
1697                 cgroup_sk_clone(&newsk->sk_cgrp_data);
1698
1699                 rcu_read_lock();
1700                 filter = rcu_dereference(sk->sk_filter);
1701                 if (filter != NULL)
1702                         /* though it's an empty new sock, the charging may fail
1703                          * if sysctl_optmem_max was changed between creation of
1704                          * original socket and cloning
1705                          */
1706                         is_charged = sk_filter_charge(newsk, filter);
1707                 RCU_INIT_POINTER(newsk->sk_filter, filter);
1708                 rcu_read_unlock();
1709
1710                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1711                         /* We need to make sure that we don't uncharge the new
1712                          * socket if we couldn't charge it in the first place
1713                          * as otherwise we uncharge the parent's filter.
1714                          */
1715                         if (!is_charged)
1716                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1717                         sk_free_unlock_clone(newsk);
1718                         newsk = NULL;
1719                         goto out;
1720                 }
1721                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1722
1723                 newsk->sk_err      = 0;
1724                 newsk->sk_err_soft = 0;
1725                 newsk->sk_priority = 0;
1726                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1727                 atomic64_set(&newsk->sk_cookie, 0);
1728                 if (likely(newsk->sk_net_refcnt))
1729                         sock_inuse_add(sock_net(newsk), 1);
1730
1731                 /*
1732                  * Before updating sk_refcnt, we must commit prior changes to memory
1733                  * (Documentation/RCU/rculist_nulls.txt for details)
1734                  */
1735                 smp_wmb();
1736                 refcount_set(&newsk->sk_refcnt, 2);
1737
1738                 /*
1739                  * Increment the counter in the same struct proto as the master
1740                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1741                  * is the same as sk->sk_prot->socks, as this field was copied
1742                  * with memcpy).
1743                  *
1744                  * This _changes_ the previous behaviour, where
1745                  * tcp_create_openreq_child always was incrementing the
1746                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1747                  * to be taken into account in all callers. -acme
1748                  */
1749                 sk_refcnt_debug_inc(newsk);
1750                 sk_set_socket(newsk, NULL);
1751                 sk_tx_queue_clear(newsk);
1752                 newsk->sk_wq = NULL;
1753
1754                 if (newsk->sk_prot->sockets_allocated)
1755                         sk_sockets_allocated_inc(newsk);
1756
1757                 if (sock_needs_netstamp(sk) &&
1758                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1759                         net_enable_timestamp();
1760         }
1761 out:
1762         return newsk;
1763 }
1764 EXPORT_SYMBOL_GPL(sk_clone_lock);
1765
1766 void sk_free_unlock_clone(struct sock *sk)
1767 {
1768         /* It is still raw copy of parent, so invalidate
1769          * destructor and make plain sk_free() */
1770         sk->sk_destruct = NULL;
1771         bh_unlock_sock(sk);
1772         sk_free(sk);
1773 }
1774 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1775
1776 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1777 {
1778         u32 max_segs = 1;
1779
1780         sk_dst_set(sk, dst);
1781         sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1782         if (sk->sk_route_caps & NETIF_F_GSO)
1783                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1784         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1785         if (sk_can_gso(sk)) {
1786                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1787                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1788                 } else {
1789                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1790                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1791                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1792                 }
1793         }
1794         sk->sk_gso_max_segs = max_segs;
1795 }
1796 EXPORT_SYMBOL_GPL(sk_setup_caps);
1797
1798 /*
1799  *      Simple resource managers for sockets.
1800  */
1801
1802
1803 /*
1804  * Write buffer destructor automatically called from kfree_skb.
1805  */
1806 void sock_wfree(struct sk_buff *skb)
1807 {
1808         struct sock *sk = skb->sk;
1809         unsigned int len = skb->truesize;
1810
1811         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1812                 /*
1813                  * Keep a reference on sk_wmem_alloc, this will be released
1814                  * after sk_write_space() call
1815                  */
1816                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1817                 sk->sk_write_space(sk);
1818                 len = 1;
1819         }
1820         /*
1821          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1822          * could not do because of in-flight packets
1823          */
1824         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1825                 __sk_free(sk);
1826 }
1827 EXPORT_SYMBOL(sock_wfree);
1828
1829 /* This variant of sock_wfree() is used by TCP,
1830  * since it sets SOCK_USE_WRITE_QUEUE.
1831  */
1832 void __sock_wfree(struct sk_buff *skb)
1833 {
1834         struct sock *sk = skb->sk;
1835
1836         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1837                 __sk_free(sk);
1838 }
1839
1840 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1841 {
1842         skb_orphan(skb);
1843         skb->sk = sk;
1844 #ifdef CONFIG_INET
1845         if (unlikely(!sk_fullsock(sk))) {
1846                 skb->destructor = sock_edemux;
1847                 sock_hold(sk);
1848                 return;
1849         }
1850 #endif
1851         skb->destructor = sock_wfree;
1852         skb_set_hash_from_sk(skb, sk);
1853         /*
1854          * We used to take a refcount on sk, but following operation
1855          * is enough to guarantee sk_free() wont free this sock until
1856          * all in-flight packets are completed
1857          */
1858         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1859 }
1860 EXPORT_SYMBOL(skb_set_owner_w);
1861
1862 /* This helper is used by netem, as it can hold packets in its
1863  * delay queue. We want to allow the owner socket to send more
1864  * packets, as if they were already TX completed by a typical driver.
1865  * But we also want to keep skb->sk set because some packet schedulers
1866  * rely on it (sch_fq for example).
1867  */
1868 void skb_orphan_partial(struct sk_buff *skb)
1869 {
1870         if (skb_is_tcp_pure_ack(skb))
1871                 return;
1872
1873         if (skb->destructor == sock_wfree
1874 #ifdef CONFIG_INET
1875             || skb->destructor == tcp_wfree
1876 #endif
1877                 ) {
1878                 struct sock *sk = skb->sk;
1879
1880                 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1881                         WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1882                         skb->destructor = sock_efree;
1883                 }
1884         } else {
1885                 skb_orphan(skb);
1886         }
1887 }
1888 EXPORT_SYMBOL(skb_orphan_partial);
1889
1890 /*
1891  * Read buffer destructor automatically called from kfree_skb.
1892  */
1893 void sock_rfree(struct sk_buff *skb)
1894 {
1895         struct sock *sk = skb->sk;
1896         unsigned int len = skb->truesize;
1897
1898         atomic_sub(len, &sk->sk_rmem_alloc);
1899         sk_mem_uncharge(sk, len);
1900 }
1901 EXPORT_SYMBOL(sock_rfree);
1902
1903 /*
1904  * Buffer destructor for skbs that are not used directly in read or write
1905  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1906  */
1907 void sock_efree(struct sk_buff *skb)
1908 {
1909         sock_put(skb->sk);
1910 }
1911 EXPORT_SYMBOL(sock_efree);
1912
1913 kuid_t sock_i_uid(struct sock *sk)
1914 {
1915         kuid_t uid;
1916
1917         read_lock_bh(&sk->sk_callback_lock);
1918         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1919         read_unlock_bh(&sk->sk_callback_lock);
1920         return uid;
1921 }
1922 EXPORT_SYMBOL(sock_i_uid);
1923
1924 unsigned long sock_i_ino(struct sock *sk)
1925 {
1926         unsigned long ino;
1927
1928         read_lock_bh(&sk->sk_callback_lock);
1929         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1930         read_unlock_bh(&sk->sk_callback_lock);
1931         return ino;
1932 }
1933 EXPORT_SYMBOL(sock_i_ino);
1934
1935 /*
1936  * Allocate a skb from the socket's send buffer.
1937  */
1938 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1939                              gfp_t priority)
1940 {
1941         if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1942                 struct sk_buff *skb = alloc_skb(size, priority);
1943                 if (skb) {
1944                         skb_set_owner_w(skb, sk);
1945                         return skb;
1946                 }
1947         }
1948         return NULL;
1949 }
1950 EXPORT_SYMBOL(sock_wmalloc);
1951
1952 static void sock_ofree(struct sk_buff *skb)
1953 {
1954         struct sock *sk = skb->sk;
1955
1956         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1957 }
1958
1959 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1960                              gfp_t priority)
1961 {
1962         struct sk_buff *skb;
1963
1964         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1965         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1966             sysctl_optmem_max)
1967                 return NULL;
1968
1969         skb = alloc_skb(size, priority);
1970         if (!skb)
1971                 return NULL;
1972
1973         atomic_add(skb->truesize, &sk->sk_omem_alloc);
1974         skb->sk = sk;
1975         skb->destructor = sock_ofree;
1976         return skb;
1977 }
1978
1979 /*
1980  * Allocate a memory block from the socket's option memory buffer.
1981  */
1982 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1983 {
1984         if ((unsigned int)size <= sysctl_optmem_max &&
1985             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1986                 void *mem;
1987                 /* First do the add, to avoid the race if kmalloc
1988                  * might sleep.
1989                  */
1990                 atomic_add(size, &sk->sk_omem_alloc);
1991                 mem = kmalloc(size, priority);
1992                 if (mem)
1993                         return mem;
1994                 atomic_sub(size, &sk->sk_omem_alloc);
1995         }
1996         return NULL;
1997 }
1998 EXPORT_SYMBOL(sock_kmalloc);
1999
2000 /* Free an option memory block. Note, we actually want the inline
2001  * here as this allows gcc to detect the nullify and fold away the
2002  * condition entirely.
2003  */
2004 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2005                                   const bool nullify)
2006 {
2007         if (WARN_ON_ONCE(!mem))
2008                 return;
2009         if (nullify)
2010                 kzfree(mem);
2011         else
2012                 kfree(mem);
2013         atomic_sub(size, &sk->sk_omem_alloc);
2014 }
2015
2016 void sock_kfree_s(struct sock *sk, void *mem, int size)
2017 {
2018         __sock_kfree_s(sk, mem, size, false);
2019 }
2020 EXPORT_SYMBOL(sock_kfree_s);
2021
2022 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2023 {
2024         __sock_kfree_s(sk, mem, size, true);
2025 }
2026 EXPORT_SYMBOL(sock_kzfree_s);
2027
2028 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2029    I think, these locks should be removed for datagram sockets.
2030  */
2031 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2032 {
2033         DEFINE_WAIT(wait);
2034
2035         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2036         for (;;) {
2037                 if (!timeo)
2038                         break;
2039                 if (signal_pending(current))
2040                         break;
2041                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2042                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2043                 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2044                         break;
2045                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2046                         break;
2047                 if (sk->sk_err)
2048                         break;
2049                 timeo = schedule_timeout(timeo);
2050         }
2051         finish_wait(sk_sleep(sk), &wait);
2052         return timeo;
2053 }
2054
2055
2056 /*
2057  *      Generic send/receive buffer handlers
2058  */
2059
2060 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2061                                      unsigned long data_len, int noblock,
2062                                      int *errcode, int max_page_order)
2063 {
2064         struct sk_buff *skb;
2065         long timeo;
2066         int err;
2067
2068         timeo = sock_sndtimeo(sk, noblock);
2069         for (;;) {
2070                 err = sock_error(sk);
2071                 if (err != 0)
2072                         goto failure;
2073
2074                 err = -EPIPE;
2075                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2076                         goto failure;
2077
2078                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2079                         break;
2080
2081                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2082                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2083                 err = -EAGAIN;
2084                 if (!timeo)
2085                         goto failure;
2086                 if (signal_pending(current))
2087                         goto interrupted;
2088                 timeo = sock_wait_for_wmem(sk, timeo);
2089         }
2090         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2091                                    errcode, sk->sk_allocation);
2092         if (skb)
2093                 skb_set_owner_w(skb, sk);
2094         return skb;
2095
2096 interrupted:
2097         err = sock_intr_errno(timeo);
2098 failure:
2099         *errcode = err;
2100         return NULL;
2101 }
2102 EXPORT_SYMBOL(sock_alloc_send_pskb);
2103
/*
 * Linear-only wrapper around sock_alloc_send_pskb(): @size header bytes,
 * no fragment data and a maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
2109 EXPORT_SYMBOL(sock_alloc_send_skb);
2110
2111 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2112                      struct sockcm_cookie *sockc)
2113 {
2114         u32 tsflags;
2115
2116         switch (cmsg->cmsg_type) {
2117         case SO_MARK:
2118                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2119                         return -EPERM;
2120                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2121                         return -EINVAL;
2122                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2123                 break;
2124         case SO_TIMESTAMPING:
2125                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2126                         return -EINVAL;
2127
2128                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2129                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2130                         return -EINVAL;
2131
2132                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2133                 sockc->tsflags |= tsflags;
2134                 break;
2135         case SCM_TXTIME:
2136                 if (!sock_flag(sk, SOCK_TXTIME))
2137                         return -EINVAL;
2138                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2139                         return -EINVAL;
2140                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2141                 break;
2142         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2143         case SCM_RIGHTS:
2144         case SCM_CREDENTIALS:
2145                 break;
2146         default:
2147                 return -EINVAL;
2148         }
2149         return 0;
2150 }
2151 EXPORT_SYMBOL(__sock_cmsg_send);
2152
2153 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2154                    struct sockcm_cookie *sockc)
2155 {
2156         struct cmsghdr *cmsg;
2157         int ret;
2158
2159         for_each_cmsghdr(cmsg, msg) {
2160                 if (!CMSG_OK(msg, cmsg))
2161                         return -EINVAL;
2162                 if (cmsg->cmsg_level != SOL_SOCKET)
2163                         continue;
2164                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2165                 if (ret)
2166                         return ret;
2167         }
2168         return 0;
2169 }
2170 EXPORT_SYMBOL(sock_cmsg_send);
2171
2172 static void sk_enter_memory_pressure(struct sock *sk)
2173 {
2174         if (!sk->sk_prot->enter_memory_pressure)
2175                 return;
2176
2177         sk->sk_prot->enter_memory_pressure(sk);
2178 }
2179
2180 static void sk_leave_memory_pressure(struct sock *sk)
2181 {
2182         if (sk->sk_prot->leave_memory_pressure) {
2183                 sk->sk_prot->leave_memory_pressure(sk);
2184         } else {
2185                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2186
2187                 if (memory_pressure && READ_ONCE(*memory_pressure))
2188                         WRITE_ONCE(*memory_pressure, 0);
2189         }
2190 }
2191
2192 /* On 32bit arches, an skb frag is limited to 2^15 */
2193 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2194
2195 /**
2196  * skb_page_frag_refill - check that a page_frag contains enough room
2197  * @sz: minimum size of the fragment we want to get
2198  * @pfrag: pointer to page_frag
2199  * @gfp: priority for memory allocation
2200  *
2201  * Note: While this allocator tries to use high order pages, there is
2202  * no guarantee that allocations succeed. Therefore, @sz MUST be
2203  * less or equal than PAGE_SIZE.
2204  */
2205 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2206 {
2207         if (pfrag->page) {
2208                 if (page_ref_count(pfrag->page) == 1) {
2209                         pfrag->offset = 0;
2210                         return true;
2211                 }
2212                 if (pfrag->offset + sz <= pfrag->size)
2213                         return true;
2214                 put_page(pfrag->page);
2215         }
2216
2217         pfrag->offset = 0;
2218         if (SKB_FRAG_PAGE_ORDER) {
2219                 /* Avoid direct reclaim but allow kswapd to wake */
2220                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2221                                           __GFP_COMP | __GFP_NOWARN |
2222                                           __GFP_NORETRY,
2223                                           SKB_FRAG_PAGE_ORDER);
2224                 if (likely(pfrag->page)) {
2225                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2226                         return true;
2227                 }
2228         }
2229         pfrag->page = alloc_page(gfp);
2230         if (likely(pfrag->page)) {
2231                 pfrag->size = PAGE_SIZE;
2232                 return true;
2233         }
2234         return false;
2235 }
2236 EXPORT_SYMBOL(skb_page_frag_refill);
2237
2238 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2239 {
2240         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2241                 return true;
2242
2243         sk_enter_memory_pressure(sk);
2244         sk_stream_moderate_sndbuf(sk);
2245         return false;
2246 }
2247 EXPORT_SYMBOL(sk_page_frag_refill);
2248
2249 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2250                 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2251                 int first_coalesce)
2252 {
2253         int sg_curr = *sg_curr_index, use = 0, rc = 0;
2254         unsigned int size = *sg_curr_size;
2255         struct page_frag *pfrag;
2256         struct scatterlist *sge;
2257
2258         len -= size;
2259         pfrag = sk_page_frag(sk);
2260
2261         while (len > 0) {
2262                 unsigned int orig_offset;
2263
2264                 if (!sk_page_frag_refill(sk, pfrag)) {
2265                         rc = -ENOMEM;
2266                         goto out;
2267                 }
2268
2269                 use = min_t(int, len, pfrag->size - pfrag->offset);
2270
2271                 if (!sk_wmem_schedule(sk, use)) {
2272                         rc = -ENOMEM;
2273                         goto out;
2274                 }
2275
2276                 sk_mem_charge(sk, use);
2277                 size += use;
2278                 orig_offset = pfrag->offset;
2279                 pfrag->offset += use;
2280
2281                 sge = sg + sg_curr - 1;
2282                 if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
2283                     sge->offset + sge->length == orig_offset) {
2284                         sge->length += use;
2285                 } else {
2286                         sge = sg + sg_curr;
2287                         sg_unmark_end(sge);
2288                         sg_set_page(sge, pfrag->page, use, orig_offset);
2289                         get_page(pfrag->page);
2290                         sg_curr++;
2291
2292                         if (sg_curr == MAX_SKB_FRAGS)
2293                                 sg_curr = 0;
2294
2295                         if (sg_curr == sg_start) {
2296                                 rc = -ENOSPC;
2297                                 break;
2298                         }
2299                 }
2300
2301                 len -= use;
2302         }
2303 out:
2304         *sg_curr_size = size;
2305         *sg_curr_index = sg_curr;
2306         return rc;
2307 }
2308 EXPORT_SYMBOL(sk_alloc_sg);
2309
2310 static void __lock_sock(struct sock *sk)
2311         __releases(&sk->sk_lock.slock)
2312         __acquires(&sk->sk_lock.slock)
2313 {
2314         DEFINE_WAIT(wait);
2315
2316         for (;;) {
2317                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2318                                         TASK_UNINTERRUPTIBLE);
2319                 spin_unlock_bh(&sk->sk_lock.slock);
2320                 schedule();
2321                 spin_lock_bh(&sk->sk_lock.slock);
2322                 if (!sock_owned_by_user(sk))
2323                         break;
2324         }
2325         finish_wait(&sk->sk_lock.wq, &wait);
2326 }
2327
2328 void __release_sock(struct sock *sk)
2329         __releases(&sk->sk_lock.slock)
2330         __acquires(&sk->sk_lock.slock)
2331 {
2332         struct sk_buff *skb, *next;
2333
2334         while ((skb = sk->sk_backlog.head) != NULL) {
2335                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2336
2337                 spin_unlock_bh(&sk->sk_lock.slock);
2338
2339                 do {
2340                         next = skb->next;
2341                         prefetch(next);
2342                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2343                         skb->next = NULL;
2344                         sk_backlog_rcv(sk, skb);
2345
2346                         cond_resched();
2347
2348                         skb = next;
2349                 } while (skb != NULL);
2350
2351                 spin_lock_bh(&sk->sk_lock.slock);
2352         }
2353
2354         /*
2355          * Doing the zeroing here guarantee we can not loop forever
2356          * while a wild producer attempts to flood us.
2357          */
2358         sk->sk_backlog.len = 0;
2359 }
2360
2361 void __sk_flush_backlog(struct sock *sk)
2362 {
2363         spin_lock_bh(&sk->sk_lock.slock);
2364         __release_sock(sk);
2365         spin_unlock_bh(&sk->sk_lock.slock);
2366 }
2367
2368 /**
2369  * sk_wait_data - wait for data to arrive at sk_receive_queue
2370  * @sk:    sock to wait on
2371  * @timeo: for how long
2372  * @skb:   last skb seen on sk_receive_queue
2373  *
2374  * Now socket state including sk->sk_err is changed only under lock,
2375  * hence we may omit checks after joining wait queue.
2376  * We check receive queue before schedule() only as optimization;
2377  * it is very likely that release_sock() added new data.
2378  */
2379 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2380 {
2381         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2382         int rc;
2383
2384         add_wait_queue(sk_sleep(sk), &wait);
2385         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2386         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2387         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2388         remove_wait_queue(sk_sleep(sk), &wait);
2389         return rc;
2390 }
2391 EXPORT_SYMBOL(sk_wait_data);
2392
2393 /**
2394  *      __sk_mem_raise_allocated - increase memory_allocated
2395  *      @sk: socket
2396  *      @size: memory size to allocate
2397  *      @amt: pages to allocate
2398  *      @kind: allocation type
2399  *
2400  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2401  */
2402 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2403 {
2404         struct proto *prot = sk->sk_prot;
2405         long allocated = sk_memory_allocated_add(sk, amt);
2406         bool charged = true;
2407
2408         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2409             !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2410                 goto suppress_allocation;
2411
2412         /* Under limit. */
2413         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2414                 sk_leave_memory_pressure(sk);
2415                 return 1;
2416         }
2417
2418         /* Under pressure. */
2419         if (allocated > sk_prot_mem_limits(sk, 1))
2420                 sk_enter_memory_pressure(sk);
2421
2422         /* Over hard limit. */
2423         if (allocated > sk_prot_mem_limits(sk, 2))
2424                 goto suppress_allocation;
2425
2426         /* guarantee minimum buffer size under pressure */
2427         if (kind == SK_MEM_RECV) {
2428                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2429                         return 1;
2430
2431         } else { /* SK_MEM_SEND */
2432                 int wmem0 = sk_get_wmem0(sk, prot);
2433
2434                 if (sk->sk_type == SOCK_STREAM) {
2435                         if (sk->sk_wmem_queued < wmem0)
2436                                 return 1;
2437                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2438                                 return 1;
2439                 }
2440         }
2441
2442         if (sk_has_memory_pressure(sk)) {
2443                 u64 alloc;
2444
2445                 if (!sk_under_memory_pressure(sk))
2446                         return 1;
2447                 alloc = sk_sockets_allocated_read_positive(sk);
2448                 if (sk_prot_mem_limits(sk, 2) > alloc *
2449                     sk_mem_pages(sk->sk_wmem_queued +
2450                                  atomic_read(&sk->sk_rmem_alloc) +
2451                                  sk->sk_forward_alloc))
2452                         return 1;
2453         }
2454
2455 suppress_allocation:
2456
2457         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2458                 sk_stream_moderate_sndbuf(sk);
2459
2460                 /* Fail only if socket is _under_ its sndbuf.
2461                  * In this case we cannot block, so that we have to fail.
2462                  */
2463                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2464                         return 1;
2465         }
2466
2467         if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2468                 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2469
2470         sk_memory_allocated_sub(sk, amt);
2471
2472         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2473                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2474
2475         return 0;
2476 }
2477 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2478
2479 /**
2480  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2481  *      @sk: socket
2482  *      @size: memory size to allocate
2483  *      @kind: allocation type
2484  *
2485  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2486  *      rmem allocation. This function assumes that protocols which have
2487  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2488  */
2489 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2490 {
2491         int ret, amt = sk_mem_pages(size);
2492
2493         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2494         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2495         if (!ret)
2496                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2497         return ret;
2498 }
2499 EXPORT_SYMBOL(__sk_mem_schedule);
2500
2501 /**
2502  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2503  *      @sk: socket
2504  *      @amount: number of quanta
2505  *
2506  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2507  */
2508 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2509 {
2510         sk_memory_allocated_sub(sk, amount);
2511
2512         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2513                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2514
2515         if (sk_under_memory_pressure(sk) &&
2516             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2517                 sk_leave_memory_pressure(sk);
2518 }
2519 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2520
2521 /**
2522  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2523  *      @sk: socket
2524  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2525  */
2526 void __sk_mem_reclaim(struct sock *sk, int amount)
2527 {
2528         amount >>= SK_MEM_QUANTUM_SHIFT;
2529         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2530         __sk_mem_reduce_allocated(sk, amount);
2531 }
2532 EXPORT_SYMBOL(__sk_mem_reclaim);
2533
2534 int sk_set_peek_off(struct sock *sk, int val)
2535 {
2536         sk->sk_peek_off = val;
2537         return 0;
2538 }
2539 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2540
2541 /*
2542  * Set of default routines for initialising struct proto_ops when
2543  * the protocol does not support a particular function. In certain
2544  * cases where it makes no sense for a protocol to have a "do nothing"
2545  * function, some default processing is provided.
2546  */
2547
2548 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2549 {
2550         return -EOPNOTSUPP;     /* default stub: bind is not supported */
2551 }
2552 EXPORT_SYMBOL(sock_no_bind);
2553
2554 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2555                     int len, int flags)
2556 {
2557         return -EOPNOTSUPP;     /* default stub: connect is not supported */
2558 }
2559 EXPORT_SYMBOL(sock_no_connect);
2560
2561 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2562 {
2563         return -EOPNOTSUPP;     /* default stub: socketpair is not supported */
2564 }
2565 EXPORT_SYMBOL(sock_no_socketpair);
2566
2567 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2568                    bool kern)
2569 {
2570         return -EOPNOTSUPP;     /* default stub: accept is not supported */
2571 }
2572 EXPORT_SYMBOL(sock_no_accept);
2573
2574 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2575                     int peer)
2576 {
2577         return -EOPNOTSUPP;     /* default stub: getname is not supported */
2578 }
2579 EXPORT_SYMBOL(sock_no_getname);
2580
2581 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2582 {
2583         return -EOPNOTSUPP;     /* default stub: ioctl is not supported */
2584 }
2585 EXPORT_SYMBOL(sock_no_ioctl);
2586
2587 int sock_no_listen(struct socket *sock, int backlog)
2588 {
2589         return -EOPNOTSUPP;     /* default stub: listen is not supported */
2590 }
2591 EXPORT_SYMBOL(sock_no_listen);
2592
2593 int sock_no_shutdown(struct socket *sock, int how)
2594 {
2595         return -EOPNOTSUPP;     /* default stub: shutdown is not supported */
2596 }
2597 EXPORT_SYMBOL(sock_no_shutdown);
2598
2599 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2600                     char __user *optval, unsigned int optlen)
2601 {
2602         return -EOPNOTSUPP;     /* default stub: setsockopt is not supported */
2603 }
2604 EXPORT_SYMBOL(sock_no_setsockopt);
2605
2606 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2607                     char __user *optval, int __user *optlen)
2608 {
2609         return -EOPNOTSUPP;     /* default stub: getsockopt is not supported */
2610 }
2611 EXPORT_SYMBOL(sock_no_getsockopt);
2612
2613 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2614 {
2615         return -EOPNOTSUPP;     /* default stub: sendmsg is not supported */
2616 }
2617 EXPORT_SYMBOL(sock_no_sendmsg);
2618
2619 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2620 {
2621         return -EOPNOTSUPP;     /* default stub: locked sendmsg is not supported */
2622 }
2623 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2624
2625 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2626                     int flags)
2627 {
2628         return -EOPNOTSUPP;     /* default stub: recvmsg is not supported */
2629 }
2630 EXPORT_SYMBOL(sock_no_recvmsg);
2631
2632 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2633 {
2634         /* Default stub: mirror the error code of a missing mmap method */
2635         return -ENODEV;
2636 }
2637 EXPORT_SYMBOL(sock_no_mmap);
2638
2639 /*
2640  * Called when a file is received (via SCM_RIGHTS, etc): if the file is
2641  * a socket, update its sock-based cgroup data (netprioidx and classid).
2642  */
2643 void __receive_sock(struct file *file)
2644 {
2645         struct socket *sock;
2646         int error;
2647
2648         /*
2649          * "error" is deliberately ignored: action is only needed when
2650          * the file is a socket, and checking "sock" for NULL is enough
2651          * to tell.
2652          */
2653         sock = sock_from_file(file, &error);
2654         if (!sock)
2655                 return;
2656         sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2657         sock_update_classid(&sock->sk->sk_cgrp_data);
2658 }
2659
2660 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2661 {
2662         struct msghdr hdr = {.msg_flags = flags};
2663         char *mapped = kmap(page);      /* page stays mapped across the send */
2664         struct kvec vec = {
2665                 .iov_base = mapped + offset,
2666                 .iov_len = size,
2667         };
2668         ssize_t sent = kernel_sendmsg(sock, &hdr, &vec, 1, size);
2669         kunmap(page);
2670         return sent;
2671 }
2672 EXPORT_SYMBOL(sock_no_sendpage);
2673
2674 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2675                                 int offset, size_t size, int flags)
2676 {
2677         struct msghdr hdr = {.msg_flags = flags};
2678         char *mapped = kmap(page);      /* page stays mapped across the send */
2679         struct kvec vec = {
2680                 .iov_base = mapped + offset,
2681                 .iov_len = size,
2682         };
2683         ssize_t sent = kernel_sendmsg_locked(sk, &hdr, &vec, 1, size);
2684
2685         kunmap(page);
2686         return sent;
2687 }
2688 EXPORT_SYMBOL(sock_no_sendpage_locked);
2689
2690 /*
2691  *      Default Socket Callbacks
2692  */
2693
2694 static void sock_def_wakeup(struct sock *sk)
2695 {
2696         struct socket_wq *waiters;
2697         /* Default sk_state_change: wake all interruptible sleepers on sk_wq. */
2698         rcu_read_lock();
2699         waiters = rcu_dereference(sk->sk_wq);
2700         if (skwq_has_sleeper(waiters))
2701                 wake_up_interruptible_all(&waiters->wait);
2702         rcu_read_unlock();
2703 }
2704
2705 static void sock_def_error_report(struct sock *sk)
2706 {
2707         struct socket_wq *waiters;
2708         /* Default sk_error_report: EPOLLERR to sleepers, POLL_ERR to async users. */
2709         rcu_read_lock();
2710         waiters = rcu_dereference(sk->sk_wq);
2711         if (skwq_has_sleeper(waiters))
2712                 wake_up_interruptible_poll(&waiters->wait, EPOLLERR);
2713         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2714         rcu_read_unlock();
2715 }
2716
2717 static void sock_def_readable(struct sock *sk)
2718 {
2719         struct socket_wq *waiters;
2720         /* Default sk_data_ready: signal input to sleepers and async users. */
2721         rcu_read_lock();
2722         waiters = rcu_dereference(sk->sk_wq);
2723         if (skwq_has_sleeper(waiters))
2724                 wake_up_interruptible_sync_poll(&waiters->wait, EPOLLIN | EPOLLPRI |
2725                                                 EPOLLRDNORM | EPOLLRDBAND);
2726         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2727         rcu_read_unlock();
2728 }
2729
2730 static void sock_def_write_space(struct sock *sk)
2731 {
2732         struct socket_wq *waiters;
2733
2734         rcu_read_lock();
2735
2736         /* Writers are only woken once they can make "significant"
2737          * progress: twice sk_wmem_alloc must fit within sk_sndbuf.
2738          */
2739         if ((refcount_read(&sk->sk_wmem_alloc) << 1) > sk->sk_sndbuf)
2740                 goto unlock;
2741
2742         waiters = rcu_dereference(sk->sk_wq);
2743         if (skwq_has_sleeper(waiters))
2744                 wake_up_interruptible_sync_poll(&waiters->wait, EPOLLOUT |
2745                                                 EPOLLWRNORM | EPOLLWRBAND);
2746         /* Must agree with poll, otherwise some programs break */
2747         if (sock_writeable(sk))
2748                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2749 unlock:
2750         rcu_read_unlock();
2751 }
2752
2753 static void sock_def_destruct(struct sock *sk)
2754 {       /* default sk_destruct callback: intentionally does nothing */
2755 }
2756
2757 void sk_send_sigurg(struct sock *sk)
2758 {       /* needs a socket with a file; async wake only if send_sigurg() is non-zero */
2759         if (sk->sk_socket && sk->sk_socket->file &&
2760             send_sigurg(&sk->sk_socket->file->f_owner))
2761                 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2762 }
2763 EXPORT_SYMBOL(sk_send_sigurg);
2764
2765 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
2766                     unsigned long expires)
2767 {
2768         if (mod_timer(timer, expires) == 0)     /* zero from mod_timer(): take a reference on sk */
2769                 sock_hold(sk);
2770 }
2771 EXPORT_SYMBOL(sk_reset_timer);
2772
2773 void sk_stop_timer(struct sock *sk, struct timer_list *timer)
2774 {
2775         if (del_timer(timer) != 0)      /* non-zero from del_timer(): drop a reference on sk */
2776                 __sock_put(sk);
2777 }
2778 EXPORT_SYMBOL(sk_stop_timer);
2779
2780 void sock_init_data(struct socket *sock, struct sock *sk)
2781 {       /* Set generic defaults on @sk and attach it to @sock, which may be NULL. */
2782         sk_init_common(sk);
2783         sk->sk_send_head        =       NULL;
2784         /* sk_timer is set up without a callback (NULL) and with no flags. */
2785         timer_setup(&sk->sk_timer, NULL, 0);
2786         /* Generic defaults; buffer sizes come from the rmem/wmem default sysctls. */
2787         sk->sk_allocation       =       GFP_KERNEL;
2788         sk->sk_rcvbuf           =       sysctl_rmem_default;
2789         sk->sk_sndbuf           =       sysctl_wmem_default;
2790         sk->sk_state            =       TCP_CLOSE;
2791         sk_set_socket(sk, sock);
2792
2793         sock_set_flag(sk, SOCK_ZAPPED);
2794         /* With @sock: take type, wait queue and uid from it; without: no wait queue, kuid 0. */
2795         if (sock) {
2796                 sk->sk_type     =       sock->type;
2797                 sk->sk_wq       =       sock->wq;
2798                 sock->sk        =       sk;
2799                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2800         } else {
2801                 sk->sk_wq       =       NULL;
2802                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2803         }
2804         /* Kernel and user sockets get separate per-family lockdep classes. */
2805         rwlock_init(&sk->sk_callback_lock);
2806         if (sk->sk_kern_sock)
2807                 lockdep_set_class_and_name(
2808                         &sk->sk_callback_lock,
2809                         af_kern_callback_keys + sk->sk_family,
2810                         af_family_kern_clock_key_strings[sk->sk_family]);
2811         else
2812                 lockdep_set_class_and_name(
2813                         &sk->sk_callback_lock,
2814                         af_callback_keys + sk->sk_family,
2815                         af_family_clock_key_strings[sk->sk_family]);
2816         /* Install the default callbacks defined in this file. */
2817         sk->sk_state_change     =       sock_def_wakeup;
2818         sk->sk_data_ready       =       sock_def_readable;
2819         sk->sk_write_space      =       sock_def_write_space;
2820         sk->sk_error_report     =       sock_def_error_report;
2821         sk->sk_destruct         =       sock_def_destruct;
2822
2823         sk->sk_frag.page        =       NULL;
2824         sk->sk_frag.offset      =       0;
2825         sk->sk_peek_off         =       -1;
2826
2827         sk->sk_peer_pid         =       NULL;
2828         sk->sk_peer_cred        =       NULL;
2829         sk->sk_write_pending    =       0;
2830         sk->sk_rcvlowat         =       1;
2831         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2832         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2833         /* sk_stamp_seq only exists (and is initialised) on 32-bit builds. */
2834         sk->sk_stamp = SK_DEFAULT_STAMP;
2835 #if BITS_PER_LONG==32
2836         seqlock_init(&sk->sk_stamp_seq);
2837 #endif
2838         atomic_set(&sk->sk_zckey, 0);
2839
2840 #ifdef CONFIG_NET_RX_BUSY_POLL
2841         sk->sk_napi_id          =       0;
2842         sk->sk_ll_usec          =       sysctl_net_busy_read;
2843 #endif
2844         /* Pacing rates start at the maximum value (~0U). */
2845         sk->sk_max_pacing_rate = ~0U;
2846         sk->sk_pacing_rate = ~0U;
2847         sk->sk_pacing_shift = 10;
2848         sk->sk_incoming_cpu = -1;
2849
2850         sk_rx_queue_clear(sk);
2851         /*
2852          * Before updating sk_refcnt, we must commit prior changes to memory
2853          * (Documentation/RCU/rculist_nulls.txt for details)
2854          */
2855         smp_wmb();
2856         refcount_set(&sk->sk_refcnt, 1);
2857         atomic_set(&sk->sk_drops, 0);
2858 }
2859 EXPORT_SYMBOL(sock_init_data);
2860
2861 void lock_sock_nested(struct sock *sk, int subclass)
2862 {
2863         might_sleep();
2864         spin_lock_bh(&sk->sk_lock.slock);
2865         if (sk->sk_lock.owned)  /* someone else owns the socket: go through __lock_sock() */
2866                 __lock_sock(sk);
2867         sk->sk_lock.owned = 1;
2868         spin_unlock(&sk->sk_lock.slock);        /* slock only; BHs stay off until local_bh_enable() */
2869         /*
2870          * The sk_lock has mutex_lock() semantics here:
2871          */
2872         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2873         local_bh_enable();
2874 }
2875 EXPORT_SYMBOL(lock_sock_nested);
2876
2877 void release_sock(struct sock *sk)
2878 {
2879         spin_lock_bh(&sk->sk_lock.slock);
2880         if (sk->sk_backlog.tail)        /* backlog has a tail: hand it to __release_sock() first */
2881                 __release_sock(sk);
2882
2883         /* Warning : release_cb() might need to release sk ownership,
2884          * ie call sock_release_ownership(sk) before us.
2885          */
2886         if (sk->sk_prot->release_cb)
2887                 sk->sk_prot->release_cb(sk);
2888         /* Give up ownership, then wake anyone waiting on sk_lock.wq. */
2889         sock_release_ownership(sk);
2890         if (waitqueue_active(&sk->sk_lock.wq))
2891                 wake_up(&sk->sk_lock.wq);
2892         spin_unlock_bh(&sk->sk_lock.slock);
2893 }
2894 EXPORT_SYMBOL(release_sock);
2895
2896 /**
2897  * lock_sock_fast - fast version of lock_sock
2898  * @sk: socket
2899  *
2900  * This version should be used for very small sections, where the process won't block
2901  * return false if fast path is taken:
2902  *
2903  *   sk_lock.slock locked, owned = 0, BH disabled
2904  *
2905  * return true if slow path is taken:
2906  *
2907  *   sk_lock.slock unlocked, owned = 1, BH enabled
2908  */
2909 bool lock_sock_fast(struct sock *sk)
2910 {
2911         might_sleep();
2912         spin_lock_bh(&sk->sk_lock.slock);
2913         /* Fast path: nobody owns the socket, so return with slock held and BHs off. */
2914         if (!sk->sk_lock.owned)
2915                 /*
2916                  * Note : We must disable BH
2917                  */
2918                 return false;
2919         /* Slow path: same sequence as lock_sock_nested() with subclass 0. */
2920         __lock_sock(sk);
2921         sk->sk_lock.owned = 1;
2922         spin_unlock(&sk->sk_lock.slock);
2923         /*
2924          * The sk_lock has mutex_lock() semantics here:
2925          */
2926         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2927         local_bh_enable();
2928         return true;
2929 }
2930 EXPORT_SYMBOL(lock_sock_fast);
2931
2932 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2933 {
2934         struct timeval stamp;
2935         /* Copy the socket's last timestamp to user space as a timeval. */
2936         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2937         stamp = ktime_to_timeval(sock_read_timestamp(sk));
2938         if (stamp.tv_sec == -1)
2939                 return -ENOENT;
2940         if (stamp.tv_sec == 0) {        /* zero seconds: store and report the current real time */
2941                 ktime_t now = ktime_get_real();
2942                 sock_write_timestamp(sk, now);
2943                 stamp = ktime_to_timeval(now);
2944         }
2945         return copy_to_user(userstamp, &stamp, sizeof(stamp)) ? -EFAULT : 0;
2946 }
2947 EXPORT_SYMBOL(sock_get_timestamp);
2948
2949 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2950 {
2951         struct timespec ts;
2952         /* Copy the last timestamp to @userstamp; returns 0, -ENOENT or -EFAULT. */
2953         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2954         ts = ktime_to_timespec(sock_read_timestamp(sk));
2955         if (ts.tv_sec == -1)
2956                 return -ENOENT;
2957         if (ts.tv_sec == 0) {   /* zero seconds: store and report the current real time */
2958                 ktime_t kt = ktime_get_real();
2959                 sock_write_timestamp(sk, kt);
2960                 ts = ktime_to_timespec(kt);     /* convert the value just written; never re-read sk_stamp unguarded */
2961         }
2962         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2963 }
2964 EXPORT_SYMBOL(sock_get_timestampns);
2965
2966 void sock_enable_timestamp(struct sock *sk, int flag)
2967 {
2968         unsigned long flags_before;
2969
2970         if (sock_flag(sk, flag))
2971                 return;         /* flag already set, nothing to do */
2972
2973         flags_before = sk->sk_flags;
2974         sock_set_flag(sk, flag);
2975         /*
2976          * One of the two net-timestamping flags was just set, but the
2977          * other one may already have turned net time stamping on.
2978          */
2979         if (sock_needs_netstamp(sk) && !(flags_before & SK_FLAGS_TIMESTAMP))
2980                 net_enable_timestamp();
2981 }
2982
2983 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2984                        int level, int type)
2985 {
2986         struct sk_buff *err_skb = sock_dequeue_err_skb(sk);
2987         struct sock_exterr_skb *ext;
2988         int nbytes, ret;
2989
2990         if (!err_skb)
2991                 return -EAGAIN;         /* nothing was dequeued */
2992
2993         /* Copy at most @len bytes and flag any truncation to the caller. */
2994         nbytes = err_skb->len;
2995         if (nbytes > len) {
2996                 msg->msg_flags |= MSG_TRUNC;
2997                 nbytes = len;
2998         }
2999         ret = skb_copy_datagram_msg(err_skb, 0, msg, nbytes);
3000         if (ret)
3001                 goto free_skb;
3002
3003         sock_recv_timestamp(msg, sk, err_skb);
3004
3005         /* Pass the extended error on as a control message. */
3006         ext = SKB_EXT_ERR(err_skb);
3007         put_cmsg(msg, level, type, sizeof(ext->ee), &ext->ee);
3008
3009         msg->msg_flags |= MSG_ERRQUEUE;
3010         ret = nbytes;
3011
3012 free_skb:
3013         /* the dequeued skb is freed on success and on copy failure alike */
3014         kfree_skb(err_skb);
3015         return ret;
3016 }
3017 EXPORT_SYMBOL(sock_recv_errqueue);
3018
3019 /*
3020  *      Get a socket option on a socket.
3021  *
3022  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3023  *      asynchronous errors should be reported by getsockopt. We assume
3024  *      this means if you specify SO_ERROR (otherwise what's the point of it).
3025  */
3026 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3027                            char __user *optval, int __user *optlen)
3028 {
3029         struct sock *sk = sock->sk;
3030         /* Pure dispatch to the protocol's getsockopt handler. */
3031         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3032 }
3033 EXPORT_SYMBOL(sock_common_getsockopt);
3034
3035 #ifdef CONFIG_COMPAT
3036 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3037                                   char __user *optval, int __user *optlen)
3038 {
3039         struct sock *sk = sock->sk;
3040
3041         if (sk->sk_prot->compat_getsockopt == NULL)     /* no compat handler: use the native one */
3042                 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3043         return sk->sk_prot->compat_getsockopt(sk, level, optname,
3044                                               optval, optlen);
3045 }
3046 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3047 #endif
3048
3049 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3050                         int flags)
3051 {
3052         struct sock *sk = sock->sk;
3053         int namelen = 0;
3054         int copied;
3055         /* MSG_DONTWAIT is passed on separately from the remaining flags. */
3056         copied = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3057                                       flags & ~MSG_DONTWAIT, &namelen);
3058         if (copied >= 0)        /* the name length is only reported on success */
3059                 msg->msg_namelen = namelen;
3060         return copied;
3061 }
3062 EXPORT_SYMBOL(sock_common_recvmsg);
3063
3064 /*
3065  *      Set socket options on an inet socket.
3066  */
3067 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3068                            char __user *optval, unsigned int optlen)
3069 {
3070         struct sock *sk = sock->sk;
3071         /* Pure dispatch to the protocol's setsockopt handler. */
3072         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3073 }
3074 EXPORT_SYMBOL(sock_common_setsockopt);
3075
3076 #ifdef CONFIG_COMPAT
3077 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3078                                   char __user *optval, unsigned int optlen)
3079 {
3080         struct sock *sk = sock->sk;
3081
3082         if (sk->sk_prot->compat_setsockopt == NULL)     /* no compat handler: use the native one */
3083                 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3084         return sk->sk_prot->compat_setsockopt(sk, level, optname,
3085                                               optval, optlen);
3086 }
3087 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3088 #endif
3089
3090 void sk_common_release(struct sock *sk)
3091 {
3092         if (sk->sk_prot->destroy)
3093                 sk->sk_prot->destroy(sk);
3094
3095         /*
3096          * Observation: when sk_common_release is called, processes have
3097          * no access to the socket any more, but the network stack still has.
3098          * Step one, detach it from networking:
3099          *
3100          * A. Remove from hash tables.
3101          */
3102
3103         sk->sk_prot->unhash(sk);
3104
3105         /*
3106          * At this point the socket cannot receive new packets, but it is possible
3107          * that some packets are in flight because some CPU runs the receiver and
3108          * did the hash table lookup before we unhashed the socket. They will reach
3109          * the receive queue and will be purged by the socket destructor.
3110          *
3111          * Also we still have packets pending on the receive queue and probably
3112          * our own packets waiting in device queues. sock_destroy will drain the
3113          * receive queue, but transmitted packets will delay socket destruction
3114          * until the last reference is released.
3115          */
3116
3117         sock_orphan(sk);
3118
3119         xfrm_sk_free_policy(sk);
3120
3121         sk_refcnt_debug_release(sk);
3122         /* Drop one reference on sk. */
3123         sock_put(sk);
3124 }
3125 EXPORT_SYMBOL(sk_common_release);
3126
3127 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3128 {
3129         memset(mem, 0, SK_MEMINFO_VARS * sizeof(mem[0]));
3130         /* @mem holds SK_MEMINFO_VARS entries; slots not set below stay 0. */
3131         mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3132         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3133         mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3134         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3135         mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3136         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3137         mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3138         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3139         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3140 }
3141
3142 #ifdef CONFIG_PROC_FS
3143 #define PROTO_INUSE_NR  64      /* number of inuse_idx slots; should be enough for now */
3144 struct prot_inuse {
3145         int val[PROTO_INUSE_NR];        /* one counter per inuse_idx */
3146 };
3147 /* Bit i set: index i has been handed out by assign_proto_idx(). */
3148 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3149
3150 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3151 {       /* add @val to this CPU's counter slot for @prot in @net */
3152         __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3153 }
3154 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3155
3156 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3157 {
3158         int cpu, slot = prot->inuse_idx;
3159         int total = 0;
3160         /* Sum the per-CPU counters of @prot; a negative sum is reported as 0. */
3161         for_each_possible_cpu(cpu)
3162                 total += per_cpu_ptr(net->core.prot_inuse, cpu)->val[slot];
3163
3164         return total < 0 ? 0 : total;
3165 }
3166 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3167
3168 static void sock_inuse_add(struct net *net, int val)
3169 {       /* add @val to this CPU's socket counter of @net */
3170         this_cpu_add(*net->core.sock_inuse, val);
3171 }
3172
3173 int sock_inuse_get(struct net *net)
3174 {
3175         int total = 0;
3176         int cpu;
3177         /* Sum @net's per-CPU socket counters; the sum is returned as is. */
3178         for_each_possible_cpu(cpu)
3179                 total += *per_cpu_ptr(net->core.sock_inuse, cpu);
3180         return total;
3181 }
3182
3183 EXPORT_SYMBOL_GPL(sock_inuse_get);
3184
3185 static int __net_init sock_inuse_init_net(struct net *net)
3186 {
3187         /* Per-CPU array holding the per-protocol counters. */
3188         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3189         if (!net->core.prot_inuse)
3190                 return -ENOMEM;
3191
3192         /* Per-CPU counter summed up by sock_inuse_get(). */
3193         net->core.sock_inuse = alloc_percpu(int);
3194         if (net->core.sock_inuse)
3195                 return 0;
3196
3197         /* Second allocation failed: release the first one before bailing out. */
3198         free_percpu(net->core.prot_inuse);
3199         return -ENOMEM;
3200 }
3201
3202 static void __net_exit sock_inuse_exit_net(struct net *net)
3203 {       /* release both per-CPU areas allocated by sock_inuse_init_net() */
3204         free_percpu(net->core.prot_inuse);
3205         free_percpu(net->core.sock_inuse);
3206 }
3207
3208 static struct pernet_operations net_inuse_ops = {
3209         .init = sock_inuse_init_net,    /* allocates the per-CPU counters */
3210         .exit = sock_inuse_exit_net,    /* frees them again */
3211 };
3212
3213 static __init int net_inuse_init(void)
3214 {
3215         /* Failing to register the per-netns counter ops is fatal: panic. */
3216         if (register_pernet_subsys(&net_inuse_ops) != 0)
3217                 panic("Cannot initialize net inuse counters");
3218         return 0;
3219 }
3220
3221 core_initcall(net_inuse_init);
3222
3223 static void assign_proto_idx(struct proto *prot)
3224 {
3225         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3226         /* The last slot is never marked used: protocols landing on it share that index. */
3227         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3228                 pr_err("PROTO_INUSE_NR exhausted\n");
3229                 return;
3230         }
3231         /* Reserve the slot; release_proto_idx() clears the bit again. */
3232         set_bit(prot->inuse_idx, proto_inuse_idx);
3233 }
3234
3235 static void release_proto_idx(struct proto *prot)
3236 {       /* the last index is the shared overflow slot and was never marked used */
3237         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3238                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3239 }
3240 #else
3241 static inline void assign_proto_idx(struct proto *prot)
3242 {       /* no index to assign when CONFIG_PROC_FS is off */
3243 }
3244
3245 static inline void release_proto_idx(struct proto *prot)
3246 {       /* no index to release when CONFIG_PROC_FS is off */
3247 }
3248
3249 static void sock_inuse_add(struct net *net, int val)
3250 {       /* no counters are kept when CONFIG_PROC_FS is off */
3251 }
3252 #endif
3253
3254 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3255 {
3256         if (rsk_prot) {         /* NULL ops: nothing to release */
3257                 kfree(rsk_prot->slab_name);
3258                 rsk_prot->slab_name = NULL;
3259                 kmem_cache_destroy(rsk_prot->slab);
3260                 rsk_prot->slab = NULL;
3261         }
3262 }
3263
3264 static int req_prot_init(const struct proto *prot)
3265 {
3266         struct request_sock_ops *ops = prot->rsk_prot;
3267
3268         if (ops == NULL)
3269                 return 0;       /* protocol has no request sock ops */
3270
3271         ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3272                                    prot->name);
3273         if (ops->slab_name == NULL)
3274                 return -ENOMEM;
3275
3276         ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
3277                                       SLAB_ACCOUNT | prot->slab_flags,
3278                                       NULL);
3279         if (ops->slab != NULL)
3280                 return 0;
3281
3282         /* slab_name stays allocated here; proto_register() releases it
3283          * through req_prot_cleanup() on this failure. */
3284         pr_crit("%s: Can't create request sock SLAB cache!\n",
3285                 prot->name);
3286         return -ENOMEM;
3287 }
3288
3289 int proto_register(struct proto *prot, int alloc_slab)
3290 {       /* Optionally create @prot's slab caches, then add it to proto_list. */
3291         if (alloc_slab) {
3292                 prot->slab = kmem_cache_create_usercopy(prot->name,
3293                                         prot->obj_size, 0,
3294                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3295                                         prot->slab_flags,
3296                                         prot->useroffset, prot->usersize,
3297                                         NULL);
3298
3299                 if (prot->slab == NULL) {
3300                         pr_crit("%s: Can't create sock SLAB cache!\n",
3301                                 prot->name);
3302                         goto out;
3303                 }
3304                 /* Request-sock cache; a no-op when prot->rsk_prot is NULL. */
3305                 if (req_prot_init(prot))
3306                         goto out_free_request_sock_slab;
3307                 /* Time-wait sock cache, named "tw_sock_<name>", if twsk_prot is set. */
3308                 if (prot->twsk_prot != NULL) {
3309                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3310
3311                         if (prot->twsk_prot->twsk_slab_name == NULL)
3312                                 goto out_free_request_sock_slab;
3313
3314                         prot->twsk_prot->twsk_slab =
3315                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3316                                                   prot->twsk_prot->twsk_obj_size,
3317                                                   0,
3318                                                   SLAB_ACCOUNT |
3319                                                   prot->slab_flags,
3320                                                   NULL);
3321                         if (prot->twsk_prot->twsk_slab == NULL)
3322                                 goto out_free_timewait_sock_slab_name;
3323                 }
3324         }
3325         /* Add to proto_list and pick an in-use index, both under the mutex. */
3326         mutex_lock(&proto_list_mutex);
3327         list_add(&prot->node, &proto_list);
3328         assign_proto_idx(prot);
3329         mutex_unlock(&proto_list_mutex);
3330         return 0;
3331         /* Error unwinding: each label undoes the steps that succeeded before it. */
3332 out_free_timewait_sock_slab_name:
3333         kfree(prot->twsk_prot->twsk_slab_name);
3334 out_free_request_sock_slab:
3335         req_prot_cleanup(prot->rsk_prot);
3336
3337         kmem_cache_destroy(prot->slab);
3338         prot->slab = NULL;
3339 out:
3340         return -ENOBUFS;        /* every failure above is reported as -ENOBUFS */
3341 }
3342 EXPORT_SYMBOL(proto_register);
3343
3344 void proto_unregister(struct proto *prot)
3345 {       /* Take @prot off proto_list and destroy the caches proto_register() made. */
3346         mutex_lock(&proto_list_mutex);
3347         release_proto_idx(prot);
3348         list_del(&prot->node);
3349         mutex_unlock(&proto_list_mutex);
3350
3351         kmem_cache_destroy(prot->slab);
3352         prot->slab = NULL;
3353
3354         req_prot_cleanup(prot->rsk_prot);
3355         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3356                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3357                 prot->twsk_prot->twsk_slab = NULL;
3358                 kfree(prot->twsk_prot->twsk_slab_name);
3359                 prot->twsk_prot->twsk_slab_name = NULL; /* no dangling name, as in req_prot_cleanup() */
3360         }
3361 }
3362 EXPORT_SYMBOL(proto_unregister);
3363
3364 int sock_load_diag_module(int family, int protocol)
3365 {
3366         if (!protocol) {
3367                 if (!sock_is_registered(family))
3368                         return -ENOENT;
3369
3370                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3371                                       NETLINK_SOCK_DIAG, family);
3372         }
3373
3374 #ifdef CONFIG_INET
3375         if (family == AF_INET &&
3376             protocol != IPPROTO_RAW &&
3377             !rcu_access_pointer(inet_protos[protocol]))
3378                 return -ENOENT;
3379 #endif
3380
3381         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3382                               NETLINK_SOCK_DIAG, family, protocol);
3383 }
3384 EXPORT_SYMBOL(sock_load_diag_module);
3385
3386 #ifdef CONFIG_PROC_FS
3387 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3388         __acquires(proto_list_mutex)
3389 {
3390         mutex_lock(&proto_list_mutex);  /* released again in proto_seq_stop() */
3391         return seq_list_start_head(&proto_list, *pos);
3392 }
3393
3394 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3395 {       /* step to the entry after @v on proto_list */
3396         return seq_list_next(v, &proto_list, pos);
3397 }
3398
3399 static void proto_seq_stop(struct seq_file *seq, void *v)
3400         __releases(proto_list_mutex)
3401 {
3402         mutex_unlock(&proto_list_mutex);        /* taken in proto_seq_start() */
3403 }
3404
3405 static char proto_method_implemented(const void *method)
3406 {
3407         return method ? 'y' : 'n';      /* 'y' for a non-NULL method pointer */
3408 }
3409 static long sock_prot_memory_allocated(struct proto *proto)
3410 {       /* -1 when the protocol has no memory_allocated counter */
3411         return proto->memory_allocated == NULL ? -1L : proto_memory_allocated(proto);
3412 }
3413
3414 static char *sock_prot_memory_pressure(struct proto *proto)
3415 {       /* "NI" when the protocol has no memory_pressure field set */
3416         return !proto->memory_pressure ? "NI" :
3417                proto_memory_pressure(proto) ? "yes" : "no";
3418 }
3419
3420 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3421 {
3422         /* One row per protocol; the y/n flags follow the header's column order. */
3423         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3424                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3425                    proto->name,
3426                    proto->obj_size,
3427                    sock_prot_inuse_get(seq_file_net(seq), proto),
3428                    sock_prot_memory_allocated(proto),
3429                    sock_prot_memory_pressure(proto),
3430                    proto->max_header,
3431                    proto->slab == NULL ? "no" : "yes",
3432                    module_name(proto->owner),
3433                    proto_method_implemented(proto->close),
3434                    proto_method_implemented(proto->connect),
3435                    proto_method_implemented(proto->disconnect),
3436                    proto_method_implemented(proto->accept),
3437                    proto_method_implemented(proto->ioctl),
3438                    proto_method_implemented(proto->init),
3439                    proto_method_implemented(proto->destroy),
3440                    proto_method_implemented(proto->shutdown),
3441                    proto_method_implemented(proto->setsockopt),
3442                    proto_method_implemented(proto->getsockopt),
3443                    proto_method_implemented(proto->sendmsg),
3444                    proto_method_implemented(proto->recvmsg),
3445                    proto_method_implemented(proto->sendpage),
3446                    proto_method_implemented(proto->bind),
3447                    proto_method_implemented(proto->backlog_rcv),
3448                    proto_method_implemented(proto->hash),
3449                    proto_method_implemented(proto->unhash),
3450                    proto_method_implemented(proto->get_port),
3451                    proto_method_implemented(proto->enter_memory_pressure));
3452 }
3453
3454 static int proto_seq_show(struct seq_file *seq, void *v)
3455 {
3456         if (v == &proto_list)
3457                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3458                            "protocol",
3459                            "size",
3460                            "sockets",
3461                            "memory",
3462                            "press",
3463                            "maxhdr",
3464                            "slab",
3465                            "module",
3466                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3467         else
3468                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3469         return 0;
3470 }
3471
3472 static const struct seq_operations proto_seq_ops = {
3473         .start  = proto_seq_start,
3474         .next   = proto_seq_next,
3475         .stop   = proto_seq_stop,
3476         .show   = proto_seq_show,
3477 };
3478
3479 static __net_init int proto_init_net(struct net *net)
3480 {
3481         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3482                         sizeof(struct seq_net_private)))
3483                 return -ENOMEM;
3484
3485         return 0;
3486 }
3487
3488 static __net_exit void proto_exit_net(struct net *net)
3489 {
3490         remove_proc_entry("protocols", net->proc_net);
3491 }
3492
3493
3494 static __net_initdata struct pernet_operations proto_net_ops = {
3495         .init = proto_init_net,
3496         .exit = proto_exit_net,
3497 };
3498
3499 static int __init proto_init(void)
3500 {
3501         return register_pernet_subsys(&proto_net_ops);
3502 }
3503
3504 subsys_initcall(proto_init);
3505
3506 #endif /* PROC_FS */
3507
3508 #ifdef CONFIG_NET_RX_BUSY_POLL
3509 bool sk_busy_loop_end(void *p, unsigned long start_time)
3510 {
3511         struct sock *sk = p;
3512
3513         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3514                sk_busy_loop_timeout(sk, start_time);
3515 }
3516 EXPORT_SYMBOL(sk_busy_loop_end);
3517 #endif /* CONFIG_NET_RX_BUSY_POLL */