net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/sched/mm.h>
 106 #include <linux/timer.h>
 107 #include <linux/string.h>
 108 #include <linux/sockios.h>
 109 #include <linux/net.h>
 110 #include <linux/mm.h>
 111 #include <linux/slab.h>
 112 #include <linux/interrupt.h>
 113 #include <linux/poll.h>
 114 #include <linux/tcp.h>
 115 #include <linux/init.h>
 116 #include <linux/highmem.h>
 117 #include <linux/user_namespace.h>
 118 #include <linux/static_key.h>
 119 #include <linux/memcontrol.h>
 120 #include <linux/prefetch.h>
 121
 122 #include <linux/uaccess.h>
 123
 124 #include <linux/netdevice.h>
 125 #include <net/protocol.h>
 126 #include <linux/skbuff.h>
 127 #include <net/net_namespace.h>
 128 #include <net/request_sock.h>
 129 #include <net/sock.h>
 130 #include <linux/net_tstamp.h>
 131 #include <net/xfrm.h>
 132 #include <linux/ipsec.h>
 133 #include <net/cls_cgroup.h>
 134 #include <net/netprio_cgroup.h>
 135 #include <linux/sock_diag.h>
 136
 137 #include <linux/filter.h>
 138 #include <net/sock_reuseport.h>
 139
 140 #include <trace/events/sock.h>
 141
 142 #include <net/tcp.h>
 143 #include <net/busy_poll.h>
 144
 145 static DEFINE_MUTEX(proto_list_mutex);
 146 static LIST_HEAD(proto_list);
 147
 148 /**
 149  * sk_ns_capable - General socket capability test
 150  * @sk: Socket to use a capability on or through
 151  * @user_ns: The user namespace of the capability to use
 152  * @cap: The capability to use
 153  *
 154  * Test to see if the opener of the socket had when the socket was
 155  * created and the current process has the capability @cap in the user
 156  * namespace @user_ns.
 157  */
 158 bool sk_ns_capable(const struct sock *sk,
 159                    struct user_namespace *user_ns, int cap)
 160 {
 161         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 162                 ns_capable(user_ns, cap);
 163 }
 164 EXPORT_SYMBOL(sk_ns_capable);
 165
 166 /**
 167  * sk_capable - Socket global capability test
 168  * @sk: Socket to use a capability on or through
 169  * @cap: The global capability to use
 170  *
 171  * Test to see if the opener of the socket had when the socket was
 172  * created and the current process has the capability @cap in all user
 173  * namespaces.
 174  */
 175 bool sk_capable(const struct sock *sk, int cap)
 176 {
 177         return sk_ns_capable(sk, &init_user_ns, cap);
 178 }
 179 EXPORT_SYMBOL(sk_capable);
 180
 181 /**
 182  * sk_net_capable - Network namespace socket capability test
 183  * @sk: Socket to use a capability on or through
 184  * @cap: The capability to use
 185  *
 186  * Test to see if the opener of the socket had when the socket was created
 187  * and the current process has the capability @cap over the network namespace
 188  * the socket is a member of.
 189  */
 190 bool sk_net_capable(const struct sock *sk, int cap)
 191 {
 192         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 193 }
 194 EXPORT_SYMBOL(sk_net_capable);
 195
 196 /*
 197  * Each address family might have different locking rules, so we have
 198  * one slock key per address family and separate keys for internal and
 199  * userspace sockets.
 200  */
 201 static struct lock_class_key af_family_keys[AF_MAX];
 202 static struct lock_class_key af_family_kern_keys[AF_MAX];
 203 static struct lock_class_key af_family_slock_keys[AF_MAX];
 204 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 205
 206 /*
 207  * Make lock validator output more readable. (we pre-construct these
 208  * strings build-time, so that runtime initialization of socket
 209  * locks is fast):
 210  */
 211
 212 #define _sock_locks(x)                                            \
 213   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 214   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 215   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 216   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 217   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 218   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 219   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 220   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 221   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 222   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 223   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 224   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 225   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 226   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 227   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_MAX"
 228
 229 static const char *const af_family_key_strings[AF_MAX+1] = {
 230         _sock_locks("sk_lock-")
 231 };
 232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 233         _sock_locks("slock-")
 234 };
 235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 236         _sock_locks("clock-")
 237 };
 238
 239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 240         _sock_locks("k-sk_lock-")
 241 };
 242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 243         _sock_locks("k-slock-")
 244 };
 245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 246         _sock_locks("k-clock-")
 247 };
 248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 249   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 250   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 251   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 252   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 253   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 254   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 255   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 256   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 257   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 258   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 259   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 260   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 261   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 262   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 263   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
 264 };
 265 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 266   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 267   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 268   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 269   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 270   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 271   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 272   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 273   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 274   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 275   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 276   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 277   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 278   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 279   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 280   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
 281 };
 282 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 283   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 284   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 285   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 286   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 287   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 288   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 289   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 290   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 291   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 292   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 293   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 294   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 295   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 296   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 297   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
 298 };
 299
 300 /*
 301  * sk_callback_lock and sk queues locking rules are per-address-family,
 302  * so split the lock classes by using a per-AF key:
 303  */
 304 static struct lock_class_key af_callback_keys[AF_MAX];
 305 static struct lock_class_key af_rlock_keys[AF_MAX];
 306 static struct lock_class_key af_wlock_keys[AF_MAX];
 307 static struct lock_class_key af_elock_keys[AF_MAX];
 308 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 309
 310 /* Run time adjustable parameters. */
 311 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 312 EXPORT_SYMBOL(sysctl_wmem_max);
 313 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 314 EXPORT_SYMBOL(sysctl_rmem_max);
 315 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 316 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 317
 318 /* Maximal space eaten by iovec or ancillary data plus some space */
 319 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 320 EXPORT_SYMBOL(sysctl_optmem_max);
 321
 322 int sysctl_tstamp_allow_data __read_mostly = 1;
 323
 324 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 325 EXPORT_SYMBOL_GPL(memalloc_socks);
 326
 327 /**
 328  * sk_set_memalloc - sets %SOCK_MEMALLOC
 329  * @sk: socket to set it on
 330  *
 331  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 332  * It's the responsibility of the admin to adjust min_free_kbytes
 333  * to meet the requirements
 334  */
 335 void sk_set_memalloc(struct sock *sk)
 336 {
 337         sock_set_flag(sk, SOCK_MEMALLOC);
 338         sk->sk_allocation |= __GFP_MEMALLOC;
 339         static_key_slow_inc(&memalloc_socks);
 340 }
 341 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 342
 343 void sk_clear_memalloc(struct sock *sk)
 344 {
 345         sock_reset_flag(sk, SOCK_MEMALLOC);
 346         sk->sk_allocation &= ~__GFP_MEMALLOC;
 347         static_key_slow_dec(&memalloc_socks);
 348
 349         /*
 350          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 351          * progress of swapping. SOCK_MEMALLOC may be cleared while
 352          * it has rmem allocations due to the last swapfile being deactivated
 353          * but there is a risk that the socket is unusable due to exceeding
 354          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 355          */
 356         sk_mem_reclaim(sk);
 357 }
 358 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 359
 360 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 361 {
 362         int ret;
 363         unsigned int noreclaim_flag;
 364
 365         /* these should have been dropped before queueing */
 366         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 367
 368         noreclaim_flag = memalloc_noreclaim_save();
 369         ret = sk->sk_backlog_rcv(sk, skb);
 370         memalloc_noreclaim_restore(noreclaim_flag);
 371
 372         return ret;
 373 }
 374 EXPORT_SYMBOL(__sk_backlog_rcv);
 375
 376 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 377 {
 378         struct timeval tv;
 379
 380         if (optlen < sizeof(tv))
 381                 return -EINVAL;
 382         if (copy_from_user(&tv, optval, sizeof(tv)))
 383                 return -EFAULT;
 384         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 385                 return -EDOM;
 386
 387         if (tv.tv_sec < 0) {
 388                 static int warned __read_mostly;
 389
 390                 *timeo_p = 0;
 391                 if (warned < 10 && net_ratelimit()) {
 392                         warned++;
 393                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 394                                 __func__, current->comm, task_pid_nr(current));
 395                 }
 396                 return 0;
 397         }
 398         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 399         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 400                 return 0;
 401         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 402                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 403         return 0;
 404 }
 405
 406 static void sock_warn_obsolete_bsdism(const char *name)
 407 {
 408         static int warned;
 409         static char warncomm[TASK_COMM_LEN];
 410         if (strcmp(warncomm, current->comm) && warned < 5) {
 411                 strcpy(warncomm,  current->comm);
 412                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 413                         warncomm, name);
 414                 warned++;
 415         }
 416 }
 417
 418 static bool sock_needs_netstamp(const struct sock *sk)
 419 {
 420         switch (sk->sk_family) {
 421         case AF_UNSPEC:
 422         case AF_UNIX:
 423                 return false;
 424         default:
 425                 return true;
 426         }
 427 }
 428
 429 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 430 {
 431         if (sk->sk_flags & flags) {
 432                 sk->sk_flags &= ~flags;
 433                 if (sock_needs_netstamp(sk) &&
 434                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 435                         net_disable_timestamp();
 436         }
 437 }
 438
 439
 440 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 441 {
 442         unsigned long flags;
 443         struct sk_buff_head *list = &sk->sk_receive_queue;
 444
 445         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 446                 atomic_inc(&sk->sk_drops);
 447                 trace_sock_rcvqueue_full(sk, skb);
 448                 return -ENOMEM;
 449         }
 450
 451         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 452                 atomic_inc(&sk->sk_drops);
 453                 return -ENOBUFS;
 454         }
 455
 456         skb->dev = NULL;
 457         skb_set_owner_r(skb, sk);
 458
 459         /* we escape from rcu protected region, make sure we dont leak
 460          * a norefcounted dst
 461          */
 462         skb_dst_force(skb);
 463
 464         spin_lock_irqsave(&list->lock, flags);
 465         sock_skb_set_dropcount(sk, skb);
 466         __skb_queue_tail(list, skb);
 467         spin_unlock_irqrestore(&list->lock, flags);
 468
 469         if (!sock_flag(sk, SOCK_DEAD))
 470                 sk->sk_data_ready(sk);
 471         return 0;
 472 }
 473 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 474
 475 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 476 {
 477         int err;
 478
 479         err = sk_filter(sk, skb);
 480         if (err)
 481                 return err;
 482
 483         return __sock_queue_rcv_skb(sk, skb);
 484 }
 485 EXPORT_SYMBOL(sock_queue_rcv_skb);
 486
 487 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 488                      const int nested, unsigned int trim_cap, bool refcounted)
 489 {
 490         int rc = NET_RX_SUCCESS;
 491
 492         if (sk_filter_trim_cap(sk, skb, trim_cap))
 493                 goto discard_and_relse;
 494
 495         skb->dev = NULL;
 496
 497         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 498                 atomic_inc(&sk->sk_drops);
 499                 goto discard_and_relse;
 500         }
 501         if (nested)
 502                 bh_lock_sock_nested(sk);
 503         else
 504                 bh_lock_sock(sk);
 505         if (!sock_owned_by_user(sk)) {
 506                 /*
 507                  * trylock + unlock semantics:
 508                  */
 509                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 510
 511                 rc = sk_backlog_rcv(sk, skb);
 512
 513                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 514         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 515                 bh_unlock_sock(sk);
 516                 atomic_inc(&sk->sk_drops);
 517                 goto discard_and_relse;
 518         }
 519
 520         bh_unlock_sock(sk);
 521 out:
 522         if (refcounted)
 523                 sock_put(sk);
 524         return rc;
 525 discard_and_relse:
 526         kfree_skb(skb);
 527         goto out;
 528 }
 529 EXPORT_SYMBOL(__sk_receive_skb);
 530
 531 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 532 {
 533         struct dst_entry *dst = __sk_dst_get(sk);
 534
 535         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 536                 sk_tx_queue_clear(sk);
 537                 sk->sk_dst_pending_confirm = 0;
 538                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 539                 dst_release(dst);
 540                 return NULL;
 541         }
 542
 543         return dst;
 544 }
 545 EXPORT_SYMBOL(__sk_dst_check);
 546
 547 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 548 {
 549         struct dst_entry *dst = sk_dst_get(sk);
 550
 551         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 552                 sk_dst_reset(sk);
 553                 dst_release(dst);
 554                 return NULL;
 555         }
 556
 557         return dst;
 558 }
 559 EXPORT_SYMBOL(sk_dst_check);
 560
 561 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 562                                 int optlen)
 563 {
 564         int ret = -ENOPROTOOPT;
 565 #ifdef CONFIG_NETDEVICES
 566         struct net *net = sock_net(sk);
 567         char devname[IFNAMSIZ];
 568         int index;
 569
 570         /* Sorry... */
 571         ret = -EPERM;
 572         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 573                 goto out;
 574
 575         ret = -EINVAL;
 576         if (optlen < 0)
 577                 goto out;
 578
 579         /* Bind this socket to a particular device like "eth0",
 580          * as specified in the passed interface name. If the
 581          * name is "" or the option length is zero the socket
 582          * is not bound.
 583          */
 584         if (optlen > IFNAMSIZ - 1)
 585                 optlen = IFNAMSIZ - 1;
 586         memset(devname, 0, sizeof(devname));
 587
 588         ret = -EFAULT;
 589         if (copy_from_user(devname, optval, optlen))
 590                 goto out;
 591
 592         index = 0;
 593         if (devname[0] != '\0') {
 594                 struct net_device *dev;
 595
 596                 rcu_read_lock();
 597                 dev = dev_get_by_name_rcu(net, devname);
 598                 if (dev)
 599                         index = dev->ifindex;
 600                 rcu_read_unlock();
 601                 ret = -ENODEV;
 602                 if (!dev)
 603                         goto out;
 604         }
 605
 606         lock_sock(sk);
 607         sk->sk_bound_dev_if = index;
 608         sk_dst_reset(sk);
 609         release_sock(sk);
 610
 611         ret = 0;
 612
 613 out:
 614 #endif
 615
 616         return ret;
 617 }
 618
 619 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 620                                 int __user *optlen, int len)
 621 {
 622         int ret = -ENOPROTOOPT;
 623 #ifdef CONFIG_NETDEVICES
 624         struct net *net = sock_net(sk);
 625         char devname[IFNAMSIZ];
 626
 627         if (sk->sk_bound_dev_if == 0) {
 628                 len = 0;
 629                 goto zero;
 630         }
 631
 632         ret = -EINVAL;
 633         if (len < IFNAMSIZ)
 634                 goto out;
 635
 636         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 637         if (ret)
 638                 goto out;
 639
 640         len = strlen(devname) + 1;
 641
 642         ret = -EFAULT;
 643         if (copy_to_user(optval, devname, len))
 644                 goto out;
 645
 646 zero:
 647         ret = -EFAULT;
 648         if (put_user(len, optlen))
 649                 goto out;
 650
 651         ret = 0;
 652
 653 out:
 654 #endif
 655
 656         return ret;
 657 }
 658
 659 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 660 {
 661         if (valbool)
 662                 sock_set_flag(sk, bit);
 663         else
 664                 sock_reset_flag(sk, bit);
 665 }
 666
 667 bool sk_mc_loop(struct sock *sk)
 668 {
 669         if (dev_recursion_level())
 670                 return false;
 671         if (!sk)
 672                 return true;
 673         switch (sk->sk_family) {
 674         case AF_INET:
 675                 return inet_sk(sk)->mc_loop;
 676 #if IS_ENABLED(CONFIG_IPV6)
 677         case AF_INET6:
 678                 return inet6_sk(sk)->mc_loop;
 679 #endif
 680         }
 681         WARN_ON(1);
 682         return true;
 683 }
 684 EXPORT_SYMBOL(sk_mc_loop);
 685
 686 /*
 687  *      This is meant for all protocols to use and covers goings on
 688  *      at the socket level. Everything here is generic.
 689  */
 690
 691 int sock_setsockopt(struct socket *sock, int level, int optname,
 692                     char __user *optval, unsigned int optlen)
 693 {
 694         struct sock *sk = sock->sk;
 695         int val;
 696         int valbool;
 697         struct linger ling;
 698         int ret = 0;
 699
 700         /*
 701          *      Options without arguments
 702          */
 703
 704         if (optname == SO_BINDTODEVICE)
 705                 return sock_setbindtodevice(sk, optval, optlen);
 706
 707         if (optlen < sizeof(int))
 708                 return -EINVAL;
 709
 710         if (get_user(val, (int __user *)optval))
 711                 return -EFAULT;
 712
 713         valbool = val ? 1 : 0;
 714
 715         lock_sock(sk);
 716
 717         switch (optname) {
 718         case SO_DEBUG:
 719                 if (val && !capable(CAP_NET_ADMIN))
 720                         ret = -EACCES;
 721                 else
 722                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 723                 break;
 724         case SO_REUSEADDR:
 725                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 726                 break;
 727         case SO_REUSEPORT:
 728                 sk->sk_reuseport = valbool;
 729                 break;
 730         case SO_TYPE:
 731         case SO_PROTOCOL:
 732         case SO_DOMAIN:
 733         case SO_ERROR:
 734                 ret = -ENOPROTOOPT;
 735                 break;
 736         case SO_DONTROUTE:
 737                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 738                 sk_dst_reset(sk);
 739                 break;
 740         case SO_BROADCAST:
 741                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 742                 break;
 743         case SO_SNDBUF:
 744                 /* Don't error on this BSD doesn't and if you think
 745                  * about it this is right. Otherwise apps have to
 746                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 747                  * are treated in BSD as hints
 748                  */
 749                 val = min_t(u32, val, sysctl_wmem_max);
 750 set_sndbuf:
 751                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 752                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 753                 /* Wake up sending tasks if we upped the value. */
 754                 sk->sk_write_space(sk);
 755                 break;
 756
 757         case SO_SNDBUFFORCE:
 758                 if (!capable(CAP_NET_ADMIN)) {
 759                         ret = -EPERM;
 760                         break;
 761                 }
 762                 goto set_sndbuf;
 763
 764         case SO_RCVBUF:
 765                 /* Don't error on this BSD doesn't and if you think
 766                  * about it this is right. Otherwise apps have to
 767                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 768                  * are treated in BSD as hints
 769                  */
 770                 val = min_t(u32, val, sysctl_rmem_max);
 771 set_rcvbuf:
 772                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 773                 /*
 774                  * We double it on the way in to account for
 775                  * "struct sk_buff" etc. overhead.   Applications
 776                  * assume that the SO_RCVBUF setting they make will
 777                  * allow that much actual data to be received on that
 778                  * socket.
 779                  *
 780                  * Applications are unaware that "struct sk_buff" and
 781                  * other overheads allocate from the receive buffer
 782                  * during socket buffer allocation.
 783                  *
 784                  * And after considering the possible alternatives,
 785                  * returning the value we actually used in getsockopt
 786                  * is the most desirable behavior.
 787                  */
 788                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 789                 break;
 790
 791         case SO_RCVBUFFORCE:
 792                 if (!capable(CAP_NET_ADMIN)) {
 793                         ret = -EPERM;
 794                         break;
 795                 }
 796                 goto set_rcvbuf;
 797
 798         case SO_KEEPALIVE:
 799                 if (sk->sk_prot->keepalive)
 800                         sk->sk_prot->keepalive(sk, valbool);
 801                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 802                 break;
 803
 804         case SO_OOBINLINE:
 805                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 806                 break;
 807
 808         case SO_NO_CHECK:
 809                 sk->sk_no_check_tx = valbool;
 810                 break;
 811
 812         case SO_PRIORITY:
 813                 if ((val >= 0 && val <= 6) ||
 814                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 815                         sk->sk_priority = val;
 816                 else
 817                         ret = -EPERM;
 818                 break;
 819
 820         case SO_LINGER:
 821                 if (optlen < sizeof(ling)) {
 822                         ret = -EINVAL;  /* 1003.1g */
 823                         break;
 824                 }
 825                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 826                         ret = -EFAULT;
 827                         break;
 828                 }
 829                 if (!ling.l_onoff)
 830                         sock_reset_flag(sk, SOCK_LINGER);
 831                 else {
 832 #if (BITS_PER_LONG == 32)
 833                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 834                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 835                         else
 836 #endif
 837                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 838                         sock_set_flag(sk, SOCK_LINGER);
 839                 }
 840                 break;
 841
 842         case SO_BSDCOMPAT:
 843                 sock_warn_obsolete_bsdism("setsockopt");
 844                 break;
 845
 846         case SO_PASSCRED:
 847                 if (valbool)
 848                         set_bit(SOCK_PASSCRED, &sock->flags);
 849                 else
 850                         clear_bit(SOCK_PASSCRED, &sock->flags);
 851                 break;
 852
 853         case SO_TIMESTAMP:
 854         case SO_TIMESTAMPNS:
 855                 if (valbool)  {
 856                         if (optname == SO_TIMESTAMP)
 857                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 858                         else
 859                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 860                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 861                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 862                 } else {
 863                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 864                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 865                 }
 866                 break;
 867
 868         case SO_TIMESTAMPING:
 869                 if (val & ~SOF_TIMESTAMPING_MASK) {
 870                         ret = -EINVAL;
 871                         break;
 872                 }
 873
 874                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 875                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 876                         if (sk->sk_protocol == IPPROTO_TCP &&
 877                             sk->sk_type == SOCK_STREAM) {
 878                                 if ((1 << sk->sk_state) &
 879                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 880                                         ret = -EINVAL;
 881                                         break;
 882                                 }
 883                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 884                         } else {
 885                                 sk->sk_tskey = 0;
 886                         }
 887                 }
 888
 889                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 890                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 891                         ret = -EINVAL;
 892                         break;
 893                 }
 894
 895                 sk->sk_tsflags = val;
 896                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 897                         sock_enable_timestamp(sk,
 898                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 899                 else
 900                         sock_disable_timestamp(sk,
 901                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 902                 break;
 903
 904         case SO_RCVLOWAT:
 905                 if (val < 0)
 906                         val = INT_MAX;
 907                 sk->sk_rcvlowat = val ? : 1;
 908                 break;
 909
 910         case SO_RCVTIMEO:
 911                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 912                 break;
 913
 914         case SO_SNDTIMEO:
 915                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 916                 break;
 917
 918         case SO_ATTACH_FILTER:
 919                 ret = -EINVAL;
 920                 if (optlen == sizeof(struct sock_fprog)) {
 921                         struct sock_fprog fprog;
 922
 923                         ret = -EFAULT;
 924                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 925                                 break;
 926
 927                         ret = sk_attach_filter(&fprog, sk);
 928                 }
 929                 break;
 930
 931         case SO_ATTACH_BPF:
 932                 ret = -EINVAL;
 933                 if (optlen == sizeof(u32)) {
 934                         u32 ufd;
 935
 936                         ret = -EFAULT;
 937                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 938                                 break;
 939
 940                         ret = sk_attach_bpf(ufd, sk);
 941                 }
 942                 break;
 943
 944         case SO_ATTACH_REUSEPORT_CBPF:
 945                 ret = -EINVAL;
 946                 if (optlen == sizeof(struct sock_fprog)) {
 947                         struct sock_fprog fprog;
 948
 949                         ret = -EFAULT;
 950                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 951                                 break;
 952
 953                         ret = sk_reuseport_attach_filter(&fprog, sk);
 954                 }
 955                 break;
 956
 957         case SO_ATTACH_REUSEPORT_EBPF:
 958                 ret = -EINVAL;
 959                 if (optlen == sizeof(u32)) {
 960                         u32 ufd;
 961
 962                         ret = -EFAULT;
 963                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 964                                 break;
 965
 966                         ret = sk_reuseport_attach_bpf(ufd, sk);
 967                 }
 968                 break;
 969
 970         case SO_DETACH_FILTER:
 971                 ret = sk_detach_filter(sk);
 972                 break;
 973
 974         case SO_LOCK_FILTER:
 975                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 976                         ret = -EPERM;
 977                 else
 978                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 979                 break;
 980
 981         case SO_PASSSEC:
 982                 if (valbool)
 983                         set_bit(SOCK_PASSSEC, &sock->flags);
 984                 else
 985                         clear_bit(SOCK_PASSSEC, &sock->flags);
 986                 break;
 987         case SO_MARK:
 988                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 989                         ret = -EPERM;
 990                 else
 991                         sk->sk_mark = val;
 992                 break;
 993
 994         case SO_RXQ_OVFL:
 995                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 996                 break;
 997
 998         case SO_WIFI_STATUS:
 999                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1000                 break;
1001
1002         case SO_PEEK_OFF:
1003                 if (sock->ops->set_peek_off)
1004                         ret = sock->ops->set_peek_off(sk, val);
1005                 else
1006                         ret = -EOPNOTSUPP;
1007                 break;
1008
1009         case SO_NOFCS:
1010                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1011                 break;
1012
1013         case SO_SELECT_ERR_QUEUE:
1014                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1015                 break;
1016
1017 #ifdef CONFIG_NET_RX_BUSY_POLL
1018         case SO_BUSY_POLL:
1019                 /* allow unprivileged users to decrease the value */
1020                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1021                         ret = -EPERM;
1022                 else {
1023                         if (val < 0)
1024                                 ret = -EINVAL;
1025                         else
1026                                 WRITE_ONCE(sk->sk_ll_usec, val);
1027                 }
1028                 break;
1029 #endif
1030
1031         case SO_MAX_PACING_RATE:
1032                 if (val != ~0U)
1033                         cmpxchg(&sk->sk_pacing_status,
1034                                 SK_PACING_NONE,
1035                                 SK_PACING_NEEDED);
1036                 sk->sk_max_pacing_rate = val;
1037                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1038                                          sk->sk_max_pacing_rate);
1039                 break;
1040
1041         case SO_INCOMING_CPU:
1042                 WRITE_ONCE(sk->sk_incoming_cpu, val);
1043                 break;
1044
1045         case SO_CNX_ADVICE:
1046                 if (val == 1)
1047                         dst_negative_advice(sk);
1048                 break;
1049
1050         case SO_ZEROCOPY:
1051                 if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
1052                         ret = -ENOTSUPP;
1053                 else if (sk->sk_protocol != IPPROTO_TCP)
1054                         ret = -ENOTSUPP;
1055                 else if (sk->sk_state != TCP_CLOSE)
1056                         ret = -EBUSY;
1057                 else if (val < 0 || val > 1)
1058                         ret = -EINVAL;
1059                 else
1060                         sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1061                 break;
1062
1063         default:
1064                 ret = -ENOPROTOOPT;
1065                 break;
1066         }
1067         release_sock(sk);
1068         return ret;
1069 }
1070 EXPORT_SYMBOL(sock_setsockopt);
1071
1072 static const struct cred *sk_get_peer_cred(struct sock *sk)
1073 {
1074         const struct cred *cred;
1075
1076         spin_lock(&sk->sk_peer_lock);
1077         cred = get_cred(sk->sk_peer_cred);
1078         spin_unlock(&sk->sk_peer_lock);
1079
1080         return cred;
1081 }
1082
1083 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1084                           struct ucred *ucred)
1085 {
1086         ucred->pid = pid_vnr(pid);
1087         ucred->uid = ucred->gid = -1;
1088         if (cred) {
1089                 struct user_namespace *current_ns = current_user_ns();
1090
1091                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1092                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1093         }
1094 }
1095
1096 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1097 {
1098         struct user_namespace *user_ns = current_user_ns();
1099         int i;
1100
1101         for (i = 0; i < src->ngroups; i++)
1102                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1103                         return -EFAULT;
1104
1105         return 0;
1106 }
1107
1108 int sock_getsockopt(struct socket *sock, int level, int optname,
1109                     char __user *optval, int __user *optlen)
1110 {
1111         struct sock *sk = sock->sk;
1112
1113         union {
1114                 int val;
1115                 u64 val64;
1116                 struct linger ling;
1117                 struct timeval tm;
1118         } v;
1119
1120         int lv = sizeof(int);
1121         int len;
1122
1123         if (get_user(len, optlen))
1124                 return -EFAULT;
1125         if (len < 0)
1126                 return -EINVAL;
1127
1128         memset(&v, 0, sizeof(v));
1129
1130         switch (optname) {
1131         case SO_DEBUG:
1132                 v.val = sock_flag(sk, SOCK_DBG);
1133                 break;
1134
1135         case SO_DONTROUTE:
1136                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1137                 break;
1138
1139         case SO_BROADCAST:
1140                 v.val = sock_flag(sk, SOCK_BROADCAST);
1141                 break;
1142
1143         case SO_SNDBUF:
1144                 v.val = sk->sk_sndbuf;
1145                 break;
1146
1147         case SO_RCVBUF:
1148                 v.val = sk->sk_rcvbuf;
1149                 break;
1150
1151         case SO_REUSEADDR:
1152                 v.val = sk->sk_reuse;
1153                 break;
1154
1155         case SO_REUSEPORT:
1156                 v.val = sk->sk_reuseport;
1157                 break;
1158
1159         case SO_KEEPALIVE:
1160                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1161                 break;
1162
1163         case SO_TYPE:
1164                 v.val = sk->sk_type;
1165                 break;
1166
1167         case SO_PROTOCOL:
1168                 v.val = sk->sk_protocol;
1169                 break;
1170
1171         case SO_DOMAIN:
1172                 v.val = sk->sk_family;
1173                 break;
1174
1175         case SO_ERROR:
1176                 v.val = -sock_error(sk);
1177                 if (v.val == 0)
1178                         v.val = xchg(&sk->sk_err_soft, 0);
1179                 break;
1180
1181         case SO_OOBINLINE:
1182                 v.val = sock_flag(sk, SOCK_URGINLINE);
1183                 break;
1184
1185         case SO_NO_CHECK:
1186                 v.val = sk->sk_no_check_tx;
1187                 break;
1188
1189         case SO_PRIORITY:
1190                 v.val = sk->sk_priority;
1191                 break;
1192
1193         case SO_LINGER:
1194                 lv              = sizeof(v.ling);
1195                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1196                 v.ling.l_linger = sk->sk_lingertime / HZ;
1197                 break;
1198
1199         case SO_BSDCOMPAT:
1200                 sock_warn_obsolete_bsdism("getsockopt");
1201                 break;
1202
1203         case SO_TIMESTAMP:
1204                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1205                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1206                 break;
1207
1208         case SO_TIMESTAMPNS:
1209                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1210                 break;
1211
1212         case SO_TIMESTAMPING:
1213                 v.val = sk->sk_tsflags;
1214                 break;
1215
1216         case SO_RCVTIMEO:
1217                 lv = sizeof(struct timeval);
1218                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1219                         v.tm.tv_sec = 0;
1220                         v.tm.tv_usec = 0;
1221                 } else {
1222                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1223                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1224                 }
1225                 break;
1226
1227         case SO_SNDTIMEO:
1228                 lv = sizeof(struct timeval);
1229                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1230                         v.tm.tv_sec = 0;
1231                         v.tm.tv_usec = 0;
1232                 } else {
1233                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1234                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1235                 }
1236                 break;
1237
1238         case SO_RCVLOWAT:
1239                 v.val = sk->sk_rcvlowat;
1240                 break;
1241
1242         case SO_SNDLOWAT:
1243                 v.val = 1;
1244                 break;
1245
1246         case SO_PASSCRED:
1247                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1248                 break;
1249
1250         case SO_PEERCRED:
1251         {
1252                 struct ucred peercred;
1253                 if (len > sizeof(peercred))
1254                         len = sizeof(peercred);
1255
1256                 spin_lock(&sk->sk_peer_lock);
1257                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1258                 spin_unlock(&sk->sk_peer_lock);
1259
1260                 if (copy_to_user(optval, &peercred, len))
1261                         return -EFAULT;
1262                 goto lenout;
1263         }
1264
1265         case SO_PEERGROUPS:
1266         {
1267                 const struct cred *cred;
1268                 int ret, n;
1269
1270                 cred = sk_get_peer_cred(sk);
1271                 if (!cred)
1272                         return -ENODATA;
1273
1274                 n = cred->group_info->ngroups;
1275                 if (len < n * sizeof(gid_t)) {
1276                         len = n * sizeof(gid_t);
1277                         put_cred(cred);
1278                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1279                 }
1280                 len = n * sizeof(gid_t);
1281
1282                 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1283                 put_cred(cred);
1284                 if (ret)
1285                         return ret;
1286                 goto lenout;
1287         }
1288
1289         case SO_PEERNAME:
1290         {
1291                 char address[128];
1292
1293                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1294                         return -ENOTCONN;
1295                 if (lv < len)
1296                         return -EINVAL;
1297                 if (copy_to_user(optval, address, len))
1298                         return -EFAULT;
1299                 goto lenout;
1300         }
1301
1302         /* Dubious BSD thing... Probably nobody even uses it, but
1303          * the UNIX standard wants it for whatever reason... -DaveM
1304          */
1305         case SO_ACCEPTCONN:
1306                 v.val = sk->sk_state == TCP_LISTEN;
1307                 break;
1308
1309         case SO_PASSSEC:
1310                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1311                 break;
1312
1313         case SO_PEERSEC:
1314                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1315
1316         case SO_MARK:
1317                 v.val = sk->sk_mark;
1318                 break;
1319
1320         case SO_RXQ_OVFL:
1321                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1322                 break;
1323
1324         case SO_WIFI_STATUS:
1325                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1326                 break;
1327
1328         case SO_PEEK_OFF:
1329                 if (!sock->ops->set_peek_off)
1330                         return -EOPNOTSUPP;
1331
1332                 v.val = sk->sk_peek_off;
1333                 break;
1334         case SO_NOFCS:
1335                 v.val = sock_flag(sk, SOCK_NOFCS);
1336                 break;
1337
1338         case SO_BINDTODEVICE:
1339                 return sock_getbindtodevice(sk, optval, optlen, len);
1340
1341         case SO_GET_FILTER:
1342                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1343                 if (len < 0)
1344                         return len;
1345
1346                 goto lenout;
1347
1348         case SO_LOCK_FILTER:
1349                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1350                 break;
1351
1352         case SO_BPF_EXTENSIONS:
1353                 v.val = bpf_tell_extensions();
1354                 break;
1355
1356         case SO_SELECT_ERR_QUEUE:
1357                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1358                 break;
1359
1360 #ifdef CONFIG_NET_RX_BUSY_POLL
1361         case SO_BUSY_POLL:
1362                 v.val = sk->sk_ll_usec;
1363                 break;
1364 #endif
1365
1366         case SO_MAX_PACING_RATE:
1367                 v.val = sk->sk_max_pacing_rate;
1368                 break;
1369
1370         case SO_INCOMING_CPU:
1371                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1372                 break;
1373
1374         case SO_MEMINFO:
1375         {
1376                 u32 meminfo[SK_MEMINFO_VARS];
1377
1378                 sk_get_meminfo(sk, meminfo);
1379
1380                 len = min_t(unsigned int, len, sizeof(meminfo));
1381                 if (copy_to_user(optval, &meminfo, len))
1382                         return -EFAULT;
1383
1384                 goto lenout;
1385         }
1386
1387 #ifdef CONFIG_NET_RX_BUSY_POLL
1388         case SO_INCOMING_NAPI_ID:
1389                 v.val = READ_ONCE(sk->sk_napi_id);
1390
1391                 /* aggregate non-NAPI IDs down to 0 */
1392                 if (v.val < MIN_NAPI_ID)
1393                         v.val = 0;
1394
1395                 break;
1396 #endif
1397
1398         case SO_COOKIE:
1399                 lv = sizeof(u64);
1400                 if (len < lv)
1401                         return -EINVAL;
1402                 v.val64 = sock_gen_cookie(sk);
1403                 break;
1404
1405         case SO_ZEROCOPY:
1406                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1407                 break;
1408
1409         default:
1410                 /* We implement the SO_SNDLOWAT etc to not be settable
1411                  * (1003.1g 7).
1412                  */
1413                 return -ENOPROTOOPT;
1414         }
1415
1416         if (len > lv)
1417                 len = lv;
1418         if (copy_to_user(optval, &v, len))
1419                 return -EFAULT;
1420 lenout:
1421         if (put_user(len, optlen))
1422                 return -EFAULT;
1423         return 0;
1424 }
1425
1426 /*
1427  * Initialize an sk_lock.
1428  *
1429  * (We also register the sk_lock with the lock validator.)
1430  */
1431 static inline void sock_lock_init(struct sock *sk)
1432 {
1433         if (sk->sk_kern_sock)
1434                 sock_lock_init_class_and_name(
1435                         sk,
1436                         af_family_kern_slock_key_strings[sk->sk_family],
1437                         af_family_kern_slock_keys + sk->sk_family,
1438                         af_family_kern_key_strings[sk->sk_family],
1439                         af_family_kern_keys + sk->sk_family);
1440         else
1441                 sock_lock_init_class_and_name(
1442                         sk,
1443                         af_family_slock_key_strings[sk->sk_family],
1444                         af_family_slock_keys + sk->sk_family,
1445                         af_family_key_strings[sk->sk_family],
1446                         af_family_keys + sk->sk_family);
1447 }
1448
1449 /*
1450  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1451  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1452  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1453  */
1454 static void sock_copy(struct sock *nsk, const struct sock *osk)
1455 {
1456 #ifdef CONFIG_SECURITY_NETWORK
1457         void *sptr = nsk->sk_security;
1458 #endif
1459         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1460
1461         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1462                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1463
1464 #ifdef CONFIG_SECURITY_NETWORK
1465         nsk->sk_security = sptr;
1466         security_sk_clone(osk, nsk);
1467 #endif
1468 }
1469
1470 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1471                 int family)
1472 {
1473         struct sock *sk;
1474         struct kmem_cache *slab;
1475
1476         slab = prot->slab;
1477         if (slab != NULL) {
1478                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1479                 if (!sk)
1480                         return sk;
1481                 if (priority & __GFP_ZERO)
1482                         sk_prot_clear_nulls(sk, prot->obj_size);
1483         } else
1484                 sk = kmalloc(prot->obj_size, priority);
1485
1486         if (sk != NULL) {
1487                 if (security_sk_alloc(sk, family, priority))
1488                         goto out_free;
1489
1490                 if (!try_module_get(prot->owner))
1491                         goto out_free_sec;
1492                 sk_tx_queue_clear(sk);
1493         }
1494
1495         return sk;
1496
1497 out_free_sec:
1498         security_sk_free(sk);
1499 out_free:
1500         if (slab != NULL)
1501                 kmem_cache_free(slab, sk);
1502         else
1503                 kfree(sk);
1504         return NULL;
1505 }
1506
1507 static void sk_prot_free(struct proto *prot, struct sock *sk)
1508 {
1509         struct kmem_cache *slab;
1510         struct module *owner;
1511
1512         owner = prot->owner;
1513         slab = prot->slab;
1514
1515         cgroup_sk_free(&sk->sk_cgrp_data);
1516         mem_cgroup_sk_free(sk);
1517         security_sk_free(sk);
1518         if (slab != NULL)
1519                 kmem_cache_free(slab, sk);
1520         else
1521                 kfree(sk);
1522         module_put(owner);
1523 }
1524
1525 /**
1526  *      sk_alloc - All socket objects are allocated here
1527  *      @net: the applicable net namespace
1528  *      @family: protocol family
1529  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1530  *      @prot: struct proto associated with this new sock instance
1531  *      @kern: is this to be a kernel socket?
1532  */
1533 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1534                       struct proto *prot, int kern)
1535 {
1536         struct sock *sk;
1537
1538         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1539         if (sk) {
1540                 sk->sk_family = family;
1541                 /*
1542                  * See comment in struct sock definition to understand
1543                  * why we need sk_prot_creator -acme
1544                  */
1545                 sk->sk_prot = sk->sk_prot_creator = prot;
1546                 sk->sk_kern_sock = kern;
1547                 sock_lock_init(sk);
1548                 sk->sk_net_refcnt = kern ? 0 : 1;
1549                 if (likely(sk->sk_net_refcnt))
1550                         get_net(net);
1551                 sock_net_set(sk, net);
1552                 refcount_set(&sk->sk_wmem_alloc, 1);
1553
1554                 mem_cgroup_sk_alloc(sk);
1555                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1556                 sock_update_classid(&sk->sk_cgrp_data);
1557                 sock_update_netprioidx(&sk->sk_cgrp_data);
1558                 sk_tx_queue_clear(sk);
1559         }
1560
1561         return sk;
1562 }
1563 EXPORT_SYMBOL(sk_alloc);
1564
1565 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1566  * grace period. This is the case for UDP sockets and TCP listeners.
1567  */
1568 static void __sk_destruct(struct rcu_head *head)
1569 {
1570         struct sock *sk = container_of(head, struct sock, sk_rcu);
1571         struct sk_filter *filter;
1572
1573         if (sk->sk_destruct)
1574                 sk->sk_destruct(sk);
1575
1576         filter = rcu_dereference_check(sk->sk_filter,
1577                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1578         if (filter) {
1579                 sk_filter_uncharge(sk, filter);
1580                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1581         }
1582
1583         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1584
1585         if (atomic_read(&sk->sk_omem_alloc))
1586                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1587                          __func__, atomic_read(&sk->sk_omem_alloc));
1588
1589         if (sk->sk_frag.page) {
1590                 put_page(sk->sk_frag.page);
1591                 sk->sk_frag.page = NULL;
1592         }
1593
1594         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1595         put_cred(sk->sk_peer_cred);
1596         put_pid(sk->sk_peer_pid);
1597
1598         if (likely(sk->sk_net_refcnt))
1599                 put_net(sock_net(sk));
1600         sk_prot_free(sk->sk_prot_creator, sk);
1601 }
1602
1603 void sk_destruct(struct sock *sk)
1604 {
1605         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1606
1607         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1608                 reuseport_detach_sock(sk);
1609                 use_call_rcu = true;
1610         }
1611
1612         if (use_call_rcu)
1613                 call_rcu(&sk->sk_rcu, __sk_destruct);
1614         else
1615                 __sk_destruct(&sk->sk_rcu);
1616 }
1617
1618 static void __sk_free(struct sock *sk)
1619 {
1620         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1621                 sock_diag_broadcast_destroy(sk);
1622         else
1623                 sk_destruct(sk);
1624 }
1625
1626 void sk_free(struct sock *sk)
1627 {
1628         /*
1629          * We subtract one from sk_wmem_alloc and can know if
1630          * some packets are still in some tx queue.
1631          * If not null, sock_wfree() will call __sk_free(sk) later
1632          */
1633         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1634                 __sk_free(sk);
1635 }
1636 EXPORT_SYMBOL(sk_free);
1637
1638 static void sk_init_common(struct sock *sk)
1639 {
1640         skb_queue_head_init(&sk->sk_receive_queue);
1641         skb_queue_head_init(&sk->sk_write_queue);
1642         skb_queue_head_init(&sk->sk_error_queue);
1643
1644         rwlock_init(&sk->sk_callback_lock);
1645         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1646                         af_rlock_keys + sk->sk_family,
1647                         af_family_rlock_key_strings[sk->sk_family]);
1648         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1649                         af_wlock_keys + sk->sk_family,
1650                         af_family_wlock_key_strings[sk->sk_family]);
1651         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1652                         af_elock_keys + sk->sk_family,
1653                         af_family_elock_key_strings[sk->sk_family]);
1654         lockdep_set_class_and_name(&sk->sk_callback_lock,
1655                         af_callback_keys + sk->sk_family,
1656                         af_family_clock_key_strings[sk->sk_family]);
1657 }
1658
1659 /**
1660  *      sk_clone_lock - clone a socket, and lock its clone
1661  *      @sk: the socket to clone
1662  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1663  *
1664  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1665  */
1666 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1667 {
1668         struct sock *newsk;
1669         bool is_charged = true;
1670
1671         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1672         if (newsk != NULL) {
1673                 struct sk_filter *filter;
1674
1675                 sock_copy(newsk, sk);
1676
1677                 newsk->sk_prot_creator = sk->sk_prot;
1678
1679                 /* SANITY */
1680                 if (likely(newsk->sk_net_refcnt))
1681                         get_net(sock_net(newsk));
1682                 sk_node_init(&newsk->sk_node);
1683                 sock_lock_init(newsk);
1684                 bh_lock_sock(newsk);
1685                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1686                 newsk->sk_backlog.len = 0;
1687
1688                 atomic_set(&newsk->sk_rmem_alloc, 0);
1689                 /*
1690                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1691                  */
1692                 refcount_set(&newsk->sk_wmem_alloc, 1);
1693                 atomic_set(&newsk->sk_omem_alloc, 0);
1694                 sk_init_common(newsk);
1695
1696                 newsk->sk_dst_cache     = NULL;
1697                 newsk->sk_dst_pending_confirm = 0;
1698                 newsk->sk_wmem_queued   = 0;
1699                 newsk->sk_forward_alloc = 0;
1700                 atomic_set(&newsk->sk_drops, 0);
1701                 newsk->sk_send_head     = NULL;
1702                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1703                 atomic_set(&newsk->sk_zckey, 0);
1704
1705                 sock_reset_flag(newsk, SOCK_DONE);
1706
1707                 /* sk->sk_memcg will be populated at accept() time */
1708                 newsk->sk_memcg = NULL;
1709
1710                 cgroup_sk_clone(&newsk->sk_cgrp_data);
1711
1712                 rcu_read_lock();
1713                 filter = rcu_dereference(sk->sk_filter);
1714                 if (filter != NULL)
1715                         /* though it's an empty new sock, the charging may fail
1716                          * if sysctl_optmem_max was changed between creation of
1717                          * original socket and cloning
1718                          */
1719                         is_charged = sk_filter_charge(newsk, filter);
1720                 RCU_INIT_POINTER(newsk->sk_filter, filter);
1721                 rcu_read_unlock();
1722
1723                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1724                         /* We need to make sure that we don't uncharge the new
1725                          * socket if we couldn't charge it in the first place
1726                          * as otherwise we uncharge the parent's filter.
1727                          */
1728                         if (!is_charged)
1729                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1730                         sk_free_unlock_clone(newsk);
1731                         newsk = NULL;
1732                         goto out;
1733                 }
1734                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1735
1736                 newsk->sk_err      = 0;
1737                 newsk->sk_err_soft = 0;
1738                 newsk->sk_priority = 0;
1739                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1740                 atomic64_set(&newsk->sk_cookie, 0);
1741
1742                 /*
1743                  * Before updating sk_refcnt, we must commit prior changes to memory
1744                  * (Documentation/RCU/rculist_nulls.txt for details)
1745                  */
1746                 smp_wmb();
1747                 refcount_set(&newsk->sk_refcnt, 2);
1748
1749                 /*
1750                  * Increment the counter in the same struct proto as the master
1751                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1752                  * is the same as sk->sk_prot->socks, as this field was copied
1753                  * with memcpy).
1754                  *
1755                  * This _changes_ the previous behaviour, where
1756                  * tcp_create_openreq_child always was incrementing the
1757                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1758                  * to be taken into account in all callers. -acme
1759                  */
1760                 sk_refcnt_debug_inc(newsk);
1761                 sk_set_socket(newsk, NULL);
1762                 sk_tx_queue_clear(newsk);
1763                 newsk->sk_wq = NULL;
1764
1765                 if (newsk->sk_prot->sockets_allocated)
1766                         sk_sockets_allocated_inc(newsk);
1767
1768                 if (sock_needs_netstamp(sk) &&
1769                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1770                         net_enable_timestamp();
1771         }
1772 out:
1773         return newsk;
1774 }
1775 EXPORT_SYMBOL_GPL(sk_clone_lock);
1776
1777 void sk_free_unlock_clone(struct sock *sk)
1778 {
1779         /* It is still raw copy of parent, so invalidate
1780          * destructor and make plain sk_free() */
1781         sk->sk_destruct = NULL;
1782         bh_unlock_sock(sk);
1783         sk_free(sk);
1784 }
1785 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1786
1787 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1788 {
1789         u32 max_segs = 1;
1790
1791         sk_dst_set(sk, dst);
1792         sk->sk_route_caps = dst->dev->features;
1793         if (sk->sk_route_caps & NETIF_F_GSO)
1794                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1795         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1796         if (sk_can_gso(sk)) {
1797                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1798                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1799                 } else {
1800                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1801                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1802                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1803                 }
1804         }
1805         sk->sk_gso_max_segs = max_segs;
1806 }
1807 EXPORT_SYMBOL_GPL(sk_setup_caps);
1808
1809 /*
1810  *      Simple resource managers for sockets.
1811  */
1812
1813
1814 /*
1815  * Write buffer destructor automatically called from kfree_skb.
1816  */
1817 void sock_wfree(struct sk_buff *skb)
1818 {
1819         struct sock *sk = skb->sk;
1820         unsigned int len = skb->truesize;
1821
1822         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1823                 /*
1824                  * Keep a reference on sk_wmem_alloc, this will be released
1825                  * after sk_write_space() call
1826                  */
1827                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1828                 sk->sk_write_space(sk);
1829                 len = 1;
1830         }
1831         /*
1832          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1833          * could not do because of in-flight packets
1834          */
1835         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1836                 __sk_free(sk);
1837 }
1838 EXPORT_SYMBOL(sock_wfree);
1839
1840 /* This variant of sock_wfree() is used by TCP,
1841  * since it sets SOCK_USE_WRITE_QUEUE.
1842  */
1843 void __sock_wfree(struct sk_buff *skb)
1844 {
1845         struct sock *sk = skb->sk;
1846
1847         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1848                 __sk_free(sk);
1849 }
1850
1851 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1852 {
1853         skb_orphan(skb);
1854         skb->sk = sk;
1855 #ifdef CONFIG_INET
1856         if (unlikely(!sk_fullsock(sk))) {
1857                 skb->destructor = sock_edemux;
1858                 sock_hold(sk);
1859                 return;
1860         }
1861 #endif
1862         skb->destructor = sock_wfree;
1863         skb_set_hash_from_sk(skb, sk);
1864         /*
1865          * We used to take a refcount on sk, but following operation
1866          * is enough to guarantee sk_free() wont free this sock until
1867          * all in-flight packets are completed
1868          */
1869         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1870 }
1871 EXPORT_SYMBOL(skb_set_owner_w);
1872
1873 /* This helper is used by netem, as it can hold packets in its
1874  * delay queue. We want to allow the owner socket to send more
1875  * packets, as if they were already TX completed by a typical driver.
1876  * But we also want to keep skb->sk set because some packet schedulers
1877  * rely on it (sch_fq for example).
1878  */
1879 void skb_orphan_partial(struct sk_buff *skb)
1880 {
1881         if (skb_is_tcp_pure_ack(skb))
1882                 return;
1883
1884         if (skb->destructor == sock_wfree
1885 #ifdef CONFIG_INET
1886             || skb->destructor == tcp_wfree
1887 #endif
1888                 ) {
1889                 struct sock *sk = skb->sk;
1890
1891                 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1892                         WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1893                         skb->destructor = sock_efree;
1894                 }
1895         } else {
1896                 skb_orphan(skb);
1897         }
1898 }
1899 EXPORT_SYMBOL(skb_orphan_partial);
1900
1901 /*
1902  * Read buffer destructor automatically called from kfree_skb.
1903  */
1904 void sock_rfree(struct sk_buff *skb)
1905 {
1906         struct sock *sk = skb->sk;
1907         unsigned int len = skb->truesize;
1908
1909         atomic_sub(len, &sk->sk_rmem_alloc);
1910         sk_mem_uncharge(sk, len);
1911 }
1912 EXPORT_SYMBOL(sock_rfree);
1913
1914 /*
1915  * Buffer destructor for skbs that are not used directly in read or write
1916  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1917  */
1918 void sock_efree(struct sk_buff *skb)
1919 {
1920         sock_put(skb->sk);
1921 }
1922 EXPORT_SYMBOL(sock_efree);
1923
1924 kuid_t sock_i_uid(struct sock *sk)
1925 {
1926         kuid_t uid;
1927
1928         read_lock_bh(&sk->sk_callback_lock);
1929         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1930         read_unlock_bh(&sk->sk_callback_lock);
1931         return uid;
1932 }
1933 EXPORT_SYMBOL(sock_i_uid);
1934
1935 unsigned long sock_i_ino(struct sock *sk)
1936 {
1937         unsigned long ino;
1938
1939         read_lock_bh(&sk->sk_callback_lock);
1940         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1941         read_unlock_bh(&sk->sk_callback_lock);
1942         return ino;
1943 }
1944 EXPORT_SYMBOL(sock_i_ino);
1945
1946 /*
1947  * Allocate a skb from the socket's send buffer.
1948  */
1949 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1950                              gfp_t priority)
1951 {
1952         if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1953                 struct sk_buff *skb = alloc_skb(size, priority);
1954                 if (skb) {
1955                         skb_set_owner_w(skb, sk);
1956                         return skb;
1957                 }
1958         }
1959         return NULL;
1960 }
1961 EXPORT_SYMBOL(sock_wmalloc);
1962
1963 static void sock_ofree(struct sk_buff *skb)
1964 {
1965         struct sock *sk = skb->sk;
1966
1967         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1968 }
1969
1970 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1971                              gfp_t priority)
1972 {
1973         struct sk_buff *skb;
1974
1975         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1976         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1977             sysctl_optmem_max)
1978                 return NULL;
1979
1980         skb = alloc_skb(size, priority);
1981         if (!skb)
1982                 return NULL;
1983
1984         atomic_add(skb->truesize, &sk->sk_omem_alloc);
1985         skb->sk = sk;
1986         skb->destructor = sock_ofree;
1987         return skb;
1988 }
1989
1990 /*
1991  * Allocate a memory block from the socket's option memory buffer.
1992  */
1993 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1994 {
1995         if ((unsigned int)size <= sysctl_optmem_max &&
1996             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1997                 void *mem;
1998                 /* First do the add, to avoid the race if kmalloc
1999                  * might sleep.
2000                  */
2001                 atomic_add(size, &sk->sk_omem_alloc);
2002                 mem = kmalloc(size, priority);
2003                 if (mem)
2004                         return mem;
2005                 atomic_sub(size, &sk->sk_omem_alloc);
2006         }
2007         return NULL;
2008 }
2009 EXPORT_SYMBOL(sock_kmalloc);
2010
2011 /* Free an option memory block. Note, we actually want the inline
2012  * here as this allows gcc to detect the nullify and fold away the
2013  * condition entirely.
2014  */
2015 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2016                                   const bool nullify)
2017 {
2018         if (WARN_ON_ONCE(!mem))
2019                 return;
2020         if (nullify)
2021                 kzfree(mem);
2022         else
2023                 kfree(mem);
2024         atomic_sub(size, &sk->sk_omem_alloc);
2025 }
2026
2027 void sock_kfree_s(struct sock *sk, void *mem, int size)
2028 {
2029         __sock_kfree_s(sk, mem, size, false);
2030 }
2031 EXPORT_SYMBOL(sock_kfree_s);
2032
2033 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2034 {
2035         __sock_kfree_s(sk, mem, size, true);
2036 }
2037 EXPORT_SYMBOL(sock_kzfree_s);
2038
2039 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2040    I think, these locks should be removed for datagram sockets.
2041  */
2042 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2043 {
2044         DEFINE_WAIT(wait);
2045
2046         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2047         for (;;) {
2048                 if (!timeo)
2049                         break;
2050                 if (signal_pending(current))
2051                         break;
2052                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2053                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2054                 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2055                         break;
2056                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2057                         break;
2058                 if (sk->sk_err)
2059                         break;
2060                 timeo = schedule_timeout(timeo);
2061         }
2062         finish_wait(sk_sleep(sk), &wait);
2063         return timeo;
2064 }
2065
2066
2067 /*
2068  *      Generic send/receive buffer handlers
2069  */
2070
2071 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2072                                      unsigned long data_len, int noblock,
2073                                      int *errcode, int max_page_order)
2074 {
2075         struct sk_buff *skb;
2076         long timeo;
2077         int err;
2078
2079         timeo = sock_sndtimeo(sk, noblock);
2080         for (;;) {
2081                 err = sock_error(sk);
2082                 if (err != 0)
2083                         goto failure;
2084
2085                 err = -EPIPE;
2086                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2087                         goto failure;
2088
2089                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2090                         break;
2091
2092                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2093                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2094                 err = -EAGAIN;
2095                 if (!timeo)
2096                         goto failure;
2097                 if (signal_pending(current))
2098                         goto interrupted;
2099                 timeo = sock_wait_for_wmem(sk, timeo);
2100         }
2101         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2102                                    errcode, sk->sk_allocation);
2103         if (skb)
2104                 skb_set_owner_w(skb, sk);
2105         return skb;
2106
2107 interrupted:
2108         err = sock_intr_errno(timeo);
2109 failure:
2110         *errcode = err;
2111         return NULL;
2112 }
2113 EXPORT_SYMBOL(sock_alloc_send_pskb);
2114
/*
 * Linear-only variant of sock_alloc_send_pskb(): no paged data and a
 * maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
2120 EXPORT_SYMBOL(sock_alloc_send_skb);
2121
2122 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2123                      struct sockcm_cookie *sockc)
2124 {
2125         u32 tsflags;
2126
2127         switch (cmsg->cmsg_type) {
2128         case SO_MARK:
2129                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2130                         return -EPERM;
2131                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2132                         return -EINVAL;
2133                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2134                 break;
2135         case SO_TIMESTAMPING:
2136                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2137                         return -EINVAL;
2138
2139                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2140                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2141                         return -EINVAL;
2142
2143                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2144                 sockc->tsflags |= tsflags;
2145                 break;
2146         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2147         case SCM_RIGHTS:
2148         case SCM_CREDENTIALS:
2149                 break;
2150         default:
2151                 return -EINVAL;
2152         }
2153         return 0;
2154 }
2155 EXPORT_SYMBOL(__sock_cmsg_send);
2156
2157 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2158                    struct sockcm_cookie *sockc)
2159 {
2160         struct cmsghdr *cmsg;
2161         int ret;
2162
2163         for_each_cmsghdr(cmsg, msg) {
2164                 if (!CMSG_OK(msg, cmsg))
2165                         return -EINVAL;
2166                 if (cmsg->cmsg_level != SOL_SOCKET)
2167                         continue;
2168                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2169                 if (ret)
2170                         return ret;
2171         }
2172         return 0;
2173 }
2174 EXPORT_SYMBOL(sock_cmsg_send);
2175
2176 static void sk_enter_memory_pressure(struct sock *sk)
2177 {
2178         if (!sk->sk_prot->enter_memory_pressure)
2179                 return;
2180
2181         sk->sk_prot->enter_memory_pressure(sk);
2182 }
2183
2184 static void sk_leave_memory_pressure(struct sock *sk)
2185 {
2186         if (sk->sk_prot->leave_memory_pressure) {
2187                 sk->sk_prot->leave_memory_pressure(sk);
2188         } else {
2189                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2190
2191                 if (memory_pressure && READ_ONCE(*memory_pressure))
2192                         WRITE_ONCE(*memory_pressure, 0);
2193         }
2194 }
2195
2196 /* On 32bit arches, an skb frag is limited to 2^15 */
2197 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2198
2199 /**
2200  * skb_page_frag_refill - check that a page_frag contains enough room
2201  * @sz: minimum size of the fragment we want to get
2202  * @pfrag: pointer to page_frag
2203  * @gfp: priority for memory allocation
2204  *
2205  * Note: While this allocator tries to use high order pages, there is
2206  * no guarantee that allocations succeed. Therefore, @sz MUST be
2207  * less or equal than PAGE_SIZE.
2208  */
2209 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2210 {
2211         if (pfrag->page) {
2212                 if (page_ref_count(pfrag->page) == 1) {
2213                         pfrag->offset = 0;
2214                         return true;
2215                 }
2216                 if (pfrag->offset + sz <= pfrag->size)
2217                         return true;
2218                 put_page(pfrag->page);
2219         }
2220
2221         pfrag->offset = 0;
2222         if (SKB_FRAG_PAGE_ORDER) {
2223                 /* Avoid direct reclaim but allow kswapd to wake */
2224                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2225                                           __GFP_COMP | __GFP_NOWARN |
2226                                           __GFP_NORETRY,
2227                                           SKB_FRAG_PAGE_ORDER);
2228                 if (likely(pfrag->page)) {
2229                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2230                         return true;
2231                 }
2232         }
2233         pfrag->page = alloc_page(gfp);
2234         if (likely(pfrag->page)) {
2235                 pfrag->size = PAGE_SIZE;
2236                 return true;
2237         }
2238         return false;
2239 }
2240 EXPORT_SYMBOL(skb_page_frag_refill);
2241
2242 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2243 {
2244         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2245                 return true;
2246
2247         sk_enter_memory_pressure(sk);
2248         sk_stream_moderate_sndbuf(sk);
2249         return false;
2250 }
2251 EXPORT_SYMBOL(sk_page_frag_refill);
2252
2253 static void __lock_sock(struct sock *sk)
2254         __releases(&sk->sk_lock.slock)
2255         __acquires(&sk->sk_lock.slock)
2256 {
2257         DEFINE_WAIT(wait);
2258
2259         for (;;) {
2260                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2261                                         TASK_UNINTERRUPTIBLE);
2262                 spin_unlock_bh(&sk->sk_lock.slock);
2263                 schedule();
2264                 spin_lock_bh(&sk->sk_lock.slock);
2265                 if (!sock_owned_by_user(sk))
2266                         break;
2267         }
2268         finish_wait(&sk->sk_lock.wq, &wait);
2269 }
2270
2271 void __release_sock(struct sock *sk)
2272         __releases(&sk->sk_lock.slock)
2273         __acquires(&sk->sk_lock.slock)
2274 {
2275         struct sk_buff *skb, *next;
2276
2277         while ((skb = sk->sk_backlog.head) != NULL) {
2278                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2279
2280                 spin_unlock_bh(&sk->sk_lock.slock);
2281
2282                 do {
2283                         next = skb->next;
2284                         prefetch(next);
2285                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2286                         skb->next = NULL;
2287                         sk_backlog_rcv(sk, skb);
2288
2289                         cond_resched();
2290
2291                         skb = next;
2292                 } while (skb != NULL);
2293
2294                 spin_lock_bh(&sk->sk_lock.slock);
2295         }
2296
2297         /*
2298          * Doing the zeroing here guarantee we can not loop forever
2299          * while a wild producer attempts to flood us.
2300          */
2301         sk->sk_backlog.len = 0;
2302 }
2303
2304 void __sk_flush_backlog(struct sock *sk)
2305 {
2306         spin_lock_bh(&sk->sk_lock.slock);
2307         __release_sock(sk);
2308         spin_unlock_bh(&sk->sk_lock.slock);
2309 }
2310
2311 /**
2312  * sk_wait_data - wait for data to arrive at sk_receive_queue
2313  * @sk:    sock to wait on
2314  * @timeo: for how long
2315  * @skb:   last skb seen on sk_receive_queue
2316  *
2317  * Now socket state including sk->sk_err is changed only under lock,
2318  * hence we may omit checks after joining wait queue.
2319  * We check receive queue before schedule() only as optimization;
2320  * it is very likely that release_sock() added new data.
2321  */
2322 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2323 {
2324         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2325         int rc;
2326
2327         add_wait_queue(sk_sleep(sk), &wait);
2328         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2329         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2330         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2331         remove_wait_queue(sk_sleep(sk), &wait);
2332         return rc;
2333 }
2334 EXPORT_SYMBOL(sk_wait_data);
2335
2336 /**
2337  *      __sk_mem_raise_allocated - increase memory_allocated
2338  *      @sk: socket
2339  *      @size: memory size to allocate
2340  *      @amt: pages to allocate
2341  *      @kind: allocation type
2342  *
2343  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2344  */
2345 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2346 {
2347         struct proto *prot = sk->sk_prot;
2348         long allocated = sk_memory_allocated_add(sk, amt);
2349
2350         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2351             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2352                 goto suppress_allocation;
2353
2354         /* Under limit. */
2355         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2356                 sk_leave_memory_pressure(sk);
2357                 return 1;
2358         }
2359
2360         /* Under pressure. */
2361         if (allocated > sk_prot_mem_limits(sk, 1))
2362                 sk_enter_memory_pressure(sk);
2363
2364         /* Over hard limit. */
2365         if (allocated > sk_prot_mem_limits(sk, 2))
2366                 goto suppress_allocation;
2367
2368         /* guarantee minimum buffer size under pressure */
2369         if (kind == SK_MEM_RECV) {
2370                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2371                         return 1;
2372
2373         } else { /* SK_MEM_SEND */
2374                 if (sk->sk_type == SOCK_STREAM) {
2375                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2376                                 return 1;
2377                 } else if (refcount_read(&sk->sk_wmem_alloc) <
2378                            prot->sysctl_wmem[0])
2379                                 return 1;
2380         }
2381
2382         if (sk_has_memory_pressure(sk)) {
2383                 u64 alloc;
2384
2385                 if (!sk_under_memory_pressure(sk))
2386                         return 1;
2387                 alloc = sk_sockets_allocated_read_positive(sk);
2388                 if (sk_prot_mem_limits(sk, 2) > alloc *
2389                     sk_mem_pages(sk->sk_wmem_queued +
2390                                  atomic_read(&sk->sk_rmem_alloc) +
2391                                  sk->sk_forward_alloc))
2392                         return 1;
2393         }
2394
2395 suppress_allocation:
2396
2397         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2398                 sk_stream_moderate_sndbuf(sk);
2399
2400                 /* Fail only if socket is _under_ its sndbuf.
2401                  * In this case we cannot block, so that we have to fail.
2402                  */
2403                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2404                         return 1;
2405         }
2406
2407         trace_sock_exceed_buf_limit(sk, prot, allocated);
2408
2409         sk_memory_allocated_sub(sk, amt);
2410
2411         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2412                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2413
2414         return 0;
2415 }
2416 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2417
2418 /**
2419  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2420  *      @sk: socket
2421  *      @size: memory size to allocate
2422  *      @kind: allocation type
2423  *
2424  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2425  *      rmem allocation. This function assumes that protocols which have
2426  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2427  */
2428 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2429 {
2430         int ret, amt = sk_mem_pages(size);
2431
2432         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2433         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2434         if (!ret)
2435                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2436         return ret;
2437 }
2438 EXPORT_SYMBOL(__sk_mem_schedule);
2439
2440 /**
2441  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2442  *      @sk: socket
2443  *      @amount: number of quanta
2444  *
2445  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2446  */
2447 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2448 {
2449         sk_memory_allocated_sub(sk, amount);
2450
2451         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2452                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2453
2454         if (sk_under_memory_pressure(sk) &&
2455             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2456                 sk_leave_memory_pressure(sk);
2457 }
2458 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2459
2460 /**
2461  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2462  *      @sk: socket
2463  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2464  */
2465 void __sk_mem_reclaim(struct sock *sk, int amount)
2466 {
2467         amount >>= SK_MEM_QUANTUM_SHIFT;
2468         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2469         __sk_mem_reduce_allocated(sk, amount);
2470 }
2471 EXPORT_SYMBOL(__sk_mem_reclaim);
2472
2473 int sk_set_peek_off(struct sock *sk, int val)
2474 {
2475         sk->sk_peek_off = val;
2476         return 0;
2477 }
2478 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2479
2480 /*
2481  * Set of default routines for initialising struct proto_ops when
2482  * the protocol does not support a particular function. In certain
2483  * cases where it makes no sense for a protocol to have a "do nothing"
2484  * function, some default processing is provided.
2485  */
2486
/* Default bind() handler for protocols without bind support. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	/* arguments are ignored */
	return -EOPNOTSUPP;
}
2491 EXPORT_SYMBOL(sock_no_bind);
2492
/* Default connect() handler for protocols without connect support. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	/* arguments are ignored */
	return -EOPNOTSUPP;
}
2498 EXPORT_SYMBOL(sock_no_connect);
2499
/* Default socketpair() handler for protocols without socketpair support. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	/* arguments are ignored */
	return -EOPNOTSUPP;
}
2504 EXPORT_SYMBOL(sock_no_socketpair);
2505
/* Default accept() handler for protocols without accept support. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	/* arguments are ignored */
	return -EOPNOTSUPP;
}
2511 EXPORT_SYMBOL(sock_no_accept);
2512
/* Default getname() handler for protocols without address lookup. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	/* arguments are ignored; *len is left untouched */
	return -EOPNOTSUPP;
}
2518 EXPORT_SYMBOL(sock_no_getname);
2519
2520 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2521 {
2522         return 0;
2523 }
2524 EXPORT_SYMBOL(sock_no_poll);
2525
/* Default ioctl() handler for protocols without ioctl support. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	/* arguments are ignored */
	return -EOPNOTSUPP;
}
2530 EXPORT_SYMBOL(sock_no_ioctl);
2531
/* listen() stub for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);
2537
/* shutdown() stub for protocols without support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);
2543
2544 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2545                     char __user *optval, unsigned int optlen)
2546 {
2547         return -EOPNOTSUPP;
2548 }
2549 EXPORT_SYMBOL(sock_no_setsockopt);
2550
2551 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2552                     char __user *optval, int __user *optlen)
2553 {
2554         return -EOPNOTSUPP;
2555 }
2556 EXPORT_SYMBOL(sock_no_getsockopt);
2557
/* sendmsg() stub for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
2563
/* sendmsg_locked stub taking a struct sock: always -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
2569
/* recvmsg() stub for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
2576
/*
 * mmap() stub for protocols without mmap support. Returns -ENODEV, the
 * same error code reported when the mmap method is missing altogether.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
2583
2584 /*
2585  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2586  * various sock-based usage counts.
2587  */
2588 void __receive_sock(struct file *file)
2589 {
2590         struct socket *sock;
2591         int error;
2592
2593         /*
2594          * The resulting value of "error" is ignored here since we only
2595          * need to take action when the file is a socket and testing
2596          * "sock" for NULL is sufficient.
2597          */
2598         sock = sock_from_file(file, &error);
2599         if (sock) {
2600                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2601                 sock_update_classid(&sock->sk->sk_cgrp_data);
2602         }
2603 }
2604
2605 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2606 {
2607         ssize_t res;
2608         struct msghdr msg = {.msg_flags = flags};
2609         struct kvec iov;
2610         char *kaddr = kmap(page);
2611         iov.iov_base = kaddr + offset;
2612         iov.iov_len = size;
2613         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2614         kunmap(page);
2615         return res;
2616 }
2617 EXPORT_SYMBOL(sock_no_sendpage);
2618
2619 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2620                                 int offset, size_t size, int flags)
2621 {
2622         ssize_t res;
2623         struct msghdr msg = {.msg_flags = flags};
2624         struct kvec iov;
2625         char *kaddr = kmap(page);
2626
2627         iov.iov_base = kaddr + offset;
2628         iov.iov_len = size;
2629         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2630         kunmap(page);
2631         return res;
2632 }
2633 EXPORT_SYMBOL(sock_no_sendpage_locked);
2634
2635 /*
2636  *      Default Socket Callbacks
2637  */
2638
2639 static void sock_def_wakeup(struct sock *sk)
2640 {
2641         struct socket_wq *wq;
2642
2643         rcu_read_lock();
2644         wq = rcu_dereference(sk->sk_wq);
2645         if (skwq_has_sleeper(wq))
2646                 wake_up_interruptible_all(&wq->wait);
2647         rcu_read_unlock();
2648 }
2649
2650 static void sock_def_error_report(struct sock *sk)
2651 {
2652         struct socket_wq *wq;
2653
2654         rcu_read_lock();
2655         wq = rcu_dereference(sk->sk_wq);
2656         if (skwq_has_sleeper(wq))
2657                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2658         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2659         rcu_read_unlock();
2660 }
2661
2662 static void sock_def_readable(struct sock *sk)
2663 {
2664         struct socket_wq *wq;
2665
2666         rcu_read_lock();
2667         wq = rcu_dereference(sk->sk_wq);
2668         if (skwq_has_sleeper(wq))
2669                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2670                                                 POLLRDNORM | POLLRDBAND);
2671         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2672         rcu_read_unlock();
2673 }
2674
2675 static void sock_def_write_space(struct sock *sk)
2676 {
2677         struct socket_wq *wq;
2678
2679         rcu_read_lock();
2680
2681         /* Do not wake up a writer until he can make "significant"
2682          * progress.  --DaveM
2683          */
2684         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2685                 wq = rcu_dereference(sk->sk_wq);
2686                 if (skwq_has_sleeper(wq))
2687                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2688                                                 POLLWRNORM | POLLWRBAND);
2689
2690                 /* Should agree with poll, otherwise some programs break */
2691                 if (sock_writeable(sk))
2692                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2693         }
2694
2695         rcu_read_unlock();
2696 }
2697
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2701
2702 void sk_send_sigurg(struct sock *sk)
2703 {
2704         if (sk->sk_socket && sk->sk_socket->file)
2705                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2706                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2707 }
2708 EXPORT_SYMBOL(sk_send_sigurg);
2709
/*
 * (Re)arm @timer to fire at @expires. A reference on @sk is taken only when
 * mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2717
/*
 * Cancel @timer. The reference on @sk is dropped only when del_timer()
 * returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2724
2725 void sock_init_data(struct socket *sock, struct sock *sk)
2726 {
2727         sk_init_common(sk);
2728         sk->sk_send_head        =       NULL;
2729
2730         init_timer(&sk->sk_timer);
2731
2732         sk->sk_allocation       =       GFP_KERNEL;
2733         sk->sk_rcvbuf           =       sysctl_rmem_default;
2734         sk->sk_sndbuf           =       sysctl_wmem_default;
2735         sk->sk_state            =       TCP_CLOSE;
2736         sk_set_socket(sk, sock);
2737
2738         sock_set_flag(sk, SOCK_ZAPPED);
2739
2740         if (sock) {
2741                 sk->sk_type     =       sock->type;
2742                 sk->sk_wq       =       sock->wq;
2743                 sock->sk        =       sk;
2744                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2745         } else {
2746                 sk->sk_wq       =       NULL;
2747                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2748         }
2749
2750         rwlock_init(&sk->sk_callback_lock);
2751         if (sk->sk_kern_sock)
2752                 lockdep_set_class_and_name(
2753                         &sk->sk_callback_lock,
2754                         af_kern_callback_keys + sk->sk_family,
2755                         af_family_kern_clock_key_strings[sk->sk_family]);
2756         else
2757                 lockdep_set_class_and_name(
2758                         &sk->sk_callback_lock,
2759                         af_callback_keys + sk->sk_family,
2760                         af_family_clock_key_strings[sk->sk_family]);
2761
2762         sk->sk_state_change     =       sock_def_wakeup;
2763         sk->sk_data_ready       =       sock_def_readable;
2764         sk->sk_write_space      =       sock_def_write_space;
2765         sk->sk_error_report     =       sock_def_error_report;
2766         sk->sk_destruct         =       sock_def_destruct;
2767
2768         sk->sk_frag.page        =       NULL;
2769         sk->sk_frag.offset      =       0;
2770         sk->sk_peek_off         =       -1;
2771
2772         sk->sk_peer_pid         =       NULL;
2773         sk->sk_peer_cred        =       NULL;
2774         spin_lock_init(&sk->sk_peer_lock);
2775
2776         sk->sk_write_pending    =       0;
2777         sk->sk_rcvlowat         =       1;
2778         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2779         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2780
2781         sk->sk_stamp = SK_DEFAULT_STAMP;
2782 #if BITS_PER_LONG==32
2783         seqlock_init(&sk->sk_stamp_seq);
2784 #endif
2785         atomic_set(&sk->sk_zckey, 0);
2786
2787 #ifdef CONFIG_NET_RX_BUSY_POLL
2788         sk->sk_napi_id          =       0;
2789         sk->sk_ll_usec          =       sysctl_net_busy_read;
2790 #endif
2791
2792         sk->sk_max_pacing_rate = ~0U;
2793         sk->sk_pacing_rate = ~0U;
2794         sk->sk_incoming_cpu = -1;
2795         /*
2796          * Before updating sk_refcnt, we must commit prior changes to memory
2797          * (Documentation/RCU/rculist_nulls.txt for details)
2798          */
2799         smp_wmb();
2800         refcount_set(&sk->sk_refcnt, 1);
2801         atomic_set(&sk->sk_drops, 0);
2802 }
2803 EXPORT_SYMBOL(sock_init_data);
2804
2805 void lock_sock_nested(struct sock *sk, int subclass)
2806 {
2807         might_sleep();
2808         spin_lock_bh(&sk->sk_lock.slock);
2809         if (sk->sk_lock.owned)
2810                 __lock_sock(sk);
2811         sk->sk_lock.owned = 1;
2812         spin_unlock(&sk->sk_lock.slock);
2813         /*
2814          * The sk_lock has mutex_lock() semantics here:
2815          */
2816         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2817         local_bh_enable();
2818 }
2819 EXPORT_SYMBOL(lock_sock_nested);
2820
2821 void release_sock(struct sock *sk)
2822 {
2823         spin_lock_bh(&sk->sk_lock.slock);
2824         if (sk->sk_backlog.tail)
2825                 __release_sock(sk);
2826
2827         /* Warning : release_cb() might need to release sk ownership,
2828          * ie call sock_release_ownership(sk) before us.
2829          */
2830         if (sk->sk_prot->release_cb)
2831                 sk->sk_prot->release_cb(sk);
2832
2833         sock_release_ownership(sk);
2834         if (waitqueue_active(&sk->sk_lock.wq))
2835                 wake_up(&sk->sk_lock.wq);
2836         spin_unlock_bh(&sk->sk_lock.slock);
2837 }
2838 EXPORT_SYMBOL(release_sock);
2839
2840 /**
2841  * lock_sock_fast - fast version of lock_sock
2842  * @sk: socket
2843  *
2844  * This version should be used for very small section, where process wont block
2845  * return false if fast path is taken:
2846  *
2847  *   sk_lock.slock locked, owned = 0, BH disabled
2848  *
2849  * return true if slow path is taken:
2850  *
2851  *   sk_lock.slock unlocked, owned = 1, BH enabled
2852  */
2853 bool lock_sock_fast(struct sock *sk)
2854 {
2855         might_sleep();
2856         spin_lock_bh(&sk->sk_lock.slock);
2857
2858         if (!sk->sk_lock.owned)
2859                 /*
2860                  * Note : We must disable BH
2861                  */
2862                 return false;
2863
2864         __lock_sock(sk);
2865         sk->sk_lock.owned = 1;
2866         spin_unlock(&sk->sk_lock.slock);
2867         /*
2868          * The sk_lock has mutex_lock() semantics here:
2869          */
2870         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2871         local_bh_enable();
2872         return true;
2873 }
2874 EXPORT_SYMBOL(lock_sock_fast);
2875
2876 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2877 {
2878         struct timeval tv;
2879         if (!sock_flag(sk, SOCK_TIMESTAMP))
2880                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2881         tv = ktime_to_timeval(sk->sk_stamp);
2882         if (tv.tv_sec == -1)
2883                 return -ENOENT;
2884         if (tv.tv_sec == 0) {
2885                 sk->sk_stamp = ktime_get_real();
2886                 tv = ktime_to_timeval(sk->sk_stamp);
2887         }
2888         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2889 }
2890 EXPORT_SYMBOL(sock_get_timestamp);
2891
2892 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2893 {
2894         struct timespec ts;
2895         if (!sock_flag(sk, SOCK_TIMESTAMP))
2896                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2897         ts = ktime_to_timespec(sk->sk_stamp);
2898         if (ts.tv_sec == -1)
2899                 return -ENOENT;
2900         if (ts.tv_sec == 0) {
2901                 sk->sk_stamp = ktime_get_real();
2902                 ts = ktime_to_timespec(sk->sk_stamp);
2903         }
2904         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2905 }
2906 EXPORT_SYMBOL(sock_get_timestampns);
2907
2908 void sock_enable_timestamp(struct sock *sk, int flag)
2909 {
2910         if (!sock_flag(sk, flag)) {
2911                 unsigned long previous_flags = sk->sk_flags;
2912
2913                 sock_set_flag(sk, flag);
2914                 /*
2915                  * we just set one of the two flags which require net
2916                  * time stamping, but time stamping might have been on
2917                  * already because of the other one
2918                  */
2919                 if (sock_needs_netstamp(sk) &&
2920                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2921                         net_enable_timestamp();
2922         }
2923 }
2924
2925 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2926                        int level, int type)
2927 {
2928         struct sock_exterr_skb *serr;
2929         struct sk_buff *skb;
2930         int copied, err;
2931
2932         err = -EAGAIN;
2933         skb = sock_dequeue_err_skb(sk);
2934         if (skb == NULL)
2935                 goto out;
2936
2937         copied = skb->len;
2938         if (copied > len) {
2939                 msg->msg_flags |= MSG_TRUNC;
2940                 copied = len;
2941         }
2942         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2943         if (err)
2944                 goto out_free_skb;
2945
2946         sock_recv_timestamp(msg, sk, skb);
2947
2948         serr = SKB_EXT_ERR(skb);
2949         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2950
2951         msg->msg_flags |= MSG_ERRQUEUE;
2952         err = copied;
2953
2954 out_free_skb:
2955         kfree_skb(skb);
2956 out:
2957         return err;
2958 }
2959 EXPORT_SYMBOL(sock_recv_errqueue);
2960
2961 /*
2962  *      Get a socket option on an socket.
2963  *
2964  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2965  *      asynchronous errors should be reported by getsockopt. We assume
2966  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2967  */
2968 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2969                            char __user *optval, int __user *optlen)
2970 {
2971         struct sock *sk = sock->sk;
2972
2973         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2974 }
2975 EXPORT_SYMBOL(sock_common_getsockopt);
2976
2977 #ifdef CONFIG_COMPAT
2978 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2979                                   char __user *optval, int __user *optlen)
2980 {
2981         struct sock *sk = sock->sk;
2982
2983         if (sk->sk_prot->compat_getsockopt != NULL)
2984                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2985                                                       optval, optlen);
2986         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2987 }
2988 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2989 #endif
2990
2991 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2992                         int flags)
2993 {
2994         struct sock *sk = sock->sk;
2995         int addr_len = 0;
2996         int err;
2997
2998         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2999                                    flags & ~MSG_DONTWAIT, &addr_len);
3000         if (err >= 0)
3001                 msg->msg_namelen = addr_len;
3002         return err;
3003 }
3004 EXPORT_SYMBOL(sock_common_recvmsg);
3005
3006 /*
3007  *      Set socket options on an inet socket.
3008  */
3009 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3010                            char __user *optval, unsigned int optlen)
3011 {
3012         struct sock *sk = sock->sk;
3013
3014         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3015 }
3016 EXPORT_SYMBOL(sock_common_setsockopt);
3017
3018 #ifdef CONFIG_COMPAT
3019 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3020                                   char __user *optval, unsigned int optlen)
3021 {
3022         struct sock *sk = sock->sk;
3023
3024         if (sk->sk_prot->compat_setsockopt != NULL)
3025                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3026                                                       optval, optlen);
3027         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3028 }
3029 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3030 #endif
3031
3032 void sk_common_release(struct sock *sk)
3033 {
3034         if (sk->sk_prot->destroy)
3035                 sk->sk_prot->destroy(sk);
3036
3037         /*
3038          * Observation: when sock_common_release is called, processes have
3039          * no access to socket. But net still has.
3040          * Step one, detach it from networking:
3041          *
3042          * A. Remove from hash tables.
3043          */
3044
3045         sk->sk_prot->unhash(sk);
3046
3047         /*
3048          * In this point socket cannot receive new packets, but it is possible
3049          * that some packets are in flight because some CPU runs receiver and
3050          * did hash table lookup before we unhashed socket. They will achieve
3051          * receive queue and will be purged by socket destructor.
3052          *
3053          * Also we still have packets pending on receive queue and probably,
3054          * our own packets waiting in device queues. sock_destroy will drain
3055          * receive queue, but transmitted packets will delay socket destruction
3056          * until the last reference will be released.
3057          */
3058
3059         sock_orphan(sk);
3060
3061         xfrm_sk_free_policy(sk);
3062
3063         sk_refcnt_debug_release(sk);
3064
3065         sock_put(sk);
3066 }
3067 EXPORT_SYMBOL(sk_common_release);
3068
3069 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3070 {
3071         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3072
3073         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3074         mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3075         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3076         mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3077         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3078         mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3079         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3080         mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3081         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3082 }
3083
3084 #ifdef CONFIG_PROC_FS
3085 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
3086 struct prot_inuse {
3087         int val[PROTO_INUSE_NR];
3088 };
3089
3090 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3091
3092 #ifdef CONFIG_NET_NS
3093 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3094 {
3095         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
3096 }
3097 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3098
3099 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3100 {
3101         int cpu, idx = prot->inuse_idx;
3102         int res = 0;
3103
3104         for_each_possible_cpu(cpu)
3105                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
3106
3107         return res >= 0 ? res : 0;
3108 }
3109 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3110
3111 static int __net_init sock_inuse_init_net(struct net *net)
3112 {
3113         net->core.inuse = alloc_percpu(struct prot_inuse);
3114         return net->core.inuse ? 0 : -ENOMEM;
3115 }
3116
3117 static void __net_exit sock_inuse_exit_net(struct net *net)
3118 {
3119         free_percpu(net->core.inuse);
3120 }
3121
3122 static struct pernet_operations net_inuse_ops = {
3123         .init = sock_inuse_init_net,
3124         .exit = sock_inuse_exit_net,
3125 };
3126
3127 static __init int net_inuse_init(void)
3128 {
3129         if (register_pernet_subsys(&net_inuse_ops))
3130                 panic("Cannot initialize net inuse counters");
3131
3132         return 0;
3133 }
3134
3135 core_initcall(net_inuse_init);
3136 #else
3137 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
3138
3139 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3140 {
3141         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
3142 }
3143 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3144
3145 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3146 {
3147         int cpu, idx = prot->inuse_idx;
3148         int res = 0;
3149
3150         for_each_possible_cpu(cpu)
3151                 res += per_cpu(prot_inuse, cpu).val[idx];
3152
3153         return res >= 0 ? res : 0;
3154 }
3155 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3156 #endif
3157
3158 static void assign_proto_idx(struct proto *prot)
3159 {
3160         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3161
3162         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3163                 pr_err("PROTO_INUSE_NR exhausted\n");
3164                 return;
3165         }
3166
3167         set_bit(prot->inuse_idx, proto_inuse_idx);
3168 }
3169
3170 static void release_proto_idx(struct proto *prot)
3171 {
3172         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3173                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3174 }
3175 #else
/* No counter slots without CONFIG_PROC_FS: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}
3179
/* No counter slots without CONFIG_PROC_FS: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3183 #endif
3184
3185 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3186 {
3187         if (!rsk_prot)
3188                 return;
3189         kfree(rsk_prot->slab_name);
3190         rsk_prot->slab_name = NULL;
3191         kmem_cache_destroy(rsk_prot->slab);
3192         rsk_prot->slab = NULL;
3193 }
3194
3195 static int req_prot_init(const struct proto *prot)
3196 {
3197         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3198
3199         if (!rsk_prot)
3200                 return 0;
3201
3202         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3203                                         prot->name);
3204         if (!rsk_prot->slab_name)
3205                 return -ENOMEM;
3206
3207         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3208                                            rsk_prot->obj_size, 0,
3209                                            prot->slab_flags, NULL);
3210
3211         if (!rsk_prot->slab) {
3212                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3213                         prot->name);
3214                 return -ENOMEM;
3215         }
3216         return 0;
3217 }
3218
3219 int proto_register(struct proto *prot, int alloc_slab)
3220 {
3221         if (alloc_slab) {
3222                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
3223                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
3224                                         NULL);
3225
3226                 if (prot->slab == NULL) {
3227                         pr_crit("%s: Can't create sock SLAB cache!\n",
3228                                 prot->name);
3229                         goto out;
3230                 }
3231
3232                 if (req_prot_init(prot))
3233                         goto out_free_request_sock_slab;
3234
3235                 if (prot->twsk_prot != NULL) {
3236                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3237
3238                         if (prot->twsk_prot->twsk_slab_name == NULL)
3239                                 goto out_free_request_sock_slab;
3240
3241                         prot->twsk_prot->twsk_slab =
3242                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3243                                                   prot->twsk_prot->twsk_obj_size,
3244                                                   0,
3245                                                   prot->slab_flags,
3246                                                   NULL);
3247                         if (prot->twsk_prot->twsk_slab == NULL)
3248                                 goto out_free_timewait_sock_slab_name;
3249                 }
3250         }
3251
3252         mutex_lock(&proto_list_mutex);
3253         list_add(&prot->node, &proto_list);
3254         assign_proto_idx(prot);
3255         mutex_unlock(&proto_list_mutex);
3256         return 0;
3257
3258 out_free_timewait_sock_slab_name:
3259         kfree(prot->twsk_prot->twsk_slab_name);
3260 out_free_request_sock_slab:
3261         req_prot_cleanup(prot->rsk_prot);
3262
3263         kmem_cache_destroy(prot->slab);
3264         prot->slab = NULL;
3265 out:
3266         return -ENOBUFS;
3267 }
3268 EXPORT_SYMBOL(proto_register);
3269
3270 void proto_unregister(struct proto *prot)
3271 {
3272         mutex_lock(&proto_list_mutex);
3273         release_proto_idx(prot);
3274         list_del(&prot->node);
3275         mutex_unlock(&proto_list_mutex);
3276
3277         kmem_cache_destroy(prot->slab);
3278         prot->slab = NULL;
3279
3280         req_prot_cleanup(prot->rsk_prot);
3281
3282         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3283                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3284                 kfree(prot->twsk_prot->twsk_slab_name);
3285                 prot->twsk_prot->twsk_slab = NULL;
3286         }
3287 }
3288 EXPORT_SYMBOL(proto_unregister);
3289
3290 #ifdef CONFIG_PROC_FS
3291 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3292         __acquires(proto_list_mutex)
3293 {
3294         mutex_lock(&proto_list_mutex);
3295         return seq_list_start_head(&proto_list, *pos);
3296 }
3297
3298 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3299 {
3300         return seq_list_next(v, &proto_list, pos);
3301 }
3302
3303 static void proto_seq_stop(struct seq_file *seq, void *v)
3304         __releases(proto_list_mutex)
3305 {
3306         mutex_unlock(&proto_list_mutex);
3307 }
3308
/* Map a method pointer to 'y' (set) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
3313 static long sock_prot_memory_allocated(struct proto *proto)
3314 {
3315         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3316 }
3317
3318 static char *sock_prot_memory_pressure(struct proto *proto)
3319 {
3320         return proto->memory_pressure != NULL ?
3321         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3322 }
3323
3324 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3325 {
3326
3327         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3328                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3329                    proto->name,
3330                    proto->obj_size,
3331                    sock_prot_inuse_get(seq_file_net(seq), proto),
3332                    sock_prot_memory_allocated(proto),
3333                    sock_prot_memory_pressure(proto),
3334                    proto->max_header,
3335                    proto->slab == NULL ? "no" : "yes",
3336                    module_name(proto->owner),
3337                    proto_method_implemented(proto->close),
3338                    proto_method_implemented(proto->connect),
3339                    proto_method_implemented(proto->disconnect),
3340                    proto_method_implemented(proto->accept),
3341                    proto_method_implemented(proto->ioctl),
3342                    proto_method_implemented(proto->init),
3343                    proto_method_implemented(proto->destroy),
3344                    proto_method_implemented(proto->shutdown),
3345                    proto_method_implemented(proto->setsockopt),
3346                    proto_method_implemented(proto->getsockopt),
3347                    proto_method_implemented(proto->sendmsg),
3348                    proto_method_implemented(proto->recvmsg),
3349                    proto_method_implemented(proto->sendpage),
3350                    proto_method_implemented(proto->bind),
3351                    proto_method_implemented(proto->backlog_rcv),
3352                    proto_method_implemented(proto->hash),
3353                    proto_method_implemented(proto->unhash),
3354                    proto_method_implemented(proto->get_port),
3355                    proto_method_implemented(proto->enter_memory_pressure));
3356 }
3357
3358 static int proto_seq_show(struct seq_file *seq, void *v)
3359 {
3360         if (v == &proto_list)
3361                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3362                            "protocol",
3363                            "size",
3364                            "sockets",
3365                            "memory",
3366                            "press",
3367                            "maxhdr",
3368                            "slab",
3369                            "module",
3370                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3371         else
3372                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3373         return 0;
3374 }
3375
3376 static const struct seq_operations proto_seq_ops = {
3377         .start  = proto_seq_start,
3378         .next   = proto_seq_next,
3379         .stop   = proto_seq_stop,
3380         .show   = proto_seq_show,
3381 };
3382
3383 static int proto_seq_open(struct inode *inode, struct file *file)
3384 {
3385         return seq_open_net(inode, file, &proto_seq_ops,
3386                             sizeof(struct seq_net_private));
3387 }
3388
3389 static const struct file_operations proto_seq_fops = {
3390         .owner          = THIS_MODULE,
3391         .open           = proto_seq_open,
3392         .read           = seq_read,
3393         .llseek         = seq_lseek,
3394         .release        = seq_release_net,
3395 };
3396
3397 static __net_init int proto_init_net(struct net *net)
3398 {
3399         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3400                 return -ENOMEM;
3401
3402         return 0;
3403 }
3404
3405 static __net_exit void proto_exit_net(struct net *net)
3406 {
3407         remove_proc_entry("protocols", net->proc_net);
3408 }
3409
3410
3411 static __net_initdata struct pernet_operations proto_net_ops = {
3412         .init = proto_init_net,
3413         .exit = proto_exit_net,
3414 };
3415
3416 static int __init proto_init(void)
3417 {
3418         return register_pernet_subsys(&proto_net_ops);
3419 }
3420
3421 subsys_initcall(proto_init);
3422
3423 #endif /* PROC_FS */
3424
3425 #ifdef CONFIG_NET_RX_BUSY_POLL
3426 bool sk_busy_loop_end(void *p, unsigned long start_time)
3427 {
3428         struct sock *sk = p;
3429
3430         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3431                sk_busy_loop_timeout(sk, start_time);
3432 }
3433 EXPORT_SYMBOL(sk_busy_loop_end);
3434 #endif /* CONFIG_NET_RX_BUSY_POLL */