net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/init.h>
 111 #include <linux/highmem.h>
 112 #include <linux/user_namespace.h>
 113 #include <linux/static_key.h>
 114 #include <linux/memcontrol.h>
 115 #include <linux/prefetch.h>
 116 #include <linux/compat.h>
 117
 118 #include <linux/uaccess.h>
 119
 120 #include <linux/netdevice.h>
 121 #include <net/protocol.h>
 122 #include <linux/skbuff.h>
 123 #include <net/net_namespace.h>
 124 #include <net/request_sock.h>
 125 #include <net/sock.h>
 126 #include <linux/net_tstamp.h>
 127 #include <net/xfrm.h>
 128 #include <linux/ipsec.h>
 129 #include <net/cls_cgroup.h>
 130 #include <net/netprio_cgroup.h>
 131 #include <linux/sock_diag.h>
 132
 133 #include <linux/filter.h>
 134 #include <net/sock_reuseport.h>
 135 #include <net/bpf_sk_storage.h>
 136
 137 #include <trace/events/sock.h>
 138
 139 #include <net/tcp.h>
 140 #include <net/busy_poll.h>
 141
 142 #include <linux/ethtool.h>
 143
 144 static DEFINE_MUTEX(proto_list_mutex);
 145 static LIST_HEAD(proto_list);
 146
 147 static void sock_inuse_add(struct net *net, int val);
 148
 149 /**
 150  * sk_ns_capable - General socket capability test
 151  * @sk: Socket to use a capability on or through
 152  * @user_ns: The user namespace of the capability to use
 153  * @cap: The capability to use
 154  *
 155  * Test to see if the opener of the socket had when the socket was
 156  * created and the current process has the capability @cap in the user
 157  * namespace @user_ns.
 158  */
 159 bool sk_ns_capable(const struct sock *sk,
 160                    struct user_namespace *user_ns, int cap)
 161 {
 162         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 163                 ns_capable(user_ns, cap);
 164 }
 165 EXPORT_SYMBOL(sk_ns_capable);
 166
 167 /**
 168  * sk_capable - Socket global capability test
 169  * @sk: Socket to use a capability on or through
 170  * @cap: The global capability to use
 171  *
 172  * Test to see if the opener of the socket had when the socket was
 173  * created and the current process has the capability @cap in all user
 174  * namespaces.
 175  */
 176 bool sk_capable(const struct sock *sk, int cap)
 177 {
 178         return sk_ns_capable(sk, &init_user_ns, cap);
 179 }
 180 EXPORT_SYMBOL(sk_capable);
 181
 182 /**
 183  * sk_net_capable - Network namespace socket capability test
 184  * @sk: Socket to use a capability on or through
 185  * @cap: The capability to use
 186  *
 187  * Test to see if the opener of the socket had when the socket was created
 188  * and the current process has the capability @cap over the network namespace
 189  * the socket is a member of.
 190  */
 191 bool sk_net_capable(const struct sock *sk, int cap)
 192 {
 193         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 194 }
 195 EXPORT_SYMBOL(sk_net_capable);
 196
 197 /*
 198  * Each address family might have different locking rules, so we have
 199  * one slock key per address family and separate keys for internal and
 200  * userspace sockets.
 201  */
 202 static struct lock_class_key af_family_keys[AF_MAX];
 203 static struct lock_class_key af_family_kern_keys[AF_MAX];
 204 static struct lock_class_key af_family_slock_keys[AF_MAX];
 205 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 206
 207 /*
 208  * Make lock validator output more readable. (we pre-construct these
 209  * strings build-time, so that runtime initialization of socket
 210  * locks is fast):
 211  */
 212
 213 #define _sock_locks(x)                                            \
 214   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 215   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 216   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 217   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 218   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 219   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 220   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 221   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 222   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 223   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 224   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 225   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 226   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 227   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 228   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 229   x "AF_MCTP"  , \
 230   x "AF_MAX"
 231
 232 static const char *const af_family_key_strings[AF_MAX+1] = {
 233         _sock_locks("sk_lock-")
 234 };
 235 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 236         _sock_locks("slock-")
 237 };
 238 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 239         _sock_locks("clock-")
 240 };
 241
 242 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 243         _sock_locks("k-sk_lock-")
 244 };
 245 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 246         _sock_locks("k-slock-")
 247 };
 248 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 249         _sock_locks("k-clock-")
 250 };
 251 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 252         _sock_locks("rlock-")
 253 };
 254 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 255         _sock_locks("wlock-")
 256 };
 257 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 258         _sock_locks("elock-")
 259 };
 260
 261 /*
 262  * sk_callback_lock and sk queues locking rules are per-address-family,
 263  * so split the lock classes by using a per-AF key:
 264  */
 265 static struct lock_class_key af_callback_keys[AF_MAX];
 266 static struct lock_class_key af_rlock_keys[AF_MAX];
 267 static struct lock_class_key af_wlock_keys[AF_MAX];
 268 static struct lock_class_key af_elock_keys[AF_MAX];
 269 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 270
 271 /* Run time adjustable parameters. */
 272 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 273 EXPORT_SYMBOL(sysctl_wmem_max);
 274 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 275 EXPORT_SYMBOL(sysctl_rmem_max);
 276 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 277 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 278
 279 /* Maximal space eaten by iovec or ancillary data plus some space */
 280 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 281 EXPORT_SYMBOL(sysctl_optmem_max);
 282
 283 int sysctl_tstamp_allow_data __read_mostly = 1;
 284
 285 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 286 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 287
 288 /**
 289  * sk_set_memalloc - sets %SOCK_MEMALLOC
 290  * @sk: socket to set it on
 291  *
 292  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 293  * It's the responsibility of the admin to adjust min_free_kbytes
 294  * to meet the requirements
 295  */
 296 void sk_set_memalloc(struct sock *sk)
 297 {
 298         sock_set_flag(sk, SOCK_MEMALLOC);
 299         sk->sk_allocation |= __GFP_MEMALLOC;
 300         static_branch_inc(&memalloc_socks_key);
 301 }
 302 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 303
 304 void sk_clear_memalloc(struct sock *sk)
 305 {
 306         sock_reset_flag(sk, SOCK_MEMALLOC);
 307         sk->sk_allocation &= ~__GFP_MEMALLOC;
 308         static_branch_dec(&memalloc_socks_key);
 309
 310         /*
 311          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 312          * progress of swapping. SOCK_MEMALLOC may be cleared while
 313          * it has rmem allocations due to the last swapfile being deactivated
 314          * but there is a risk that the socket is unusable due to exceeding
 315          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 316          */
 317         sk_mem_reclaim(sk);
 318 }
 319 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 320
 321 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 322 {
 323         int ret;
 324         unsigned int noreclaim_flag;
 325
 326         /* these should have been dropped before queueing */
 327         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 328
 329         noreclaim_flag = memalloc_noreclaim_save();
 330         ret = sk->sk_backlog_rcv(sk, skb);
 331         memalloc_noreclaim_restore(noreclaim_flag);
 332
 333         return ret;
 334 }
 335 EXPORT_SYMBOL(__sk_backlog_rcv);
 336
 337 void sk_error_report(struct sock *sk)
 338 {
 339         sk->sk_error_report(sk);
 340
 341         switch (sk->sk_family) {
 342         case AF_INET:
 343                 fallthrough;
 344         case AF_INET6:
 345                 trace_inet_sk_error_report(sk);
 346                 break;
 347         default:
 348                 break;
 349         }
 350 }
 351 EXPORT_SYMBOL(sk_error_report);
 352
 353 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 354 {
 355         struct __kernel_sock_timeval tv;
 356
 357         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 358                 tv.tv_sec = 0;
 359                 tv.tv_usec = 0;
 360         } else {
 361                 tv.tv_sec = timeo / HZ;
 362                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 363         }
 364
 365         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 366                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 367                 *(struct old_timeval32 *)optval = tv32;
 368                 return sizeof(tv32);
 369         }
 370
 371         if (old_timeval) {
 372                 struct __kernel_old_timeval old_tv;
 373                 old_tv.tv_sec = tv.tv_sec;
 374                 old_tv.tv_usec = tv.tv_usec;
 375                 *(struct __kernel_old_timeval *)optval = old_tv;
 376                 return sizeof(old_tv);
 377         }
 378
 379         *(struct __kernel_sock_timeval *)optval = tv;
 380         return sizeof(tv);
 381 }
 382
 383 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 384                             bool old_timeval)
 385 {
 386         struct __kernel_sock_timeval tv;
 387
 388         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 389                 struct old_timeval32 tv32;
 390
 391                 if (optlen < sizeof(tv32))
 392                         return -EINVAL;
 393
 394                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 395                         return -EFAULT;
 396                 tv.tv_sec = tv32.tv_sec;
 397                 tv.tv_usec = tv32.tv_usec;
 398         } else if (old_timeval) {
 399                 struct __kernel_old_timeval old_tv;
 400
 401                 if (optlen < sizeof(old_tv))
 402                         return -EINVAL;
 403                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 404                         return -EFAULT;
 405                 tv.tv_sec = old_tv.tv_sec;
 406                 tv.tv_usec = old_tv.tv_usec;
 407         } else {
 408                 if (optlen < sizeof(tv))
 409                         return -EINVAL;
 410                 if (copy_from_sockptr(&tv, optval, sizeof(tv)))
 411                         return -EFAULT;
 412         }
 413         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 414                 return -EDOM;
 415
 416         if (tv.tv_sec < 0) {
 417                 static int warned __read_mostly;
 418
 419                 *timeo_p = 0;
 420                 if (warned < 10 && net_ratelimit()) {
 421                         warned++;
 422                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 423                                 __func__, current->comm, task_pid_nr(current));
 424                 }
 425                 return 0;
 426         }
 427         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 428         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 429                 return 0;
 430         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 431                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 432         return 0;
 433 }
 434
 435 static bool sock_needs_netstamp(const struct sock *sk)
 436 {
 437         switch (sk->sk_family) {
 438         case AF_UNSPEC:
 439         case AF_UNIX:
 440                 return false;
 441         default:
 442                 return true;
 443         }
 444 }
 445
 446 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 447 {
 448         if (sk->sk_flags & flags) {
 449                 sk->sk_flags &= ~flags;
 450                 if (sock_needs_netstamp(sk) &&
 451                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 452                         net_disable_timestamp();
 453         }
 454 }
 455
 456
 457 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 458 {
 459         unsigned long flags;
 460         struct sk_buff_head *list = &sk->sk_receive_queue;
 461
 462         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 463                 atomic_inc(&sk->sk_drops);
 464                 trace_sock_rcvqueue_full(sk, skb);
 465                 return -ENOMEM;
 466         }
 467
 468         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 469                 atomic_inc(&sk->sk_drops);
 470                 return -ENOBUFS;
 471         }
 472
 473         skb->dev = NULL;
 474         skb_set_owner_r(skb, sk);
 475
 476         /* we escape from rcu protected region, make sure we dont leak
 477          * a norefcounted dst
 478          */
 479         skb_dst_force(skb);
 480
 481         spin_lock_irqsave(&list->lock, flags);
 482         sock_skb_set_dropcount(sk, skb);
 483         __skb_queue_tail(list, skb);
 484         spin_unlock_irqrestore(&list->lock, flags);
 485
 486         if (!sock_flag(sk, SOCK_DEAD))
 487                 sk->sk_data_ready(sk);
 488         return 0;
 489 }
 490 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 491
 492 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 493 {
 494         int err;
 495
 496         err = sk_filter(sk, skb);
 497         if (err)
 498                 return err;
 499
 500         return __sock_queue_rcv_skb(sk, skb);
 501 }
 502 EXPORT_SYMBOL(sock_queue_rcv_skb);
 503
 504 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 505                      const int nested, unsigned int trim_cap, bool refcounted)
 506 {
 507         int rc = NET_RX_SUCCESS;
 508
 509         if (sk_filter_trim_cap(sk, skb, trim_cap))
 510                 goto discard_and_relse;
 511
 512         skb->dev = NULL;
 513
 514         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 515                 atomic_inc(&sk->sk_drops);
 516                 goto discard_and_relse;
 517         }
 518         if (nested)
 519                 bh_lock_sock_nested(sk);
 520         else
 521                 bh_lock_sock(sk);
 522         if (!sock_owned_by_user(sk)) {
 523                 /*
 524                  * trylock + unlock semantics:
 525                  */
 526                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 527
 528                 rc = sk_backlog_rcv(sk, skb);
 529
 530                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 531         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 532                 bh_unlock_sock(sk);
 533                 atomic_inc(&sk->sk_drops);
 534                 goto discard_and_relse;
 535         }
 536
 537         bh_unlock_sock(sk);
 538 out:
 539         if (refcounted)
 540                 sock_put(sk);
 541         return rc;
 542 discard_and_relse:
 543         kfree_skb(skb);
 544         goto out;
 545 }
 546 EXPORT_SYMBOL(__sk_receive_skb);
 547
 548 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 549                                                           u32));
 550 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 551                                                            u32));
 552 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 553 {
 554         struct dst_entry *dst = __sk_dst_get(sk);
 555
 556         if (dst && dst->obsolete &&
 557             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 558                                dst, cookie) == NULL) {
 559                 sk_tx_queue_clear(sk);
 560                 sk->sk_dst_pending_confirm = 0;
 561                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 562                 dst_release(dst);
 563                 return NULL;
 564         }
 565
 566         return dst;
 567 }
 568 EXPORT_SYMBOL(__sk_dst_check);
 569
 570 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 571 {
 572         struct dst_entry *dst = sk_dst_get(sk);
 573
 574         if (dst && dst->obsolete &&
 575             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 576                                dst, cookie) == NULL) {
 577                 sk_dst_reset(sk);
 578                 dst_release(dst);
 579                 return NULL;
 580         }
 581
 582         return dst;
 583 }
 584 EXPORT_SYMBOL(sk_dst_check);
 585
 586 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 587 {
 588         int ret = -ENOPROTOOPT;
 589 #ifdef CONFIG_NETDEVICES
 590         struct net *net = sock_net(sk);
 591
 592         /* Sorry... */
 593         ret = -EPERM;
 594         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 595                 goto out;
 596
 597         ret = -EINVAL;
 598         if (ifindex < 0)
 599                 goto out;
 600
 601         sk->sk_bound_dev_if = ifindex;
 602         if (sk->sk_prot->rehash)
 603                 sk->sk_prot->rehash(sk);
 604         sk_dst_reset(sk);
 605
 606         ret = 0;
 607
 608 out:
 609 #endif
 610
 611         return ret;
 612 }
 613
 614 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 615 {
 616         int ret;
 617
 618         if (lock_sk)
 619                 lock_sock(sk);
 620         ret = sock_bindtoindex_locked(sk, ifindex);
 621         if (lock_sk)
 622                 release_sock(sk);
 623
 624         return ret;
 625 }
 626 EXPORT_SYMBOL(sock_bindtoindex);
 627
 628 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 629 {
 630         int ret = -ENOPROTOOPT;
 631 #ifdef CONFIG_NETDEVICES
 632         struct net *net = sock_net(sk);
 633         char devname[IFNAMSIZ];
 634         int index;
 635
 636         ret = -EINVAL;
 637         if (optlen < 0)
 638                 goto out;
 639
 640         /* Bind this socket to a particular device like "eth0",
 641          * as specified in the passed interface name. If the
 642          * name is "" or the option length is zero the socket
 643          * is not bound.
 644          */
 645         if (optlen > IFNAMSIZ - 1)
 646                 optlen = IFNAMSIZ - 1;
 647         memset(devname, 0, sizeof(devname));
 648
 649         ret = -EFAULT;
 650         if (copy_from_sockptr(devname, optval, optlen))
 651                 goto out;
 652
 653         index = 0;
 654         if (devname[0] != '\0') {
 655                 struct net_device *dev;
 656
 657                 rcu_read_lock();
 658                 dev = dev_get_by_name_rcu(net, devname);
 659                 if (dev)
 660                         index = dev->ifindex;
 661                 rcu_read_unlock();
 662                 ret = -ENODEV;
 663                 if (!dev)
 664                         goto out;
 665         }
 666
 667         return sock_bindtoindex(sk, index, true);
 668 out:
 669 #endif
 670
 671         return ret;
 672 }
 673
 674 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 675                                 int __user *optlen, int len)
 676 {
 677         int ret = -ENOPROTOOPT;
 678 #ifdef CONFIG_NETDEVICES
 679         struct net *net = sock_net(sk);
 680         char devname[IFNAMSIZ];
 681
 682         if (sk->sk_bound_dev_if == 0) {
 683                 len = 0;
 684                 goto zero;
 685         }
 686
 687         ret = -EINVAL;
 688         if (len < IFNAMSIZ)
 689                 goto out;
 690
 691         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 692         if (ret)
 693                 goto out;
 694
 695         len = strlen(devname) + 1;
 696
 697         ret = -EFAULT;
 698         if (copy_to_user(optval, devname, len))
 699                 goto out;
 700
 701 zero:
 702         ret = -EFAULT;
 703         if (put_user(len, optlen))
 704                 goto out;
 705
 706         ret = 0;
 707
 708 out:
 709 #endif
 710
 711         return ret;
 712 }
 713
 714 bool sk_mc_loop(struct sock *sk)
 715 {
 716         if (dev_recursion_level())
 717                 return false;
 718         if (!sk)
 719                 return true;
 720         /* IPV6_ADDRFORM can change sk->sk_family under us. */
 721         switch (READ_ONCE(sk->sk_family)) {
 722         case AF_INET:
 723                 return inet_sk(sk)->mc_loop;
 724 #if IS_ENABLED(CONFIG_IPV6)
 725         case AF_INET6:
 726                 return inet6_sk(sk)->mc_loop;
 727 #endif
 728         }
 729         WARN_ON_ONCE(1);
 730         return true;
 731 }
 732 EXPORT_SYMBOL(sk_mc_loop);
 733
 734 void sock_set_reuseaddr(struct sock *sk)
 735 {
 736         lock_sock(sk);
 737         sk->sk_reuse = SK_CAN_REUSE;
 738         release_sock(sk);
 739 }
 740 EXPORT_SYMBOL(sock_set_reuseaddr);
 741
 742 void sock_set_reuseport(struct sock *sk)
 743 {
 744         lock_sock(sk);
 745         sk->sk_reuseport = true;
 746         release_sock(sk);
 747 }
 748 EXPORT_SYMBOL(sock_set_reuseport);
 749
 750 void sock_no_linger(struct sock *sk)
 751 {
 752         lock_sock(sk);
 753         sk->sk_lingertime = 0;
 754         sock_set_flag(sk, SOCK_LINGER);
 755         release_sock(sk);
 756 }
 757 EXPORT_SYMBOL(sock_no_linger);
 758
 759 void sock_set_priority(struct sock *sk, u32 priority)
 760 {
 761         lock_sock(sk);
 762         sk->sk_priority = priority;
 763         release_sock(sk);
 764 }
 765 EXPORT_SYMBOL(sock_set_priority);
 766
 767 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 768 {
 769         lock_sock(sk);
 770         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 771                 sk->sk_sndtimeo = secs * HZ;
 772         else
 773                 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 774         release_sock(sk);
 775 }
 776 EXPORT_SYMBOL(sock_set_sndtimeo);
 777
 778 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 779 {
 780         if (val)  {
 781                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 782                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 783                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 784                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 785         } else {
 786                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 787                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 788         }
 789 }
 790
 791 void sock_enable_timestamps(struct sock *sk)
 792 {
 793         lock_sock(sk);
 794         __sock_set_timestamps(sk, true, false, true);
 795         release_sock(sk);
 796 }
 797 EXPORT_SYMBOL(sock_enable_timestamps);
 798
 799 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 800 {
 801         switch (optname) {
 802         case SO_TIMESTAMP_OLD:
 803                 __sock_set_timestamps(sk, valbool, false, false);
 804                 break;
 805         case SO_TIMESTAMP_NEW:
 806                 __sock_set_timestamps(sk, valbool, true, false);
 807                 break;
 808         case SO_TIMESTAMPNS_OLD:
 809                 __sock_set_timestamps(sk, valbool, false, true);
 810                 break;
 811         case SO_TIMESTAMPNS_NEW:
 812                 __sock_set_timestamps(sk, valbool, true, true);
 813                 break;
 814         }
 815 }
 816
 817 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 818 {
 819         struct net *net = sock_net(sk);
 820         struct net_device *dev = NULL;
 821         bool match = false;
 822         int *vclock_index;
 823         int i, num;
 824
 825         if (sk->sk_bound_dev_if)
 826                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 827
 828         if (!dev) {
 829                 pr_err("%s: sock not bind to device\n", __func__);
 830                 return -EOPNOTSUPP;
 831         }
 832
 833         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 834         dev_put(dev);
 835
 836         for (i = 0; i < num; i++) {
 837                 if (*(vclock_index + i) == phc_index) {
 838                         match = true;
 839                         break;
 840                 }
 841         }
 842
 843         if (num > 0)
 844                 kfree(vclock_index);
 845
 846         if (!match)
 847                 return -EINVAL;
 848
 849         sk->sk_bind_phc = phc_index;
 850
 851         return 0;
 852 }
 853
 854 int sock_set_timestamping(struct sock *sk, int optname,
 855                           struct so_timestamping timestamping)
 856 {
 857         int val = timestamping.flags;
 858         int ret;
 859
 860         if (val & ~SOF_TIMESTAMPING_MASK)
 861                 return -EINVAL;
 862
 863         if (val & SOF_TIMESTAMPING_OPT_ID &&
 864             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 865                 if (sk->sk_protocol == IPPROTO_TCP &&
 866                     sk->sk_type == SOCK_STREAM) {
 867                         if ((1 << sk->sk_state) &
 868                             (TCPF_CLOSE | TCPF_LISTEN))
 869                                 return -EINVAL;
 870                         atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 871                 } else {
 872                         atomic_set(&sk->sk_tskey, 0);
 873                 }
 874         }
 875
 876         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 877             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 878                 return -EINVAL;
 879
 880         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 881                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 882                 if (ret)
 883                         return ret;
 884         }
 885
 886         sk->sk_tsflags = val;
 887         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 888
 889         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 890                 sock_enable_timestamp(sk,
 891                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 892         else
 893                 sock_disable_timestamp(sk,
 894                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 895         return 0;
 896 }
 897
 898 void sock_set_keepalive(struct sock *sk)
 899 {
 900         lock_sock(sk);
 901         if (sk->sk_prot->keepalive)
 902                 sk->sk_prot->keepalive(sk, true);
 903         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 904         release_sock(sk);
 905 }
 906 EXPORT_SYMBOL(sock_set_keepalive);
 907
 908 static void __sock_set_rcvbuf(struct sock *sk, int val)
 909 {
 910         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 911          * as a negative value.
 912          */
 913         val = min_t(int, val, INT_MAX / 2);
 914         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 915
 916         /* We double it on the way in to account for "struct sk_buff" etc.
 917          * overhead.   Applications assume that the SO_RCVBUF setting they make
 918          * will allow that much actual data to be received on that socket.
 919          *
 920          * Applications are unaware that "struct sk_buff" and other overheads
 921          * allocate from the receive buffer during socket buffer allocation.
 922          *
 923          * And after considering the possible alternatives, returning the value
 924          * we actually used in getsockopt is the most desirable behavior.
 925          */
 926         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 927 }
 928
 929 void sock_set_rcvbuf(struct sock *sk, int val)
 930 {
 931         lock_sock(sk);
 932         __sock_set_rcvbuf(sk, val);
 933         release_sock(sk);
 934 }
 935 EXPORT_SYMBOL(sock_set_rcvbuf);
 936
 937 static void __sock_set_mark(struct sock *sk, u32 val)
 938 {
 939         if (val != sk->sk_mark) {
 940                 sk->sk_mark = val;
 941                 sk_dst_reset(sk);
 942         }
 943 }
 944
 945 void sock_set_mark(struct sock *sk, u32 val)
 946 {
 947         lock_sock(sk);
 948         __sock_set_mark(sk, val);
 949         release_sock(sk);
 950 }
 951 EXPORT_SYMBOL(sock_set_mark);
 952
 953 /*
 954  *      This is meant for all protocols to use and covers goings on
 955  *      at the socket level. Everything here is generic.
 956  */
 957
 958 int sock_setsockopt(struct socket *sock, int level, int optname,
 959                     sockptr_t optval, unsigned int optlen)
 960 {
 961         struct so_timestamping timestamping;
 962         struct sock_txtime sk_txtime;
 963         struct sock *sk = sock->sk;
 964         int val;
 965         int valbool;
 966         struct linger ling;
 967         int ret = 0;
 968
 969         /*
 970          *      Options without arguments
 971          */
 972
 973         if (optname == SO_BINDTODEVICE)
 974                 return sock_setbindtodevice(sk, optval, optlen);
 975
 976         if (optlen < sizeof(int))
 977                 return -EINVAL;
 978
 979         if (copy_from_sockptr(&val, optval, sizeof(val)))
 980                 return -EFAULT;
 981
 982         valbool = val ? 1 : 0;
 983
 984         lock_sock(sk);
 985
 986         switch (optname) {
 987         case SO_DEBUG:
 988                 if (val && !capable(CAP_NET_ADMIN))
 989                         ret = -EACCES;
 990                 else
 991                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 992                 break;
 993         case SO_REUSEADDR:
 994                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 995                 break;
 996         case SO_REUSEPORT:
 997                 sk->sk_reuseport = valbool;
 998                 break;
 999         case SO_TYPE:
1000         case SO_PROTOCOL:
1001         case SO_DOMAIN:
1002         case SO_ERROR:
1003                 ret = -ENOPROTOOPT;
1004                 break;
1005         case SO_DONTROUTE:
1006                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1007                 sk_dst_reset(sk);
1008                 break;
1009         case SO_BROADCAST:
1010                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1011                 break;
1012         case SO_SNDBUF:
1013                 /* Don't error on this BSD doesn't and if you think
1014                  * about it this is right. Otherwise apps have to
1015                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1016                  * are treated in BSD as hints
1017                  */
1018                 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1019 set_sndbuf:
1020                 /* Ensure val * 2 fits into an int, to prevent max_t()
1021                  * from treating it as a negative value.
1022                  */
1023                 val = min_t(int, val, INT_MAX / 2);
1024                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1025                 WRITE_ONCE(sk->sk_sndbuf,
1026                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
1027                 /* Wake up sending tasks if we upped the value. */
1028                 sk->sk_write_space(sk);
1029                 break;
1030
1031         case SO_SNDBUFFORCE:
1032                 if (!capable(CAP_NET_ADMIN)) {
1033                         ret = -EPERM;
1034                         break;
1035                 }
1036
1037                 /* No negative values (to prevent underflow, as val will be
1038                  * multiplied by 2).
1039                  */
1040                 if (val < 0)
1041                         val = 0;
1042                 goto set_sndbuf;
1043
1044         case SO_RCVBUF:
1045                 /* Don't error on this BSD doesn't and if you think
1046                  * about it this is right. Otherwise apps have to
1047                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1048                  * are treated in BSD as hints
1049                  */
1050                 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1051                 break;
1052
1053         case SO_RCVBUFFORCE:
1054                 if (!capable(CAP_NET_ADMIN)) {
1055                         ret = -EPERM;
1056                         break;
1057                 }
1058
1059                 /* No negative values (to prevent underflow, as val will be
1060                  * multiplied by 2).
1061                  */
1062                 __sock_set_rcvbuf(sk, max(val, 0));
1063                 break;
1064
1065         case SO_KEEPALIVE:
1066                 if (sk->sk_prot->keepalive)
1067                         sk->sk_prot->keepalive(sk, valbool);
1068                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1069                 break;
1070
1071         case SO_OOBINLINE:
1072                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1073                 break;
1074
1075         case SO_NO_CHECK:
1076                 sk->sk_no_check_tx = valbool;
1077                 break;
1078
1079         case SO_PRIORITY:
1080                 if ((val >= 0 && val <= 6) ||
1081                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1082                         sk->sk_priority = val;
1083                 else
1084                         ret = -EPERM;
1085                 break;
1086
1087         case SO_LINGER:
1088                 if (optlen < sizeof(ling)) {
1089                         ret = -EINVAL;  /* 1003.1g */
1090                         break;
1091                 }
1092                 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1093                         ret = -EFAULT;
1094                         break;
1095                 }
1096                 if (!ling.l_onoff)
1097                         sock_reset_flag(sk, SOCK_LINGER);
1098                 else {
1099 #if (BITS_PER_LONG == 32)
1100                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1101                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1102                         else
1103 #endif
1104                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1105                         sock_set_flag(sk, SOCK_LINGER);
1106                 }
1107                 break;
1108
1109         case SO_BSDCOMPAT:
1110                 break;
1111
1112         case SO_PASSCRED:
1113                 if (valbool)
1114                         set_bit(SOCK_PASSCRED, &sock->flags);
1115                 else
1116                         clear_bit(SOCK_PASSCRED, &sock->flags);
1117                 break;
1118
1119         case SO_TIMESTAMP_OLD:
1120         case SO_TIMESTAMP_NEW:
1121         case SO_TIMESTAMPNS_OLD:
1122         case SO_TIMESTAMPNS_NEW:
1123                 sock_set_timestamp(sk, optname, valbool);
1124                 break;
1125
1126         case SO_TIMESTAMPING_NEW:
1127         case SO_TIMESTAMPING_OLD:
1128                 if (optlen == sizeof(timestamping)) {
1129                         if (copy_from_sockptr(&timestamping, optval,
1130                                               sizeof(timestamping))) {
1131                                 ret = -EFAULT;
1132                                 break;
1133                         }
1134                 } else {
1135                         memset(&timestamping, 0, sizeof(timestamping));
1136                         timestamping.flags = val;
1137                 }
1138                 ret = sock_set_timestamping(sk, optname, timestamping);
1139                 break;
1140
1141         case SO_RCVLOWAT:
1142                 if (val < 0)
1143                         val = INT_MAX;
1144                 if (sock->ops->set_rcvlowat)
1145                         ret = sock->ops->set_rcvlowat(sk, val);
1146                 else
1147                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1148                 break;
1149
1150         case SO_RCVTIMEO_OLD:
1151         case SO_RCVTIMEO_NEW:
1152                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1153                                        optlen, optname == SO_RCVTIMEO_OLD);
1154                 break;
1155
1156         case SO_SNDTIMEO_OLD:
1157         case SO_SNDTIMEO_NEW:
1158                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1159                                        optlen, optname == SO_SNDTIMEO_OLD);
1160                 break;
1161
1162         case SO_ATTACH_FILTER: {
1163                 struct sock_fprog fprog;
1164
1165                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1166                 if (!ret)
1167                         ret = sk_attach_filter(&fprog, sk);
1168                 break;
1169         }
1170         case SO_ATTACH_BPF:
1171                 ret = -EINVAL;
1172                 if (optlen == sizeof(u32)) {
1173                         u32 ufd;
1174
1175                         ret = -EFAULT;
1176                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1177                                 break;
1178
1179                         ret = sk_attach_bpf(ufd, sk);
1180                 }
1181                 break;
1182
1183         case SO_ATTACH_REUSEPORT_CBPF: {
1184                 struct sock_fprog fprog;
1185
1186                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1187                 if (!ret)
1188                         ret = sk_reuseport_attach_filter(&fprog, sk);
1189                 break;
1190         }
1191         case SO_ATTACH_REUSEPORT_EBPF:
1192                 ret = -EINVAL;
1193                 if (optlen == sizeof(u32)) {
1194                         u32 ufd;
1195
1196                         ret = -EFAULT;
1197                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1198                                 break;
1199
1200                         ret = sk_reuseport_attach_bpf(ufd, sk);
1201                 }
1202                 break;
1203
1204         case SO_DETACH_REUSEPORT_BPF:
1205                 ret = reuseport_detach_prog(sk);
1206                 break;
1207
1208         case SO_DETACH_FILTER:
1209                 ret = sk_detach_filter(sk);
1210                 break;
1211
1212         case SO_LOCK_FILTER:
1213                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1214                         ret = -EPERM;
1215                 else
1216                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1217                 break;
1218
1219         case SO_PASSSEC:
1220                 if (valbool)
1221                         set_bit(SOCK_PASSSEC, &sock->flags);
1222                 else
1223                         clear_bit(SOCK_PASSSEC, &sock->flags);
1224                 break;
1225         case SO_MARK:
1226                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1227                         ret = -EPERM;
1228                         break;
1229                 }
1230
1231                 __sock_set_mark(sk, val);
1232                 break;
1233
1234         case SO_RXQ_OVFL:
1235                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1236                 break;
1237
1238         case SO_WIFI_STATUS:
1239                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1240                 break;
1241
1242         case SO_PEEK_OFF:
1243                 if (sock->ops->set_peek_off)
1244                         ret = sock->ops->set_peek_off(sk, val);
1245                 else
1246                         ret = -EOPNOTSUPP;
1247                 break;
1248
1249         case SO_NOFCS:
1250                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1251                 break;
1252
1253         case SO_SELECT_ERR_QUEUE:
1254                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1255                 break;
1256
1257 #ifdef CONFIG_NET_RX_BUSY_POLL
1258         case SO_BUSY_POLL:
1259                 /* allow unprivileged users to decrease the value */
1260                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1261                         ret = -EPERM;
1262                 else {
1263                         if (val < 0)
1264                                 ret = -EINVAL;
1265                         else
1266                                 WRITE_ONCE(sk->sk_ll_usec, val);
1267                 }
1268                 break;
1269         case SO_PREFER_BUSY_POLL:
1270                 if (valbool && !capable(CAP_NET_ADMIN))
1271                         ret = -EPERM;
1272                 else
1273                         WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1274                 break;
1275         case SO_BUSY_POLL_BUDGET:
1276                 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1277                         ret = -EPERM;
1278                 } else {
1279                         if (val < 0 || val > U16_MAX)
1280                                 ret = -EINVAL;
1281                         else
1282                                 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1283                 }
1284                 break;
1285 #endif
1286
1287         case SO_MAX_PACING_RATE:
1288                 {
1289                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1290
1291                 if (sizeof(ulval) != sizeof(val) &&
1292                     optlen >= sizeof(ulval) &&
1293                     copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1294                         ret = -EFAULT;
1295                         break;
1296                 }
1297                 if (ulval != ~0UL)
1298                         cmpxchg(&sk->sk_pacing_status,
1299                                 SK_PACING_NONE,
1300                                 SK_PACING_NEEDED);
1301                 /* Pairs with READ_ONCE() from sk_getsockopt() */
1302                 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1303                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1304                 break;
1305                 }
1306         case SO_INCOMING_CPU:
1307                 reuseport_update_incoming_cpu(sk, val);
1308                 break;
1309
1310         case SO_CNX_ADVICE:
1311                 if (val == 1)
1312                         dst_negative_advice(sk);
1313                 break;
1314
1315         case SO_ZEROCOPY:
1316                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1317                         if (!((sk->sk_type == SOCK_STREAM &&
1318                                sk->sk_protocol == IPPROTO_TCP) ||
1319                               (sk->sk_type == SOCK_DGRAM &&
1320                                sk->sk_protocol == IPPROTO_UDP)))
1321                                 ret = -ENOTSUPP;
1322                 } else if (sk->sk_family != PF_RDS) {
1323                         ret = -ENOTSUPP;
1324                 }
1325                 if (!ret) {
1326                         if (val < 0 || val > 1)
1327                                 ret = -EINVAL;
1328                         else
1329                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1330                 }
1331                 break;
1332
1333         case SO_TXTIME:
1334                 if (optlen != sizeof(struct sock_txtime)) {
1335                         ret = -EINVAL;
1336                         break;
1337                 } else if (copy_from_sockptr(&sk_txtime, optval,
1338                            sizeof(struct sock_txtime))) {
1339                         ret = -EFAULT;
1340                         break;
1341                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1342                         ret = -EINVAL;
1343                         break;
1344                 }
1345                 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1346                  * scheduler has enough safe guards.
1347                  */
1348                 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1349                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1350                         ret = -EPERM;
1351                         break;
1352                 }
1353                 sock_valbool_flag(sk, SOCK_TXTIME, true);
1354                 sk->sk_clockid = sk_txtime.clockid;
1355                 sk->sk_txtime_deadline_mode =
1356                         !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1357                 sk->sk_txtime_report_errors =
1358                         !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1359                 break;
1360
1361         case SO_BINDTOIFINDEX:
1362                 ret = sock_bindtoindex_locked(sk, val);
1363                 break;
1364
1365         case SO_BUF_LOCK:
1366                 if (val & ~SOCK_BUF_LOCK_MASK) {
1367                         ret = -EINVAL;
1368                         break;
1369                 }
1370                 sk->sk_userlocks = val | (sk->sk_userlocks &
1371                                           ~SOCK_BUF_LOCK_MASK);
1372                 break;
1373
1374         default:
1375                 ret = -ENOPROTOOPT;
1376                 break;
1377         }
1378         release_sock(sk);
1379         return ret;
1380 }
1381 EXPORT_SYMBOL(sock_setsockopt);
1382
1383 static const struct cred *sk_get_peer_cred(struct sock *sk)
1384 {
1385         const struct cred *cred;
1386
1387         spin_lock(&sk->sk_peer_lock);
1388         cred = get_cred(sk->sk_peer_cred);
1389         spin_unlock(&sk->sk_peer_lock);
1390
1391         return cred;
1392 }
1393
1394 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1395                           struct ucred *ucred)
1396 {
1397         ucred->pid = pid_vnr(pid);
1398         ucred->uid = ucred->gid = -1;
1399         if (cred) {
1400                 struct user_namespace *current_ns = current_user_ns();
1401
1402                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1403                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1404         }
1405 }
1406
1407 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1408 {
1409         struct user_namespace *user_ns = current_user_ns();
1410         int i;
1411
1412         for (i = 0; i < src->ngroups; i++)
1413                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1414                         return -EFAULT;
1415
1416         return 0;
1417 }
1418
1419 int sock_getsockopt(struct socket *sock, int level, int optname,
1420                     char __user *optval, int __user *optlen)
1421 {
1422         struct sock *sk = sock->sk;
1423
1424         union {
1425                 int val;
1426                 u64 val64;
1427                 unsigned long ulval;
1428                 struct linger ling;
1429                 struct old_timeval32 tm32;
1430                 struct __kernel_old_timeval tm;
1431                 struct  __kernel_sock_timeval stm;
1432                 struct sock_txtime txtime;
1433                 struct so_timestamping timestamping;
1434         } v;
1435
1436         int lv = sizeof(int);
1437         int len;
1438
1439         if (get_user(len, optlen))
1440                 return -EFAULT;
1441         if (len < 0)
1442                 return -EINVAL;
1443
1444         memset(&v, 0, sizeof(v));
1445
1446         switch (optname) {
1447         case SO_DEBUG:
1448                 v.val = sock_flag(sk, SOCK_DBG);
1449                 break;
1450
1451         case SO_DONTROUTE:
1452                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1453                 break;
1454
1455         case SO_BROADCAST:
1456                 v.val = sock_flag(sk, SOCK_BROADCAST);
1457                 break;
1458
1459         case SO_SNDBUF:
1460                 v.val = READ_ONCE(sk->sk_sndbuf);
1461                 break;
1462
1463         case SO_RCVBUF:
1464                 v.val = READ_ONCE(sk->sk_rcvbuf);
1465                 break;
1466
1467         case SO_REUSEADDR:
1468                 v.val = sk->sk_reuse;
1469                 break;
1470
1471         case SO_REUSEPORT:
1472                 v.val = sk->sk_reuseport;
1473                 break;
1474
1475         case SO_KEEPALIVE:
1476                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1477                 break;
1478
1479         case SO_TYPE:
1480                 v.val = sk->sk_type;
1481                 break;
1482
1483         case SO_PROTOCOL:
1484                 v.val = sk->sk_protocol;
1485                 break;
1486
1487         case SO_DOMAIN:
1488                 v.val = sk->sk_family;
1489                 break;
1490
1491         case SO_ERROR:
1492                 v.val = -sock_error(sk);
1493                 if (v.val == 0)
1494                         v.val = xchg(&sk->sk_err_soft, 0);
1495                 break;
1496
1497         case SO_OOBINLINE:
1498                 v.val = sock_flag(sk, SOCK_URGINLINE);
1499                 break;
1500
1501         case SO_NO_CHECK:
1502                 v.val = sk->sk_no_check_tx;
1503                 break;
1504
1505         case SO_PRIORITY:
1506                 v.val = sk->sk_priority;
1507                 break;
1508
1509         case SO_LINGER:
1510                 lv              = sizeof(v.ling);
1511                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1512                 v.ling.l_linger = sk->sk_lingertime / HZ;
1513                 break;
1514
1515         case SO_BSDCOMPAT:
1516                 break;
1517
1518         case SO_TIMESTAMP_OLD:
1519                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1520                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1521                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1522                 break;
1523
1524         case SO_TIMESTAMPNS_OLD:
1525                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1526                 break;
1527
1528         case SO_TIMESTAMP_NEW:
1529                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1530                 break;
1531
1532         case SO_TIMESTAMPNS_NEW:
1533                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1534                 break;
1535
1536         case SO_TIMESTAMPING_OLD:
1537                 lv = sizeof(v.timestamping);
1538                 v.timestamping.flags = sk->sk_tsflags;
1539                 v.timestamping.bind_phc = sk->sk_bind_phc;
1540                 break;
1541
1542         case SO_RCVTIMEO_OLD:
1543         case SO_RCVTIMEO_NEW:
1544                 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1545                 break;
1546
1547         case SO_SNDTIMEO_OLD:
1548         case SO_SNDTIMEO_NEW:
1549                 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1550                 break;
1551
1552         case SO_RCVLOWAT:
1553                 v.val = READ_ONCE(sk->sk_rcvlowat);
1554                 break;
1555
1556         case SO_SNDLOWAT:
1557                 v.val = 1;
1558                 break;
1559
1560         case SO_PASSCRED:
1561                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1562                 break;
1563
1564         case SO_PEERCRED:
1565         {
1566                 struct ucred peercred;
1567                 if (len > sizeof(peercred))
1568                         len = sizeof(peercred);
1569
1570                 spin_lock(&sk->sk_peer_lock);
1571                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1572                 spin_unlock(&sk->sk_peer_lock);
1573
1574                 if (copy_to_user(optval, &peercred, len))
1575                         return -EFAULT;
1576                 goto lenout;
1577         }
1578
1579         case SO_PEERGROUPS:
1580         {
1581                 const struct cred *cred;
1582                 int ret, n;
1583
1584                 cred = sk_get_peer_cred(sk);
1585                 if (!cred)
1586                         return -ENODATA;
1587
1588                 n = cred->group_info->ngroups;
1589                 if (len < n * sizeof(gid_t)) {
1590                         len = n * sizeof(gid_t);
1591                         put_cred(cred);
1592                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1593                 }
1594                 len = n * sizeof(gid_t);
1595
1596                 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1597                 put_cred(cred);
1598                 if (ret)
1599                         return ret;
1600                 goto lenout;
1601         }
1602
1603         case SO_PEERNAME:
1604         {
1605                 char address[128];
1606
1607                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1608                 if (lv < 0)
1609                         return -ENOTCONN;
1610                 if (lv < len)
1611                         return -EINVAL;
1612                 if (copy_to_user(optval, address, len))
1613                         return -EFAULT;
1614                 goto lenout;
1615         }
1616
1617         /* Dubious BSD thing... Probably nobody even uses it, but
1618          * the UNIX standard wants it for whatever reason... -DaveM
1619          */
1620         case SO_ACCEPTCONN:
1621                 v.val = sk->sk_state == TCP_LISTEN;
1622                 break;
1623
1624         case SO_PASSSEC:
1625                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1626                 break;
1627
1628         case SO_PEERSEC:
1629                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1630
1631         case SO_MARK:
1632                 v.val = sk->sk_mark;
1633                 break;
1634
1635         case SO_RXQ_OVFL:
1636                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1637                 break;
1638
1639         case SO_WIFI_STATUS:
1640                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1641                 break;
1642
1643         case SO_PEEK_OFF:
1644                 if (!sock->ops->set_peek_off)
1645                         return -EOPNOTSUPP;
1646
1647                 v.val = READ_ONCE(sk->sk_peek_off);
1648                 break;
1649         case SO_NOFCS:
1650                 v.val = sock_flag(sk, SOCK_NOFCS);
1651                 break;
1652
1653         case SO_BINDTODEVICE:
1654                 return sock_getbindtodevice(sk, optval, optlen, len);
1655
1656         case SO_GET_FILTER:
1657                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1658                 if (len < 0)
1659                         return len;
1660
1661                 goto lenout;
1662
1663         case SO_LOCK_FILTER:
1664                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1665                 break;
1666
1667         case SO_BPF_EXTENSIONS:
1668                 v.val = bpf_tell_extensions();
1669                 break;
1670
1671         case SO_SELECT_ERR_QUEUE:
1672                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1673                 break;
1674
1675 #ifdef CONFIG_NET_RX_BUSY_POLL
1676         case SO_BUSY_POLL:
1677                 v.val = READ_ONCE(sk->sk_ll_usec);
1678                 break;
1679         case SO_PREFER_BUSY_POLL:
1680                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1681                 break;
1682 #endif
1683
1684         case SO_MAX_PACING_RATE:
1685                 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1686                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1687                         lv = sizeof(v.ulval);
1688                         v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1689                 } else {
1690                         /* 32bit version */
1691                         v.val = min_t(unsigned long, ~0U,
1692                                       READ_ONCE(sk->sk_max_pacing_rate));
1693                 }
1694                 break;
1695
1696         case SO_INCOMING_CPU:
1697                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1698                 break;
1699
1700         case SO_MEMINFO:
1701         {
1702                 u32 meminfo[SK_MEMINFO_VARS];
1703
1704                 sk_get_meminfo(sk, meminfo);
1705
1706                 len = min_t(unsigned int, len, sizeof(meminfo));
1707                 if (copy_to_user(optval, &meminfo, len))
1708                         return -EFAULT;
1709
1710                 goto lenout;
1711         }
1712
1713 #ifdef CONFIG_NET_RX_BUSY_POLL
1714         case SO_INCOMING_NAPI_ID:
1715                 v.val = READ_ONCE(sk->sk_napi_id);
1716
1717                 /* aggregate non-NAPI IDs down to 0 */
1718                 if (v.val < MIN_NAPI_ID)
1719                         v.val = 0;
1720
1721                 break;
1722 #endif
1723
1724         case SO_COOKIE:
1725                 lv = sizeof(u64);
1726                 if (len < lv)
1727                         return -EINVAL;
1728                 v.val64 = sock_gen_cookie(sk);
1729                 break;
1730
1731         case SO_ZEROCOPY:
1732                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1733                 break;
1734
1735         case SO_TXTIME:
1736                 lv = sizeof(v.txtime);
1737                 v.txtime.clockid = sk->sk_clockid;
1738                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1739                                   SOF_TXTIME_DEADLINE_MODE : 0;
1740                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1741                                   SOF_TXTIME_REPORT_ERRORS : 0;
1742                 break;
1743
1744         case SO_BINDTOIFINDEX:
1745                 v.val = sk->sk_bound_dev_if;
1746                 break;
1747
1748         case SO_NETNS_COOKIE:
1749                 lv = sizeof(u64);
1750                 if (len != lv)
1751                         return -EINVAL;
1752                 v.val64 = sock_net(sk)->net_cookie;
1753                 break;
1754
1755         case SO_BUF_LOCK:
1756                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1757                 break;
1758
1759         default:
1760                 /* We implement the SO_SNDLOWAT etc to not be settable
1761                  * (1003.1g 7).
1762                  */
1763                 return -ENOPROTOOPT;
1764         }
1765
1766         if (len > lv)
1767                 len = lv;
1768         if (copy_to_user(optval, &v, len))
1769                 return -EFAULT;
1770 lenout:
1771         if (put_user(len, optlen))
1772                 return -EFAULT;
1773         return 0;
1774 }
1775
1776 /*
1777  * Initialize an sk_lock.
1778  *
1779  * (We also register the sk_lock with the lock validator.)
1780  */
1781 static inline void sock_lock_init(struct sock *sk)
1782 {
1783         if (sk->sk_kern_sock)
1784                 sock_lock_init_class_and_name(
1785                         sk,
1786                         af_family_kern_slock_key_strings[sk->sk_family],
1787                         af_family_kern_slock_keys + sk->sk_family,
1788                         af_family_kern_key_strings[sk->sk_family],
1789                         af_family_kern_keys + sk->sk_family);
1790         else
1791                 sock_lock_init_class_and_name(
1792                         sk,
1793                         af_family_slock_key_strings[sk->sk_family],
1794                         af_family_slock_keys + sk->sk_family,
1795                         af_family_key_strings[sk->sk_family],
1796                         af_family_keys + sk->sk_family);
1797 }
1798
1799 /*
1800  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1801  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1802  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1803  */
1804 static void sock_copy(struct sock *nsk, const struct sock *osk)
1805 {
1806         const struct proto *prot = READ_ONCE(osk->sk_prot);
1807 #ifdef CONFIG_SECURITY_NETWORK
1808         void *sptr = nsk->sk_security;
1809 #endif
1810
1811         /* If we move sk_tx_queue_mapping out of the private section,
1812          * we must check if sk_tx_queue_clear() is called after
1813          * sock_copy() in sk_clone_lock().
1814          */
1815         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1816                      offsetof(struct sock, sk_dontcopy_begin) ||
1817                      offsetof(struct sock, sk_tx_queue_mapping) >=
1818                      offsetof(struct sock, sk_dontcopy_end));
1819
1820         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1821
1822         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1823                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1824
1825 #ifdef CONFIG_SECURITY_NETWORK
1826         nsk->sk_security = sptr;
1827         security_sk_clone(osk, nsk);
1828 #endif
1829 }
1830
1831 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1832                 int family)
1833 {
1834         struct sock *sk;
1835         struct kmem_cache *slab;
1836
1837         slab = prot->slab;
1838         if (slab != NULL) {
1839                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1840                 if (!sk)
1841                         return sk;
1842                 if (want_init_on_alloc(priority))
1843                         sk_prot_clear_nulls(sk, prot->obj_size);
1844         } else
1845                 sk = kmalloc(prot->obj_size, priority);
1846
1847         if (sk != NULL) {
1848                 if (security_sk_alloc(sk, family, priority))
1849                         goto out_free;
1850
1851                 if (!try_module_get(prot->owner))
1852                         goto out_free_sec;
1853         }
1854
1855         return sk;
1856
1857 out_free_sec:
1858         security_sk_free(sk);
1859 out_free:
1860         if (slab != NULL)
1861                 kmem_cache_free(slab, sk);
1862         else
1863                 kfree(sk);
1864         return NULL;
1865 }
1866
1867 static void sk_prot_free(struct proto *prot, struct sock *sk)
1868 {
1869         struct kmem_cache *slab;
1870         struct module *owner;
1871
1872         owner = prot->owner;
1873         slab = prot->slab;
1874
1875         cgroup_sk_free(&sk->sk_cgrp_data);
1876         mem_cgroup_sk_free(sk);
1877         security_sk_free(sk);
1878         if (slab != NULL)
1879                 kmem_cache_free(slab, sk);
1880         else
1881                 kfree(sk);
1882         module_put(owner);
1883 }
1884
1885 /**
1886  *      sk_alloc - All socket objects are allocated here
1887  *      @net: the applicable net namespace
1888  *      @family: protocol family
1889  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1890  *      @prot: struct proto associated with this new sock instance
1891  *      @kern: is this to be a kernel socket?
1892  */
1893 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1894                       struct proto *prot, int kern)
1895 {
1896         struct sock *sk;
1897
1898         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1899         if (sk) {
1900                 sk->sk_family = family;
1901                 /*
1902                  * See comment in struct sock definition to understand
1903                  * why we need sk_prot_creator -acme
1904                  */
1905                 sk->sk_prot = sk->sk_prot_creator = prot;
1906                 sk->sk_kern_sock = kern;
1907                 sock_lock_init(sk);
1908                 sk->sk_net_refcnt = kern ? 0 : 1;
1909                 if (likely(sk->sk_net_refcnt)) {
1910                         get_net(net);
1911                         sock_inuse_add(net, 1);
1912                 }
1913
1914                 sock_net_set(sk, net);
1915                 refcount_set(&sk->sk_wmem_alloc, 1);
1916
1917                 mem_cgroup_sk_alloc(sk);
1918                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1919                 sock_update_classid(&sk->sk_cgrp_data);
1920                 sock_update_netprioidx(&sk->sk_cgrp_data);
1921                 sk_tx_queue_clear(sk);
1922         }
1923
1924         return sk;
1925 }
1926 EXPORT_SYMBOL(sk_alloc);
1927
1928 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1929  * grace period. This is the case for UDP sockets and TCP listeners.
1930  */
1931 static void __sk_destruct(struct rcu_head *head)
1932 {
1933         struct sock *sk = container_of(head, struct sock, sk_rcu);
1934         struct sk_filter *filter;
1935
1936         if (sk->sk_destruct)
1937                 sk->sk_destruct(sk);
1938
1939         filter = rcu_dereference_check(sk->sk_filter,
1940                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1941         if (filter) {
1942                 sk_filter_uncharge(sk, filter);
1943                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1944         }
1945
1946         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1947
1948 #ifdef CONFIG_BPF_SYSCALL
1949         bpf_sk_storage_free(sk);
1950 #endif
1951
1952         if (atomic_read(&sk->sk_omem_alloc))
1953                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1954                          __func__, atomic_read(&sk->sk_omem_alloc));
1955
1956         if (sk->sk_frag.page) {
1957                 put_page(sk->sk_frag.page);
1958                 sk->sk_frag.page = NULL;
1959         }
1960
1961         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1962         put_cred(sk->sk_peer_cred);
1963         put_pid(sk->sk_peer_pid);
1964
1965         if (likely(sk->sk_net_refcnt))
1966                 put_net(sock_net(sk));
1967         sk_prot_free(sk->sk_prot_creator, sk);
1968 }
1969
1970 void sk_destruct(struct sock *sk)
1971 {
1972         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1973
1974         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1975                 reuseport_detach_sock(sk);
1976                 use_call_rcu = true;
1977         }
1978
1979         if (use_call_rcu)
1980                 call_rcu(&sk->sk_rcu, __sk_destruct);
1981         else
1982                 __sk_destruct(&sk->sk_rcu);
1983 }
1984
1985 static void __sk_free(struct sock *sk)
1986 {
1987         if (likely(sk->sk_net_refcnt))
1988                 sock_inuse_add(sock_net(sk), -1);
1989
1990         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1991                 sock_diag_broadcast_destroy(sk);
1992         else
1993                 sk_destruct(sk);
1994 }
1995
1996 void sk_free(struct sock *sk)
1997 {
1998         /*
1999          * We subtract one from sk_wmem_alloc and can know if
2000          * some packets are still in some tx queue.
2001          * If not null, sock_wfree() will call __sk_free(sk) later
2002          */
2003         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2004                 __sk_free(sk);
2005 }
2006 EXPORT_SYMBOL(sk_free);
2007
2008 static void sk_init_common(struct sock *sk)
2009 {
2010         skb_queue_head_init(&sk->sk_receive_queue);
2011         skb_queue_head_init(&sk->sk_write_queue);
2012         skb_queue_head_init(&sk->sk_error_queue);
2013
2014         rwlock_init(&sk->sk_callback_lock);
2015         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2016                         af_rlock_keys + sk->sk_family,
2017                         af_family_rlock_key_strings[sk->sk_family]);
2018         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2019                         af_wlock_keys + sk->sk_family,
2020                         af_family_wlock_key_strings[sk->sk_family]);
2021         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2022                         af_elock_keys + sk->sk_family,
2023                         af_family_elock_key_strings[sk->sk_family]);
2024         lockdep_set_class_and_name(&sk->sk_callback_lock,
2025                         af_callback_keys + sk->sk_family,
2026                         af_family_clock_key_strings[sk->sk_family]);
2027 }
2028
2029 /**
2030  *      sk_clone_lock - clone a socket, and lock its clone
2031  *      @sk: the socket to clone
2032  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2033  *
2034  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2035  */
2036 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2037 {
2038         struct proto *prot = READ_ONCE(sk->sk_prot);
2039         struct sk_filter *filter;
2040         bool is_charged = true;
2041         struct sock *newsk;
2042
2043         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2044         if (!newsk)
2045                 goto out;
2046
2047         sock_copy(newsk, sk);
2048
2049         newsk->sk_prot_creator = prot;
2050
2051         /* SANITY */
2052         if (likely(newsk->sk_net_refcnt)) {
2053                 get_net(sock_net(newsk));
2054                 sock_inuse_add(sock_net(newsk), 1);
2055         }
2056         sk_node_init(&newsk->sk_node);
2057         sock_lock_init(newsk);
2058         bh_lock_sock(newsk);
2059         newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2060         newsk->sk_backlog.len = 0;
2061
2062         atomic_set(&newsk->sk_rmem_alloc, 0);
2063
2064         /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2065         refcount_set(&newsk->sk_wmem_alloc, 1);
2066
2067         atomic_set(&newsk->sk_omem_alloc, 0);
2068         sk_init_common(newsk);
2069
2070         newsk->sk_dst_cache     = NULL;
2071         newsk->sk_dst_pending_confirm = 0;
2072         newsk->sk_wmem_queued   = 0;
2073         newsk->sk_forward_alloc = 0;
2074         atomic_set(&newsk->sk_drops, 0);
2075         newsk->sk_send_head     = NULL;
2076         newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2077         atomic_set(&newsk->sk_zckey, 0);
2078
2079         sock_reset_flag(newsk, SOCK_DONE);
2080
2081         /* sk->sk_memcg will be populated at accept() time */
2082         newsk->sk_memcg = NULL;
2083
2084         cgroup_sk_clone(&newsk->sk_cgrp_data);
2085
2086         rcu_read_lock();
2087         filter = rcu_dereference(sk->sk_filter);
2088         if (filter != NULL)
2089                 /* though it's an empty new sock, the charging may fail
2090                  * if sysctl_optmem_max was changed between creation of
2091                  * original socket and cloning
2092                  */
2093                 is_charged = sk_filter_charge(newsk, filter);
2094         RCU_INIT_POINTER(newsk->sk_filter, filter);
2095         rcu_read_unlock();
2096
2097         if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2098                 /* We need to make sure that we don't uncharge the new
2099                  * socket if we couldn't charge it in the first place
2100                  * as otherwise we uncharge the parent's filter.
2101                  */
2102                 if (!is_charged)
2103                         RCU_INIT_POINTER(newsk->sk_filter, NULL);
2104                 sk_free_unlock_clone(newsk);
2105                 newsk = NULL;
2106                 goto out;
2107         }
2108         RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2109
2110         if (bpf_sk_storage_clone(sk, newsk)) {
2111                 sk_free_unlock_clone(newsk);
2112                 newsk = NULL;
2113                 goto out;
2114         }
2115
2116         /* Clear sk_user_data if parent had the pointer tagged
2117          * as not suitable for copying when cloning.
2118          */
2119         if (sk_user_data_is_nocopy(newsk))
2120                 newsk->sk_user_data = NULL;
2121
2122         newsk->sk_err      = 0;
2123         newsk->sk_err_soft = 0;
2124         newsk->sk_priority = 0;
2125         newsk->sk_incoming_cpu = raw_smp_processor_id();
2126
2127         /* Before updating sk_refcnt, we must commit prior changes to memory
2128          * (Documentation/RCU/rculist_nulls.rst for details)
2129          */
2130         smp_wmb();
2131         refcount_set(&newsk->sk_refcnt, 2);
2132
2133         /* Increment the counter in the same struct proto as the master
2134          * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2135          * is the same as sk->sk_prot->socks, as this field was copied
2136          * with memcpy).
2137          *
2138          * This _changes_ the previous behaviour, where
2139          * tcp_create_openreq_child always was incrementing the
2140          * equivalent to tcp_prot->socks (inet_sock_nr), so this have
2141          * to be taken into account in all callers. -acme
2142          */
2143         sk_refcnt_debug_inc(newsk);
2144         sk_set_socket(newsk, NULL);
2145         sk_tx_queue_clear(newsk);
2146         RCU_INIT_POINTER(newsk->sk_wq, NULL);
2147
2148         if (newsk->sk_prot->sockets_allocated)
2149                 sk_sockets_allocated_inc(newsk);
2150
2151         if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2152                 net_enable_timestamp();
2153 out:
2154         return newsk;
2155 }
2156 EXPORT_SYMBOL_GPL(sk_clone_lock);
2157
2158 void sk_free_unlock_clone(struct sock *sk)
2159 {
2160         /* It is still raw copy of parent, so invalidate
2161          * destructor and make plain sk_free() */
2162         sk->sk_destruct = NULL;
2163         bh_unlock_sock(sk);
2164         sk_free(sk);
2165 }
2166 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2167
2168 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2169 {
2170         u32 max_segs = 1;
2171
2172         sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2173         if (sk->sk_route_caps & NETIF_F_GSO)
2174                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2175         sk->sk_route_caps &= ~sk->sk_route_nocaps;
2176         if (sk_can_gso(sk)) {
2177                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2178                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2179                 } else {
2180                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2181                         sk->sk_gso_max_size = dst->dev->gso_max_size;
2182                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2183                 }
2184         }
2185         sk->sk_gso_max_segs = max_segs;
2186         sk_dst_set(sk, dst);
2187 }
2188 EXPORT_SYMBOL_GPL(sk_setup_caps);
2189
2190 /*
2191  *      Simple resource managers for sockets.
2192  */
2193
2194
2195 /*
2196  * Write buffer destructor automatically called from kfree_skb.
2197  */
2198 void sock_wfree(struct sk_buff *skb)
2199 {
2200         struct sock *sk = skb->sk;
2201         unsigned int len = skb->truesize;
2202
2203         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2204                 /*
2205                  * Keep a reference on sk_wmem_alloc, this will be released
2206                  * after sk_write_space() call
2207                  */
2208                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2209                 sk->sk_write_space(sk);
2210                 len = 1;
2211         }
2212         /*
2213          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2214          * could not do because of in-flight packets
2215          */
2216         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2217                 __sk_free(sk);
2218 }
2219 EXPORT_SYMBOL(sock_wfree);
2220
2221 /* This variant of sock_wfree() is used by TCP,
2222  * since it sets SOCK_USE_WRITE_QUEUE.
2223  */
2224 void __sock_wfree(struct sk_buff *skb)
2225 {
2226         struct sock *sk = skb->sk;
2227
2228         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2229                 __sk_free(sk);
2230 }
2231
2232 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2233 {
2234         skb_orphan(skb);
2235         skb->sk = sk;
2236 #ifdef CONFIG_INET
2237         if (unlikely(!sk_fullsock(sk))) {
2238                 skb->destructor = sock_edemux;
2239                 sock_hold(sk);
2240                 return;
2241         }
2242 #endif
2243         skb->destructor = sock_wfree;
2244         skb_set_hash_from_sk(skb, sk);
2245         /*
2246          * We used to take a refcount on sk, but following operation
2247          * is enough to guarantee sk_free() wont free this sock until
2248          * all in-flight packets are completed
2249          */
2250         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2251 }
2252 EXPORT_SYMBOL(skb_set_owner_w);
2253
2254 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2255 {
2256 #ifdef CONFIG_TLS_DEVICE
2257         /* Drivers depend on in-order delivery for crypto offload,
2258          * partial orphan breaks out-of-order-OK logic.
2259          */
2260         if (skb->decrypted)
2261                 return false;
2262 #endif
2263         return (skb->destructor == sock_wfree ||
2264                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2265 }
2266
2267 /* This helper is used by netem, as it can hold packets in its
2268  * delay queue. We want to allow the owner socket to send more
2269  * packets, as if they were already TX completed by a typical driver.
2270  * But we also want to keep skb->sk set because some packet schedulers
2271  * rely on it (sch_fq for example).
2272  */
2273 void skb_orphan_partial(struct sk_buff *skb)
2274 {
2275         if (skb_is_tcp_pure_ack(skb))
2276                 return;
2277
2278         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2279                 return;
2280
2281         skb_orphan(skb);
2282 }
2283 EXPORT_SYMBOL(skb_orphan_partial);
2284
2285 /*
2286  * Read buffer destructor automatically called from kfree_skb.
2287  */
2288 void sock_rfree(struct sk_buff *skb)
2289 {
2290         struct sock *sk = skb->sk;
2291         unsigned int len = skb->truesize;
2292
2293         atomic_sub(len, &sk->sk_rmem_alloc);
2294         sk_mem_uncharge(sk, len);
2295 }
2296 EXPORT_SYMBOL(sock_rfree);
2297
2298 /*
2299  * Buffer destructor for skbs that are not used directly in read or write
2300  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2301  */
2302 void sock_efree(struct sk_buff *skb)
2303 {
2304         sock_put(skb->sk);
2305 }
2306 EXPORT_SYMBOL(sock_efree);
2307
2308 /* Buffer destructor for prefetch/receive path where reference count may
2309  * not be held, e.g. for listen sockets.
2310  */
2311 #ifdef CONFIG_INET
2312 void sock_pfree(struct sk_buff *skb)
2313 {
2314         if (sk_is_refcounted(skb->sk))
2315                 sock_gen_put(skb->sk);
2316 }
2317 EXPORT_SYMBOL(sock_pfree);
2318 #endif /* CONFIG_INET */
2319
2320 kuid_t sock_i_uid(struct sock *sk)
2321 {
2322         kuid_t uid;
2323
2324         read_lock_bh(&sk->sk_callback_lock);
2325         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2326         read_unlock_bh(&sk->sk_callback_lock);
2327         return uid;
2328 }
2329 EXPORT_SYMBOL(sock_i_uid);
2330
2331 unsigned long __sock_i_ino(struct sock *sk)
2332 {
2333         unsigned long ino;
2334
2335         read_lock(&sk->sk_callback_lock);
2336         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2337         read_unlock(&sk->sk_callback_lock);
2338         return ino;
2339 }
2340 EXPORT_SYMBOL(__sock_i_ino);
2341
2342 unsigned long sock_i_ino(struct sock *sk)
2343 {
2344         unsigned long ino;
2345
2346         local_bh_disable();
2347         ino = __sock_i_ino(sk);
2348         local_bh_enable();
2349         return ino;
2350 }
2351 EXPORT_SYMBOL(sock_i_ino);
2352
2353 /*
2354  * Allocate a skb from the socket's send buffer.
2355  */
2356 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2357                              gfp_t priority)
2358 {
2359         if (force ||
2360             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2361                 struct sk_buff *skb = alloc_skb(size, priority);
2362
2363                 if (skb) {
2364                         skb_set_owner_w(skb, sk);
2365                         return skb;
2366                 }
2367         }
2368         return NULL;
2369 }
2370 EXPORT_SYMBOL(sock_wmalloc);
2371
2372 static void sock_ofree(struct sk_buff *skb)
2373 {
2374         struct sock *sk = skb->sk;
2375
2376         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2377 }
2378
2379 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2380                              gfp_t priority)
2381 {
2382         struct sk_buff *skb;
2383
2384         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2385         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2386             READ_ONCE(sysctl_optmem_max))
2387                 return NULL;
2388
2389         skb = alloc_skb(size, priority);
2390         if (!skb)
2391                 return NULL;
2392
2393         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2394         skb->sk = sk;
2395         skb->destructor = sock_ofree;
2396         return skb;
2397 }
2398
2399 /*
2400  * Allocate a memory block from the socket's option memory buffer.
2401  */
2402 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2403 {
2404         int optmem_max = READ_ONCE(sysctl_optmem_max);
2405
2406         if ((unsigned int)size <= optmem_max &&
2407             atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2408                 void *mem;
2409                 /* First do the add, to avoid the race if kmalloc
2410                  * might sleep.
2411                  */
2412                 atomic_add(size, &sk->sk_omem_alloc);
2413                 mem = kmalloc(size, priority);
2414                 if (mem)
2415                         return mem;
2416                 atomic_sub(size, &sk->sk_omem_alloc);
2417         }
2418         return NULL;
2419 }
2420 EXPORT_SYMBOL(sock_kmalloc);
2421
2422 /* Free an option memory block. Note, we actually want the inline
2423  * here as this allows gcc to detect the nullify and fold away the
2424  * condition entirely.
2425  */
2426 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2427                                   const bool nullify)
2428 {
2429         if (WARN_ON_ONCE(!mem))
2430                 return;
2431         if (nullify)
2432                 kfree_sensitive(mem);
2433         else
2434                 kfree(mem);
2435         atomic_sub(size, &sk->sk_omem_alloc);
2436 }
2437
2438 void sock_kfree_s(struct sock *sk, void *mem, int size)
2439 {
2440         __sock_kfree_s(sk, mem, size, false);
2441 }
2442 EXPORT_SYMBOL(sock_kfree_s);
2443
2444 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2445 {
2446         __sock_kfree_s(sk, mem, size, true);
2447 }
2448 EXPORT_SYMBOL(sock_kzfree_s);
2449
2450 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2451    I think, these locks should be removed for datagram sockets.
2452  */
2453 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2454 {
2455         DEFINE_WAIT(wait);
2456
2457         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2458         for (;;) {
2459                 if (!timeo)
2460                         break;
2461                 if (signal_pending(current))
2462                         break;
2463                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2464                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2465                 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2466                         break;
2467                 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2468                         break;
2469                 if (READ_ONCE(sk->sk_err))
2470                         break;
2471                 timeo = schedule_timeout(timeo);
2472         }
2473         finish_wait(sk_sleep(sk), &wait);
2474         return timeo;
2475 }
2476
2477
2478 /*
2479  *      Generic send/receive buffer handlers
2480  */
2481
2482 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2483                                      unsigned long data_len, int noblock,
2484                                      int *errcode, int max_page_order)
2485 {
2486         struct sk_buff *skb;
2487         long timeo;
2488         int err;
2489
2490         timeo = sock_sndtimeo(sk, noblock);
2491         for (;;) {
2492                 err = sock_error(sk);
2493                 if (err != 0)
2494                         goto failure;
2495
2496                 err = -EPIPE;
2497                 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2498                         goto failure;
2499
2500                 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2501                         break;
2502
2503                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2504                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2505                 err = -EAGAIN;
2506                 if (!timeo)
2507                         goto failure;
2508                 if (signal_pending(current))
2509                         goto interrupted;
2510                 timeo = sock_wait_for_wmem(sk, timeo);
2511         }
2512         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2513                                    errcode, sk->sk_allocation);
2514         if (skb)
2515                 skb_set_owner_w(skb, sk);
2516         return skb;
2517
2518 interrupted:
2519         err = sock_intr_errno(timeo);
2520 failure:
2521         *errcode = err;
2522         return NULL;
2523 }
2524 EXPORT_SYMBOL(sock_alloc_send_pskb);
2525
2526 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2527                                     int noblock, int *errcode)
2528 {
2529         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2530 }
2531 EXPORT_SYMBOL(sock_alloc_send_skb);
2532
2533 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2534                      struct sockcm_cookie *sockc)
2535 {
2536         u32 tsflags;
2537
2538         switch (cmsg->cmsg_type) {
2539         case SO_MARK:
2540                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2541                         return -EPERM;
2542                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2543                         return -EINVAL;
2544                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2545                 break;
2546         case SO_TIMESTAMPING_OLD:
2547                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2548                         return -EINVAL;
2549
2550                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2551                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2552                         return -EINVAL;
2553
2554                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2555                 sockc->tsflags |= tsflags;
2556                 break;
2557         case SCM_TXTIME:
2558                 if (!sock_flag(sk, SOCK_TXTIME))
2559                         return -EINVAL;
2560                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2561                         return -EINVAL;
2562                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2563                 break;
2564         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2565         case SCM_RIGHTS:
2566         case SCM_CREDENTIALS:
2567                 break;
2568         default:
2569                 return -EINVAL;
2570         }
2571         return 0;
2572 }
2573 EXPORT_SYMBOL(__sock_cmsg_send);
2574
2575 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2576                    struct sockcm_cookie *sockc)
2577 {
2578         struct cmsghdr *cmsg;
2579         int ret;
2580
2581         for_each_cmsghdr(cmsg, msg) {
2582                 if (!CMSG_OK(msg, cmsg))
2583                         return -EINVAL;
2584                 if (cmsg->cmsg_level != SOL_SOCKET)
2585                         continue;
2586                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2587                 if (ret)
2588                         return ret;
2589         }
2590         return 0;
2591 }
2592 EXPORT_SYMBOL(sock_cmsg_send);
2593
2594 static void sk_enter_memory_pressure(struct sock *sk)
2595 {
2596         if (!sk->sk_prot->enter_memory_pressure)
2597                 return;
2598
2599         sk->sk_prot->enter_memory_pressure(sk);
2600 }
2601
2602 static void sk_leave_memory_pressure(struct sock *sk)
2603 {
2604         if (sk->sk_prot->leave_memory_pressure) {
2605                 sk->sk_prot->leave_memory_pressure(sk);
2606         } else {
2607                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2608
2609                 if (memory_pressure && READ_ONCE(*memory_pressure))
2610                         WRITE_ONCE(*memory_pressure, 0);
2611         }
2612 }
2613
2614 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2615
2616 /**
2617  * skb_page_frag_refill - check that a page_frag contains enough room
2618  * @sz: minimum size of the fragment we want to get
2619  * @pfrag: pointer to page_frag
2620  * @gfp: priority for memory allocation
2621  *
2622  * Note: While this allocator tries to use high order pages, there is
2623  * no guarantee that allocations succeed. Therefore, @sz MUST be
2624  * less or equal than PAGE_SIZE.
2625  */
2626 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2627 {
2628         if (pfrag->page) {
2629                 if (page_ref_count(pfrag->page) == 1) {
2630                         pfrag->offset = 0;
2631                         return true;
2632                 }
2633                 if (pfrag->offset + sz <= pfrag->size)
2634                         return true;
2635                 put_page(pfrag->page);
2636         }
2637
2638         pfrag->offset = 0;
2639         if (SKB_FRAG_PAGE_ORDER &&
2640             !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2641                 /* Avoid direct reclaim but allow kswapd to wake */
2642                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2643                                           __GFP_COMP | __GFP_NOWARN |
2644                                           __GFP_NORETRY,
2645                                           SKB_FRAG_PAGE_ORDER);
2646                 if (likely(pfrag->page)) {
2647                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2648                         return true;
2649                 }
2650         }
2651         pfrag->page = alloc_page(gfp);
2652         if (likely(pfrag->page)) {
2653                 pfrag->size = PAGE_SIZE;
2654                 return true;
2655         }
2656         return false;
2657 }
2658 EXPORT_SYMBOL(skb_page_frag_refill);
2659
2660 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2661 {
2662         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2663                 return true;
2664
2665         sk_enter_memory_pressure(sk);
2666         sk_stream_moderate_sndbuf(sk);
2667         return false;
2668 }
2669 EXPORT_SYMBOL(sk_page_frag_refill);
2670
2671 void __lock_sock(struct sock *sk)
2672         __releases(&sk->sk_lock.slock)
2673         __acquires(&sk->sk_lock.slock)
2674 {
2675         DEFINE_WAIT(wait);
2676
2677         for (;;) {
2678                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2679                                         TASK_UNINTERRUPTIBLE);
2680                 spin_unlock_bh(&sk->sk_lock.slock);
2681                 schedule();
2682                 spin_lock_bh(&sk->sk_lock.slock);
2683                 if (!sock_owned_by_user(sk))
2684                         break;
2685         }
2686         finish_wait(&sk->sk_lock.wq, &wait);
2687 }
2688
2689 void __release_sock(struct sock *sk)
2690         __releases(&sk->sk_lock.slock)
2691         __acquires(&sk->sk_lock.slock)
2692 {
2693         struct sk_buff *skb, *next;
2694
2695         while ((skb = sk->sk_backlog.head) != NULL) {
2696                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2697
2698                 spin_unlock_bh(&sk->sk_lock.slock);
2699
2700                 do {
2701                         next = skb->next;
2702                         prefetch(next);
2703                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2704                         skb_mark_not_on_list(skb);
2705                         sk_backlog_rcv(sk, skb);
2706
2707                         cond_resched();
2708
2709                         skb = next;
2710                 } while (skb != NULL);
2711
2712                 spin_lock_bh(&sk->sk_lock.slock);
2713         }
2714
2715         /*
2716          * Doing the zeroing here guarantee we can not loop forever
2717          * while a wild producer attempts to flood us.
2718          */
2719         sk->sk_backlog.len = 0;
2720 }
2721
2722 void __sk_flush_backlog(struct sock *sk)
2723 {
2724         spin_lock_bh(&sk->sk_lock.slock);
2725         __release_sock(sk);
2726         spin_unlock_bh(&sk->sk_lock.slock);
2727 }
2728
2729 /**
2730  * sk_wait_data - wait for data to arrive at sk_receive_queue
2731  * @sk:    sock to wait on
2732  * @timeo: for how long
2733  * @skb:   last skb seen on sk_receive_queue
2734  *
2735  * Now socket state including sk->sk_err is changed only under lock,
2736  * hence we may omit checks after joining wait queue.
2737  * We check receive queue before schedule() only as optimization;
2738  * it is very likely that release_sock() added new data.
2739  */
2740 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2741 {
2742         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2743         int rc;
2744
2745         add_wait_queue(sk_sleep(sk), &wait);
2746         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2747         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2748         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2749         remove_wait_queue(sk_sleep(sk), &wait);
2750         return rc;
2751 }
2752 EXPORT_SYMBOL(sk_wait_data);
2753
2754 /**
2755  *      __sk_mem_raise_allocated - increase memory_allocated
2756  *      @sk: socket
2757  *      @size: memory size to allocate
2758  *      @amt: pages to allocate
2759  *      @kind: allocation type
2760  *
2761  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2762  */
2763 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2764 {
2765         struct proto *prot = sk->sk_prot;
2766         long allocated = sk_memory_allocated_add(sk, amt);
2767         bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2768         bool charged = true;
2769
2770         if (memcg_charge &&
2771             !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2772                                                 gfp_memcg_charge())))
2773                 goto suppress_allocation;
2774
2775         /* Under limit. */
2776         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2777                 sk_leave_memory_pressure(sk);
2778                 return 1;
2779         }
2780
2781         /* Under pressure. */
2782         if (allocated > sk_prot_mem_limits(sk, 1))
2783                 sk_enter_memory_pressure(sk);
2784
2785         /* Over hard limit. */
2786         if (allocated > sk_prot_mem_limits(sk, 2))
2787                 goto suppress_allocation;
2788
2789         /* guarantee minimum buffer size under pressure */
2790         if (kind == SK_MEM_RECV) {
2791                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2792                         return 1;
2793
2794         } else { /* SK_MEM_SEND */
2795                 int wmem0 = sk_get_wmem0(sk, prot);
2796
2797                 if (sk->sk_type == SOCK_STREAM) {
2798                         if (sk->sk_wmem_queued < wmem0)
2799                                 return 1;
2800                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2801                                 return 1;
2802                 }
2803         }
2804
2805         if (sk_has_memory_pressure(sk)) {
2806                 u64 alloc;
2807
2808                 if (!sk_under_memory_pressure(sk))
2809                         return 1;
2810                 alloc = sk_sockets_allocated_read_positive(sk);
2811                 if (sk_prot_mem_limits(sk, 2) > alloc *
2812                     sk_mem_pages(sk->sk_wmem_queued +
2813                                  atomic_read(&sk->sk_rmem_alloc) +
2814                                  sk->sk_forward_alloc))
2815                         return 1;
2816         }
2817
2818 suppress_allocation:
2819
2820         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2821                 sk_stream_moderate_sndbuf(sk);
2822
2823                 /* Fail only if socket is _under_ its sndbuf.
2824                  * In this case we cannot block, so that we have to fail.
2825                  */
2826                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2827                         /* Force charge with __GFP_NOFAIL */
2828                         if (memcg_charge && !charged) {
2829                                 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2830                                         gfp_memcg_charge() | __GFP_NOFAIL);
2831                         }
2832                         return 1;
2833                 }
2834         }
2835
2836         if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2837                 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2838
2839         sk_memory_allocated_sub(sk, amt);
2840
2841         if (memcg_charge && charged)
2842                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2843
2844         return 0;
2845 }
2846 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2847
2848 /**
2849  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2850  *      @sk: socket
2851  *      @size: memory size to allocate
2852  *      @kind: allocation type
2853  *
2854  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2855  *      rmem allocation. This function assumes that protocols which have
2856  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2857  */
2858 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2859 {
2860         int ret, amt = sk_mem_pages(size);
2861
2862         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2863         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2864         if (!ret)
2865                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2866         return ret;
2867 }
2868 EXPORT_SYMBOL(__sk_mem_schedule);
2869
2870 /**
2871  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2872  *      @sk: socket
2873  *      @amount: number of quanta
2874  *
2875  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2876  */
2877 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2878 {
2879         sk_memory_allocated_sub(sk, amount);
2880
2881         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2882                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2883
2884         if (sk_under_global_memory_pressure(sk) &&
2885             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2886                 sk_leave_memory_pressure(sk);
2887 }
2888 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2889
2890 /**
2891  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2892  *      @sk: socket
2893  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2894  */
2895 void __sk_mem_reclaim(struct sock *sk, int amount)
2896 {
2897         amount >>= SK_MEM_QUANTUM_SHIFT;
2898         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2899         __sk_mem_reduce_allocated(sk, amount);
2900 }
2901 EXPORT_SYMBOL(__sk_mem_reclaim);
2902
2903 int sk_set_peek_off(struct sock *sk, int val)
2904 {
2905         WRITE_ONCE(sk->sk_peek_off, val);
2906         return 0;
2907 }
2908 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2909
2910 /*
2911  * Set of default routines for initialising struct proto_ops when
2912  * the protocol does not support a particular function. In certain
2913  * cases where it makes no sense for a protocol to have a "do nothing"
2914  * function, some default processing is provided.
2915  */
2916
2917 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2918 {
2919         return -EOPNOTSUPP;
2920 }
2921 EXPORT_SYMBOL(sock_no_bind);
2922
2923 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2924                     int len, int flags)
2925 {
2926         return -EOPNOTSUPP;
2927 }
2928 EXPORT_SYMBOL(sock_no_connect);
2929
2930 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2931 {
2932         return -EOPNOTSUPP;
2933 }
2934 EXPORT_SYMBOL(sock_no_socketpair);
2935
2936 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2937                    bool kern)
2938 {
2939         return -EOPNOTSUPP;
2940 }
2941 EXPORT_SYMBOL(sock_no_accept);
2942
2943 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2944                     int peer)
2945 {
2946         return -EOPNOTSUPP;
2947 }
2948 EXPORT_SYMBOL(sock_no_getname);
2949
2950 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2951 {
2952         return -EOPNOTSUPP;
2953 }
2954 EXPORT_SYMBOL(sock_no_ioctl);
2955
2956 int sock_no_listen(struct socket *sock, int backlog)
2957 {
2958         return -EOPNOTSUPP;
2959 }
2960 EXPORT_SYMBOL(sock_no_listen);
2961
2962 int sock_no_shutdown(struct socket *sock, int how)
2963 {
2964         return -EOPNOTSUPP;
2965 }
2966 EXPORT_SYMBOL(sock_no_shutdown);
2967
2968 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2969 {
2970         return -EOPNOTSUPP;
2971 }
2972 EXPORT_SYMBOL(sock_no_sendmsg);
2973
2974 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2975 {
2976         return -EOPNOTSUPP;
2977 }
2978 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2979
2980 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2981                     int flags)
2982 {
2983         return -EOPNOTSUPP;
2984 }
2985 EXPORT_SYMBOL(sock_no_recvmsg);
2986
2987 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2988 {
2989         /* Mirror missing mmap method error code */
2990         return -ENODEV;
2991 }
2992 EXPORT_SYMBOL(sock_no_mmap);
2993
2994 /*
2995  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2996  * various sock-based usage counts.
2997  */
2998 void __receive_sock(struct file *file)
2999 {
3000         struct socket *sock;
3001
3002         sock = sock_from_file(file);
3003         if (sock) {
3004                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3005                 sock_update_classid(&sock->sk->sk_cgrp_data);
3006         }
3007 }
3008
3009 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3010 {
3011         ssize_t res;
3012         struct msghdr msg = {.msg_flags = flags};
3013         struct kvec iov;
3014         char *kaddr = kmap(page);
3015         iov.iov_base = kaddr + offset;
3016         iov.iov_len = size;
3017         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3018         kunmap(page);
3019         return res;
3020 }
3021 EXPORT_SYMBOL(sock_no_sendpage);
3022
3023 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3024                                 int offset, size_t size, int flags)
3025 {
3026         ssize_t res;
3027         struct msghdr msg = {.msg_flags = flags};
3028         struct kvec iov;
3029         char *kaddr = kmap(page);
3030
3031         iov.iov_base = kaddr + offset;
3032         iov.iov_len = size;
3033         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3034         kunmap(page);
3035         return res;
3036 }
3037 EXPORT_SYMBOL(sock_no_sendpage_locked);
3038
3039 /*
3040  *      Default Socket Callbacks
3041  */
3042
3043 static void sock_def_wakeup(struct sock *sk)
3044 {
3045         struct socket_wq *wq;
3046
3047         rcu_read_lock();
3048         wq = rcu_dereference(sk->sk_wq);
3049         if (skwq_has_sleeper(wq))
3050                 wake_up_interruptible_all(&wq->wait);
3051         rcu_read_unlock();
3052 }
3053
3054 static void sock_def_error_report(struct sock *sk)
3055 {
3056         struct socket_wq *wq;
3057
3058         rcu_read_lock();
3059         wq = rcu_dereference(sk->sk_wq);
3060         if (skwq_has_sleeper(wq))
3061                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3062         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3063         rcu_read_unlock();
3064 }
3065
3066 void sock_def_readable(struct sock *sk)
3067 {
3068         struct socket_wq *wq;
3069
3070         rcu_read_lock();
3071         wq = rcu_dereference(sk->sk_wq);
3072         if (skwq_has_sleeper(wq))
3073                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3074                                                 EPOLLRDNORM | EPOLLRDBAND);
3075         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3076         rcu_read_unlock();
3077 }
3078
3079 static void sock_def_write_space(struct sock *sk)
3080 {
3081         struct socket_wq *wq;
3082
3083         rcu_read_lock();
3084
3085         /* Do not wake up a writer until he can make "significant"
3086          * progress.  --DaveM
3087          */
3088         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3089                 wq = rcu_dereference(sk->sk_wq);
3090                 if (skwq_has_sleeper(wq))
3091                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3092                                                 EPOLLWRNORM | EPOLLWRBAND);
3093
3094                 /* Should agree with poll, otherwise some programs break */
3095                 if (sock_writeable(sk))
3096                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3097         }
3098
3099         rcu_read_unlock();
3100 }
3101
3102 static void sock_def_destruct(struct sock *sk)
3103 {
3104 }
3105
3106 void sk_send_sigurg(struct sock *sk)
3107 {
3108         if (sk->sk_socket && sk->sk_socket->file)
3109                 if (send_sigurg(&sk->sk_socket->file->f_owner))
3110                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3111 }
3112 EXPORT_SYMBOL(sk_send_sigurg);
3113
3114 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3115                     unsigned long expires)
3116 {
3117         if (!mod_timer(timer, expires))
3118                 sock_hold(sk);
3119 }
3120 EXPORT_SYMBOL(sk_reset_timer);
3121
3122 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3123 {
3124         if (del_timer(timer))
3125                 __sock_put(sk);
3126 }
3127 EXPORT_SYMBOL(sk_stop_timer);
3128
3129 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3130 {
3131         if (del_timer_sync(timer))
3132                 __sock_put(sk);
3133 }
3134 EXPORT_SYMBOL(sk_stop_timer_sync);
3135
3136 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3137 {
3138         sk_init_common(sk);
3139         sk->sk_send_head        =       NULL;
3140
3141         timer_setup(&sk->sk_timer, NULL, 0);
3142
3143         sk->sk_allocation       =       GFP_KERNEL;
3144         sk->sk_rcvbuf           =       READ_ONCE(sysctl_rmem_default);
3145         sk->sk_sndbuf           =       READ_ONCE(sysctl_wmem_default);
3146         sk->sk_state            =       TCP_CLOSE;
3147         sk_set_socket(sk, sock);
3148
3149         sock_set_flag(sk, SOCK_ZAPPED);
3150
3151         if (sock) {
3152                 sk->sk_type     =       sock->type;
3153                 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3154                 sock->sk        =       sk;
3155         } else {
3156                 RCU_INIT_POINTER(sk->sk_wq, NULL);
3157         }
3158         sk->sk_uid      =       uid;
3159
3160         rwlock_init(&sk->sk_callback_lock);
3161         if (sk->sk_kern_sock)
3162                 lockdep_set_class_and_name(
3163                         &sk->sk_callback_lock,
3164                         af_kern_callback_keys + sk->sk_family,
3165                         af_family_kern_clock_key_strings[sk->sk_family]);
3166         else
3167                 lockdep_set_class_and_name(
3168                         &sk->sk_callback_lock,
3169                         af_callback_keys + sk->sk_family,
3170                         af_family_clock_key_strings[sk->sk_family]);
3171
3172         sk->sk_state_change     =       sock_def_wakeup;
3173         sk->sk_data_ready       =       sock_def_readable;
3174         sk->sk_write_space      =       sock_def_write_space;
3175         sk->sk_error_report     =       sock_def_error_report;
3176         sk->sk_destruct         =       sock_def_destruct;
3177
3178         sk->sk_frag.page        =       NULL;
3179         sk->sk_frag.offset      =       0;
3180         sk->sk_peek_off         =       -1;
3181
3182         sk->sk_peer_pid         =       NULL;
3183         sk->sk_peer_cred        =       NULL;
3184         spin_lock_init(&sk->sk_peer_lock);
3185
3186         sk->sk_write_pending    =       0;
3187         sk->sk_rcvlowat         =       1;
3188         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3189         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3190
3191         sk->sk_stamp = SK_DEFAULT_STAMP;
3192 #if BITS_PER_LONG==32
3193         seqlock_init(&sk->sk_stamp_seq);
3194 #endif
3195         atomic_set(&sk->sk_zckey, 0);
3196
3197 #ifdef CONFIG_NET_RX_BUSY_POLL
3198         sk->sk_napi_id          =       0;
3199         sk->sk_ll_usec          =       READ_ONCE(sysctl_net_busy_read);
3200 #endif
3201
3202         sk->sk_max_pacing_rate = ~0UL;
3203         sk->sk_pacing_rate = ~0UL;
3204         WRITE_ONCE(sk->sk_pacing_shift, 10);
3205         sk->sk_incoming_cpu = -1;
3206
3207         sk_rx_queue_clear(sk);
3208         /*
3209          * Before updating sk_refcnt, we must commit prior changes to memory
3210          * (Documentation/RCU/rculist_nulls.rst for details)
3211          */
3212         smp_wmb();
3213         refcount_set(&sk->sk_refcnt, 1);
3214         atomic_set(&sk->sk_drops, 0);
3215 }
3216 EXPORT_SYMBOL(sock_init_data_uid);
3217
3218 void sock_init_data(struct socket *sock, struct sock *sk)
3219 {
3220         kuid_t uid = sock ?
3221                 SOCK_INODE(sock)->i_uid :
3222                 make_kuid(sock_net(sk)->user_ns, 0);
3223
3224         sock_init_data_uid(sock, sk, uid);
3225 }
3226 EXPORT_SYMBOL(sock_init_data);
3227
3228 void lock_sock_nested(struct sock *sk, int subclass)
3229 {
3230         /* The sk_lock has mutex_lock() semantics here. */
3231         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3232
3233         might_sleep();
3234         spin_lock_bh(&sk->sk_lock.slock);
3235         if (sk->sk_lock.owned)
3236                 __lock_sock(sk);
3237         sk->sk_lock.owned = 1;
3238         spin_unlock_bh(&sk->sk_lock.slock);
3239 }
3240 EXPORT_SYMBOL(lock_sock_nested);
3241
3242 void release_sock(struct sock *sk)
3243 {
3244         spin_lock_bh(&sk->sk_lock.slock);
3245         if (sk->sk_backlog.tail)
3246                 __release_sock(sk);
3247
3248         /* Warning : release_cb() might need to release sk ownership,
3249          * ie call sock_release_ownership(sk) before us.
3250          */
3251         if (sk->sk_prot->release_cb)
3252                 sk->sk_prot->release_cb(sk);
3253
3254         sock_release_ownership(sk);
3255         if (waitqueue_active(&sk->sk_lock.wq))
3256                 wake_up(&sk->sk_lock.wq);
3257         spin_unlock_bh(&sk->sk_lock.slock);
3258 }
3259 EXPORT_SYMBOL(release_sock);
3260
3261 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3262 {
3263         might_sleep();
3264         spin_lock_bh(&sk->sk_lock.slock);
3265
3266         if (!sk->sk_lock.owned) {
3267                 /*
3268                  * Fast path return with bottom halves disabled and
3269                  * sock::sk_lock.slock held.
3270                  *
3271                  * The 'mutex' is not contended and holding
3272                  * sock::sk_lock.slock prevents all other lockers to
3273                  * proceed so the corresponding unlock_sock_fast() can
3274                  * avoid the slow path of release_sock() completely and
3275                  * just release slock.
3276                  *
3277                  * From a semantical POV this is equivalent to 'acquiring'
3278                  * the 'mutex', hence the corresponding lockdep
3279                  * mutex_release() has to happen in the fast path of
3280                  * unlock_sock_fast().
3281                  */
3282                 return false;
3283         }
3284
3285         __lock_sock(sk);
3286         sk->sk_lock.owned = 1;
3287         __acquire(&sk->sk_lock.slock);
3288         spin_unlock_bh(&sk->sk_lock.slock);
3289         return true;
3290 }
3291 EXPORT_SYMBOL(__lock_sock_fast);
3292
3293 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3294                    bool timeval, bool time32)
3295 {
3296         struct sock *sk = sock->sk;
3297         struct timespec64 ts;
3298
3299         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3300         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3301         if (ts.tv_sec == -1)
3302                 return -ENOENT;
3303         if (ts.tv_sec == 0) {
3304                 ktime_t kt = ktime_get_real();
3305                 sock_write_timestamp(sk, kt);
3306                 ts = ktime_to_timespec64(kt);
3307         }
3308
3309         if (timeval)
3310                 ts.tv_nsec /= 1000;
3311
3312 #ifdef CONFIG_COMPAT_32BIT_TIME
3313         if (time32)
3314                 return put_old_timespec32(&ts, userstamp);
3315 #endif
3316 #ifdef CONFIG_SPARC64
3317         /* beware of padding in sparc64 timeval */
3318         if (timeval && !in_compat_syscall()) {
3319                 struct __kernel_old_timeval __user tv = {
3320                         .tv_sec = ts.tv_sec,
3321                         .tv_usec = ts.tv_nsec,
3322                 };
3323                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3324                         return -EFAULT;
3325                 return 0;
3326         }
3327 #endif
3328         return put_timespec64(&ts, userstamp);
3329 }
3330 EXPORT_SYMBOL(sock_gettstamp);
3331
3332 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3333 {
3334         if (!sock_flag(sk, flag)) {
3335                 unsigned long previous_flags = sk->sk_flags;
3336
3337                 sock_set_flag(sk, flag);
3338                 /*
3339                  * we just set one of the two flags which require net
3340                  * time stamping, but time stamping might have been on
3341                  * already because of the other one
3342                  */
3343                 if (sock_needs_netstamp(sk) &&
3344                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3345                         net_enable_timestamp();
3346         }
3347 }
3348
3349 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3350                        int level, int type)
3351 {
3352         struct sock_exterr_skb *serr;
3353         struct sk_buff *skb;
3354         int copied, err;
3355
3356         err = -EAGAIN;
3357         skb = sock_dequeue_err_skb(sk);
3358         if (skb == NULL)
3359                 goto out;
3360
3361         copied = skb->len;
3362         if (copied > len) {
3363                 msg->msg_flags |= MSG_TRUNC;
3364                 copied = len;
3365         }
3366         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3367         if (err)
3368                 goto out_free_skb;
3369
3370         sock_recv_timestamp(msg, sk, skb);
3371
3372         serr = SKB_EXT_ERR(skb);
3373         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3374
3375         msg->msg_flags |= MSG_ERRQUEUE;
3376         err = copied;
3377
3378 out_free_skb:
3379         kfree_skb(skb);
3380 out:
3381         return err;
3382 }
3383 EXPORT_SYMBOL(sock_recv_errqueue);
3384
3385 /*
3386  *      Get a socket option on an socket.
3387  *
3388  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3389  *      asynchronous errors should be reported by getsockopt. We assume
3390  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3391  */
3392 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3393                            char __user *optval, int __user *optlen)
3394 {
3395         struct sock *sk = sock->sk;
3396
3397         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3398 }
3399 EXPORT_SYMBOL(sock_common_getsockopt);
3400
3401 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3402                         int flags)
3403 {
3404         struct sock *sk = sock->sk;
3405         int addr_len = 0;
3406         int err;
3407
3408         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3409                                    flags & ~MSG_DONTWAIT, &addr_len);
3410         if (err >= 0)
3411                 msg->msg_namelen = addr_len;
3412         return err;
3413 }
3414 EXPORT_SYMBOL(sock_common_recvmsg);
3415
3416 /*
3417  *      Set socket options on an inet socket.
3418  */
3419 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3420                            sockptr_t optval, unsigned int optlen)
3421 {
3422         struct sock *sk = sock->sk;
3423
3424         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3425 }
3426 EXPORT_SYMBOL(sock_common_setsockopt);
3427
3428 void sk_common_release(struct sock *sk)
3429 {
3430         if (sk->sk_prot->destroy)
3431                 sk->sk_prot->destroy(sk);
3432
3433         /*
3434          * Observation: when sk_common_release is called, processes have
3435          * no access to socket. But net still has.
3436          * Step one, detach it from networking:
3437          *
3438          * A. Remove from hash tables.
3439          */
3440
3441         sk->sk_prot->unhash(sk);
3442
3443         /*
3444          * In this point socket cannot receive new packets, but it is possible
3445          * that some packets are in flight because some CPU runs receiver and
3446          * did hash table lookup before we unhashed socket. They will achieve
3447          * receive queue and will be purged by socket destructor.
3448          *
3449          * Also we still have packets pending on receive queue and probably,
3450          * our own packets waiting in device queues. sock_destroy will drain
3451          * receive queue, but transmitted packets will delay socket destruction
3452          * until the last reference will be released.
3453          */
3454
3455         sock_orphan(sk);
3456
3457         xfrm_sk_free_policy(sk);
3458
3459         sk_refcnt_debug_release(sk);
3460
3461         sock_put(sk);
3462 }
3463 EXPORT_SYMBOL(sk_common_release);
3464
3465 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3466 {
3467         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3468
3469         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3470         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3471         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3472         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3473         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3474         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3475         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3476         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3477         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3478 }
3479
3480 #ifdef CONFIG_PROC_FS
3481 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
3482 struct prot_inuse {
3483         int val[PROTO_INUSE_NR];
3484 };
3485
3486 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3487
3488 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3489 {
3490         __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3491 }
3492 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3493
3494 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3495 {
3496         int cpu, idx = prot->inuse_idx;
3497         int res = 0;
3498
3499         for_each_possible_cpu(cpu)
3500                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3501
3502         return res >= 0 ? res : 0;
3503 }
3504 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3505
3506 static void sock_inuse_add(struct net *net, int val)
3507 {
3508         this_cpu_add(*net->core.sock_inuse, val);
3509 }
3510
3511 int sock_inuse_get(struct net *net)
3512 {
3513         int cpu, res = 0;
3514
3515         for_each_possible_cpu(cpu)
3516                 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3517
3518         return res;
3519 }
3520
3521 EXPORT_SYMBOL_GPL(sock_inuse_get);
3522
3523 static int __net_init sock_inuse_init_net(struct net *net)
3524 {
3525         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3526         if (net->core.prot_inuse == NULL)
3527                 return -ENOMEM;
3528
3529         net->core.sock_inuse = alloc_percpu(int);
3530         if (net->core.sock_inuse == NULL)
3531                 goto out;
3532
3533         return 0;
3534
3535 out:
3536         free_percpu(net->core.prot_inuse);
3537         return -ENOMEM;
3538 }
3539
3540 static void __net_exit sock_inuse_exit_net(struct net *net)
3541 {
3542         free_percpu(net->core.prot_inuse);
3543         free_percpu(net->core.sock_inuse);
3544 }
3545
3546 static struct pernet_operations net_inuse_ops = {
3547         .init = sock_inuse_init_net,
3548         .exit = sock_inuse_exit_net,
3549 };
3550
3551 static __init int net_inuse_init(void)
3552 {
3553         if (register_pernet_subsys(&net_inuse_ops))
3554                 panic("Cannot initialize net inuse counters");
3555
3556         return 0;
3557 }
3558
3559 core_initcall(net_inuse_init);
3560
3561 static int assign_proto_idx(struct proto *prot)
3562 {
3563         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3564
3565         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3566                 pr_err("PROTO_INUSE_NR exhausted\n");
3567                 return -ENOSPC;
3568         }
3569
3570         set_bit(prot->inuse_idx, proto_inuse_idx);
3571         return 0;
3572 }
3573
3574 static void release_proto_idx(struct proto *prot)
3575 {
3576         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3577                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3578 }
3579 #else
3580 static inline int assign_proto_idx(struct proto *prot)
3581 {
3582         return 0;
3583 }
3584
3585 static inline void release_proto_idx(struct proto *prot)
3586 {
3587 }
3588
3589 static void sock_inuse_add(struct net *net, int val)
3590 {
3591 }
3592 #endif
3593
3594 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3595 {
3596         if (!twsk_prot)
3597                 return;
3598         kfree(twsk_prot->twsk_slab_name);
3599         twsk_prot->twsk_slab_name = NULL;
3600         kmem_cache_destroy(twsk_prot->twsk_slab);
3601         twsk_prot->twsk_slab = NULL;
3602 }
3603
3604 static int tw_prot_init(const struct proto *prot)
3605 {
3606         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3607
3608         if (!twsk_prot)
3609                 return 0;
3610
3611         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3612                                               prot->name);
3613         if (!twsk_prot->twsk_slab_name)
3614                 return -ENOMEM;
3615
3616         twsk_prot->twsk_slab =
3617                 kmem_cache_create(twsk_prot->twsk_slab_name,
3618                                   twsk_prot->twsk_obj_size, 0,
3619                                   SLAB_ACCOUNT | prot->slab_flags,
3620                                   NULL);
3621         if (!twsk_prot->twsk_slab) {
3622                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3623                         prot->name);
3624                 return -ENOMEM;
3625         }
3626
3627         return 0;
3628 }
3629
3630 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3631 {
3632         if (!rsk_prot)
3633                 return;
3634         kfree(rsk_prot->slab_name);
3635         rsk_prot->slab_name = NULL;
3636         kmem_cache_destroy(rsk_prot->slab);
3637         rsk_prot->slab = NULL;
3638 }
3639
3640 static int req_prot_init(const struct proto *prot)
3641 {
3642         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3643
3644         if (!rsk_prot)
3645                 return 0;
3646
3647         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3648                                         prot->name);
3649         if (!rsk_prot->slab_name)
3650                 return -ENOMEM;
3651
3652         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3653                                            rsk_prot->obj_size, 0,
3654                                            SLAB_ACCOUNT | prot->slab_flags,
3655                                            NULL);
3656
3657         if (!rsk_prot->slab) {
3658                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3659                         prot->name);
3660                 return -ENOMEM;
3661         }
3662         return 0;
3663 }
3664
3665 int proto_register(struct proto *prot, int alloc_slab)
3666 {
3667         int ret = -ENOBUFS;
3668
3669         if (alloc_slab) {
3670                 prot->slab = kmem_cache_create_usercopy(prot->name,
3671                                         prot->obj_size, 0,
3672                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3673                                         prot->slab_flags,
3674                                         prot->useroffset, prot->usersize,
3675                                         NULL);
3676
3677                 if (prot->slab == NULL) {
3678                         pr_crit("%s: Can't create sock SLAB cache!\n",
3679                                 prot->name);
3680                         goto out;
3681                 }
3682
3683                 if (req_prot_init(prot))
3684                         goto out_free_request_sock_slab;
3685
3686                 if (tw_prot_init(prot))
3687                         goto out_free_timewait_sock_slab;
3688         }
3689
3690         mutex_lock(&proto_list_mutex);
3691         ret = assign_proto_idx(prot);
3692         if (ret) {
3693                 mutex_unlock(&proto_list_mutex);
3694                 goto out_free_timewait_sock_slab;
3695         }
3696         list_add(&prot->node, &proto_list);
3697         mutex_unlock(&proto_list_mutex);
3698         return ret;
3699
3700 out_free_timewait_sock_slab:
3701         if (alloc_slab)
3702                 tw_prot_cleanup(prot->twsk_prot);
3703 out_free_request_sock_slab:
3704         if (alloc_slab) {
3705                 req_prot_cleanup(prot->rsk_prot);
3706
3707                 kmem_cache_destroy(prot->slab);
3708                 prot->slab = NULL;
3709         }
3710 out:
3711         return ret;
3712 }
3713 EXPORT_SYMBOL(proto_register);
3714
3715 void proto_unregister(struct proto *prot)
3716 {
3717         mutex_lock(&proto_list_mutex);
3718         release_proto_idx(prot);
3719         list_del(&prot->node);
3720         mutex_unlock(&proto_list_mutex);
3721
3722         kmem_cache_destroy(prot->slab);
3723         prot->slab = NULL;
3724
3725         req_prot_cleanup(prot->rsk_prot);
3726         tw_prot_cleanup(prot->twsk_prot);
3727 }
3728 EXPORT_SYMBOL(proto_unregister);
3729
3730 int sock_load_diag_module(int family, int protocol)
3731 {
3732         if (!protocol) {
3733                 if (!sock_is_registered(family))
3734                         return -ENOENT;
3735
3736                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3737                                       NETLINK_SOCK_DIAG, family);
3738         }
3739
3740 #ifdef CONFIG_INET
3741         if (family == AF_INET &&
3742             protocol != IPPROTO_RAW &&
3743             protocol < MAX_INET_PROTOS &&
3744             !rcu_access_pointer(inet_protos[protocol]))
3745                 return -ENOENT;
3746 #endif
3747
3748         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3749                               NETLINK_SOCK_DIAG, family, protocol);
3750 }
3751 EXPORT_SYMBOL(sock_load_diag_module);
3752
3753 #ifdef CONFIG_PROC_FS
3754 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3755         __acquires(proto_list_mutex)
3756 {
3757         mutex_lock(&proto_list_mutex);
3758         return seq_list_start_head(&proto_list, *pos);
3759 }
3760
3761 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3762 {
3763         return seq_list_next(v, &proto_list, pos);
3764 }
3765
3766 static void proto_seq_stop(struct seq_file *seq, void *v)
3767         __releases(proto_list_mutex)
3768 {
3769         mutex_unlock(&proto_list_mutex);
3770 }
3771
3772 static char proto_method_implemented(const void *method)
3773 {
3774         return method == NULL ? 'n' : 'y';
3775 }
3776 static long sock_prot_memory_allocated(struct proto *proto)
3777 {
3778         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3779 }
3780
3781 static const char *sock_prot_memory_pressure(struct proto *proto)
3782 {
3783         return proto->memory_pressure != NULL ?
3784         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3785 }
3786
3787 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3788 {
3789
3790         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3791                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3792                    proto->name,
3793                    proto->obj_size,
3794                    sock_prot_inuse_get(seq_file_net(seq), proto),
3795                    sock_prot_memory_allocated(proto),
3796                    sock_prot_memory_pressure(proto),
3797                    proto->max_header,
3798                    proto->slab == NULL ? "no" : "yes",
3799                    module_name(proto->owner),
3800                    proto_method_implemented(proto->close),
3801                    proto_method_implemented(proto->connect),
3802                    proto_method_implemented(proto->disconnect),
3803                    proto_method_implemented(proto->accept),
3804                    proto_method_implemented(proto->ioctl),
3805                    proto_method_implemented(proto->init),
3806                    proto_method_implemented(proto->destroy),
3807                    proto_method_implemented(proto->shutdown),
3808                    proto_method_implemented(proto->setsockopt),
3809                    proto_method_implemented(proto->getsockopt),
3810                    proto_method_implemented(proto->sendmsg),
3811                    proto_method_implemented(proto->recvmsg),
3812                    proto_method_implemented(proto->sendpage),
3813                    proto_method_implemented(proto->bind),
3814                    proto_method_implemented(proto->backlog_rcv),
3815                    proto_method_implemented(proto->hash),
3816                    proto_method_implemented(proto->unhash),
3817                    proto_method_implemented(proto->get_port),
3818                    proto_method_implemented(proto->enter_memory_pressure));
3819 }
3820
3821 static int proto_seq_show(struct seq_file *seq, void *v)
3822 {
3823         if (v == &proto_list)
3824                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3825                            "protocol",
3826                            "size",
3827                            "sockets",
3828                            "memory",
3829                            "press",
3830                            "maxhdr",
3831                            "slab",
3832                            "module",
3833                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3834         else
3835                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3836         return 0;
3837 }
3838
3839 static const struct seq_operations proto_seq_ops = {
3840         .start  = proto_seq_start,
3841         .next   = proto_seq_next,
3842         .stop   = proto_seq_stop,
3843         .show   = proto_seq_show,
3844 };
3845
3846 static __net_init int proto_init_net(struct net *net)
3847 {
3848         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3849                         sizeof(struct seq_net_private)))
3850                 return -ENOMEM;
3851
3852         return 0;
3853 }
3854
3855 static __net_exit void proto_exit_net(struct net *net)
3856 {
3857         remove_proc_entry("protocols", net->proc_net);
3858 }
3859
3860
3861 static __net_initdata struct pernet_operations proto_net_ops = {
3862         .init = proto_init_net,
3863         .exit = proto_exit_net,
3864 };
3865
3866 static int __init proto_init(void)
3867 {
3868         return register_pernet_subsys(&proto_net_ops);
3869 }
3870
3871 subsys_initcall(proto_init);
3872
3873 #endif /* PROC_FS */
3874
3875 #ifdef CONFIG_NET_RX_BUSY_POLL
3876 bool sk_busy_loop_end(void *p, unsigned long start_time)
3877 {
3878         struct sock *sk = p;
3879
3880         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3881                sk_busy_loop_timeout(sk, start_time);
3882 }
3883 EXPORT_SYMBOL(sk_busy_loop_end);
3884 #endif /* CONFIG_NET_RX_BUSY_POLL */
3885
3886 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3887 {
3888         if (!sk->sk_prot->bind_add)
3889                 return -EOPNOTSUPP;
3890         return sk->sk_prot->bind_add(sk, addr, addr_len);
3891 }
3892 EXPORT_SYMBOL(sock_bind_add);