net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/init.h>
 111 #include <linux/highmem.h>
 112 #include <linux/user_namespace.h>
 113 #include <linux/static_key.h>
 114 #include <linux/memcontrol.h>
 115 #include <linux/prefetch.h>
 116
 117 #include <linux/uaccess.h>
 118
 119 #include <linux/netdevice.h>
 120 #include <net/protocol.h>
 121 #include <linux/skbuff.h>
 122 #include <net/net_namespace.h>
 123 #include <net/request_sock.h>
 124 #include <net/sock.h>
 125 #include <linux/net_tstamp.h>
 126 #include <net/xfrm.h>
 127 #include <linux/ipsec.h>
 128 #include <net/cls_cgroup.h>
 129 #include <net/netprio_cgroup.h>
 130 #include <linux/sock_diag.h>
 131
 132 #include <linux/filter.h>
 133 #include <net/sock_reuseport.h>
 134 #include <net/bpf_sk_storage.h>
 135
 136 #include <trace/events/sock.h>
 137
 138 #include <net/tcp.h>
 139 #include <net/busy_poll.h>
 140
 141 static DEFINE_MUTEX(proto_list_mutex);
 142 static LIST_HEAD(proto_list);
 143
 144 static void sock_inuse_add(struct net *net, int val);
 145
 146 /**
 147  * sk_ns_capable - General socket capability test
 148  * @sk: Socket to use a capability on or through
 149  * @user_ns: The user namespace of the capability to use
 150  * @cap: The capability to use
 151  *
 152  * Test to see if the opener of the socket had when the socket was
 153  * created and the current process has the capability @cap in the user
 154  * namespace @user_ns.
 155  */
 156 bool sk_ns_capable(const struct sock *sk,
 157                    struct user_namespace *user_ns, int cap)
 158 {
 159         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 160                 ns_capable(user_ns, cap);
 161 }
 162 EXPORT_SYMBOL(sk_ns_capable);
 163
 164 /**
 165  * sk_capable - Socket global capability test
 166  * @sk: Socket to use a capability on or through
 167  * @cap: The global capability to use
 168  *
 169  * Test to see if the opener of the socket had when the socket was
 170  * created and the current process has the capability @cap in all user
 171  * namespaces.
 172  */
 173 bool sk_capable(const struct sock *sk, int cap)
 174 {
 175         return sk_ns_capable(sk, &init_user_ns, cap);
 176 }
 177 EXPORT_SYMBOL(sk_capable);
 178
 179 /**
 180  * sk_net_capable - Network namespace socket capability test
 181  * @sk: Socket to use a capability on or through
 182  * @cap: The capability to use
 183  *
 184  * Test to see if the opener of the socket had when the socket was created
 185  * and the current process has the capability @cap over the network namespace
 186  * the socket is a member of.
 187  */
 188 bool sk_net_capable(const struct sock *sk, int cap)
 189 {
 190         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 191 }
 192 EXPORT_SYMBOL(sk_net_capable);
 193
 194 /*
 195  * Each address family might have different locking rules, so we have
 196  * one slock key per address family and separate keys for internal and
 197  * userspace sockets.
 198  */
 199 static struct lock_class_key af_family_keys[AF_MAX];
 200 static struct lock_class_key af_family_kern_keys[AF_MAX];
 201 static struct lock_class_key af_family_slock_keys[AF_MAX];
 202 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 203
 204 /*
 205  * Make lock validator output more readable. (we pre-construct these
 206  * strings build-time, so that runtime initialization of socket
 207  * locks is fast):
 208  */
 209
 210 #define _sock_locks(x)                                            \
 211   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 212   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 213   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 214   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 215   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 216   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 217   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 218   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 219   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 220   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 221   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 222   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 223   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 224   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 225   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 226   x "AF_MAX"
 227
 228 static const char *const af_family_key_strings[AF_MAX+1] = {
 229         _sock_locks("sk_lock-")
 230 };
 231 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 232         _sock_locks("slock-")
 233 };
 234 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 235         _sock_locks("clock-")
 236 };
 237
 238 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 239         _sock_locks("k-sk_lock-")
 240 };
 241 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 242         _sock_locks("k-slock-")
 243 };
 244 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 245         _sock_locks("k-clock-")
 246 };
 247 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 248         _sock_locks("rlock-")
 249 };
 250 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 251         _sock_locks("wlock-")
 252 };
 253 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 254         _sock_locks("elock-")
 255 };
 256
 257 /*
 258  * sk_callback_lock and sk queues locking rules are per-address-family,
 259  * so split the lock classes by using a per-AF key:
 260  */
 261 static struct lock_class_key af_callback_keys[AF_MAX];
 262 static struct lock_class_key af_rlock_keys[AF_MAX];
 263 static struct lock_class_key af_wlock_keys[AF_MAX];
 264 static struct lock_class_key af_elock_keys[AF_MAX];
 265 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 266
 267 /* Run time adjustable parameters. */
 268 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 269 EXPORT_SYMBOL(sysctl_wmem_max);
 270 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 271 EXPORT_SYMBOL(sysctl_rmem_max);
 272 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 273 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 274
 275 /* Maximal space eaten by iovec or ancillary data plus some space */
 276 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 277 EXPORT_SYMBOL(sysctl_optmem_max);
 278
 279 int sysctl_tstamp_allow_data __read_mostly = 1;
 280
 281 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 282 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 283
 284 /**
 285  * sk_set_memalloc - sets %SOCK_MEMALLOC
 286  * @sk: socket to set it on
 287  *
 288  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 289  * It's the responsibility of the admin to adjust min_free_kbytes
 290  * to meet the requirements
 291  */
 292 void sk_set_memalloc(struct sock *sk)
 293 {
 294         sock_set_flag(sk, SOCK_MEMALLOC);
 295         sk->sk_allocation |= __GFP_MEMALLOC;
 296         static_branch_inc(&memalloc_socks_key);
 297 }
 298 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 299
 300 void sk_clear_memalloc(struct sock *sk)
 301 {
 302         sock_reset_flag(sk, SOCK_MEMALLOC);
 303         sk->sk_allocation &= ~__GFP_MEMALLOC;
 304         static_branch_dec(&memalloc_socks_key);
 305
 306         /*
 307          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 308          * progress of swapping. SOCK_MEMALLOC may be cleared while
 309          * it has rmem allocations due to the last swapfile being deactivated
 310          * but there is a risk that the socket is unusable due to exceeding
 311          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 312          */
 313         sk_mem_reclaim(sk);
 314 }
 315 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 316
 317 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 318 {
 319         int ret;
 320         unsigned int noreclaim_flag;
 321
 322         /* these should have been dropped before queueing */
 323         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 324
 325         noreclaim_flag = memalloc_noreclaim_save();
 326         ret = sk->sk_backlog_rcv(sk, skb);
 327         memalloc_noreclaim_restore(noreclaim_flag);
 328
 329         return ret;
 330 }
 331 EXPORT_SYMBOL(__sk_backlog_rcv);
 332
 333 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 334 {
 335         struct __kernel_sock_timeval tv;
 336         int size;
 337
 338         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 339                 tv.tv_sec = 0;
 340                 tv.tv_usec = 0;
 341         } else {
 342                 tv.tv_sec = timeo / HZ;
 343                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 344         }
 345
 346         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 347                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 348                 *(struct old_timeval32 *)optval = tv32;
 349                 return sizeof(tv32);
 350         }
 351
 352         if (old_timeval) {
 353                 struct __kernel_old_timeval old_tv;
 354                 old_tv.tv_sec = tv.tv_sec;
 355                 old_tv.tv_usec = tv.tv_usec;
 356                 *(struct __kernel_old_timeval *)optval = old_tv;
 357                 size = sizeof(old_tv);
 358         } else {
 359                 *(struct __kernel_sock_timeval *)optval = tv;
 360                 size = sizeof(tv);
 361         }
 362
 363         return size;
 364 }
 365
 366 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
 367 {
 368         struct __kernel_sock_timeval tv;
 369
 370         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 371                 struct old_timeval32 tv32;
 372
 373                 if (optlen < sizeof(tv32))
 374                         return -EINVAL;
 375
 376                 if (copy_from_user(&tv32, optval, sizeof(tv32)))
 377                         return -EFAULT;
 378                 tv.tv_sec = tv32.tv_sec;
 379                 tv.tv_usec = tv32.tv_usec;
 380         } else if (old_timeval) {
 381                 struct __kernel_old_timeval old_tv;
 382
 383                 if (optlen < sizeof(old_tv))
 384                         return -EINVAL;
 385                 if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
 386                         return -EFAULT;
 387                 tv.tv_sec = old_tv.tv_sec;
 388                 tv.tv_usec = old_tv.tv_usec;
 389         } else {
 390                 if (optlen < sizeof(tv))
 391                         return -EINVAL;
 392                 if (copy_from_user(&tv, optval, sizeof(tv)))
 393                         return -EFAULT;
 394         }
 395         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 396                 return -EDOM;
 397
 398         if (tv.tv_sec < 0) {
 399                 static int warned __read_mostly;
 400
 401                 *timeo_p = 0;
 402                 if (warned < 10 && net_ratelimit()) {
 403                         warned++;
 404                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 405                                 __func__, current->comm, task_pid_nr(current));
 406                 }
 407                 return 0;
 408         }
 409         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 410         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 411                 return 0;
 412         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 413                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 414         return 0;
 415 }
 416
 417 static void sock_warn_obsolete_bsdism(const char *name)
 418 {
 419         static int warned;
 420         static char warncomm[TASK_COMM_LEN];
 421         if (strcmp(warncomm, current->comm) && warned < 5) {
 422                 strcpy(warncomm,  current->comm);
 423                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 424                         warncomm, name);
 425                 warned++;
 426         }
 427 }
 428
 429 static bool sock_needs_netstamp(const struct sock *sk)
 430 {
 431         switch (sk->sk_family) {
 432         case AF_UNSPEC:
 433         case AF_UNIX:
 434                 return false;
 435         default:
 436                 return true;
 437         }
 438 }
 439
 440 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 441 {
 442         if (sk->sk_flags & flags) {
 443                 sk->sk_flags &= ~flags;
 444                 if (sock_needs_netstamp(sk) &&
 445                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 446                         net_disable_timestamp();
 447         }
 448 }
 449
 450
 451 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 452 {
 453         unsigned long flags;
 454         struct sk_buff_head *list = &sk->sk_receive_queue;
 455
 456         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 457                 atomic_inc(&sk->sk_drops);
 458                 trace_sock_rcvqueue_full(sk, skb);
 459                 return -ENOMEM;
 460         }
 461
 462         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 463                 atomic_inc(&sk->sk_drops);
 464                 return -ENOBUFS;
 465         }
 466
 467         skb->dev = NULL;
 468         skb_set_owner_r(skb, sk);
 469
 470         /* we escape from rcu protected region, make sure we dont leak
 471          * a norefcounted dst
 472          */
 473         skb_dst_force(skb);
 474
 475         spin_lock_irqsave(&list->lock, flags);
 476         sock_skb_set_dropcount(sk, skb);
 477         __skb_queue_tail(list, skb);
 478         spin_unlock_irqrestore(&list->lock, flags);
 479
 480         if (!sock_flag(sk, SOCK_DEAD))
 481                 sk->sk_data_ready(sk);
 482         return 0;
 483 }
 484 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 485
 486 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 487 {
 488         int err;
 489
 490         err = sk_filter(sk, skb);
 491         if (err)
 492                 return err;
 493
 494         return __sock_queue_rcv_skb(sk, skb);
 495 }
 496 EXPORT_SYMBOL(sock_queue_rcv_skb);
 497
 498 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 499                      const int nested, unsigned int trim_cap, bool refcounted)
 500 {
 501         int rc = NET_RX_SUCCESS;
 502
 503         if (sk_filter_trim_cap(sk, skb, trim_cap))
 504                 goto discard_and_relse;
 505
 506         skb->dev = NULL;
 507
 508         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 509                 atomic_inc(&sk->sk_drops);
 510                 goto discard_and_relse;
 511         }
 512         if (nested)
 513                 bh_lock_sock_nested(sk);
 514         else
 515                 bh_lock_sock(sk);
 516         if (!sock_owned_by_user(sk)) {
 517                 /*
 518                  * trylock + unlock semantics:
 519                  */
 520                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 521
 522                 rc = sk_backlog_rcv(sk, skb);
 523
 524                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 525         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 526                 bh_unlock_sock(sk);
 527                 atomic_inc(&sk->sk_drops);
 528                 goto discard_and_relse;
 529         }
 530
 531         bh_unlock_sock(sk);
 532 out:
 533         if (refcounted)
 534                 sock_put(sk);
 535         return rc;
 536 discard_and_relse:
 537         kfree_skb(skb);
 538         goto out;
 539 }
 540 EXPORT_SYMBOL(__sk_receive_skb);
 541
 542 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 543 {
 544         struct dst_entry *dst = __sk_dst_get(sk);
 545
 546         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 547                 sk_tx_queue_clear(sk);
 548                 sk->sk_dst_pending_confirm = 0;
 549                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 550                 dst_release(dst);
 551                 return NULL;
 552         }
 553
 554         return dst;
 555 }
 556 EXPORT_SYMBOL(__sk_dst_check);
 557
 558 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 559 {
 560         struct dst_entry *dst = sk_dst_get(sk);
 561
 562         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 563                 sk_dst_reset(sk);
 564                 dst_release(dst);
 565                 return NULL;
 566         }
 567
 568         return dst;
 569 }
 570 EXPORT_SYMBOL(sk_dst_check);
 571
 572 static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
 573 {
 574         int ret = -ENOPROTOOPT;
 575 #ifdef CONFIG_NETDEVICES
 576         struct net *net = sock_net(sk);
 577
 578         /* Sorry... */
 579         ret = -EPERM;
 580         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 581                 goto out;
 582
 583         ret = -EINVAL;
 584         if (ifindex < 0)
 585                 goto out;
 586
 587         sk->sk_bound_dev_if = ifindex;
 588         if (sk->sk_prot->rehash)
 589                 sk->sk_prot->rehash(sk);
 590         sk_dst_reset(sk);
 591
 592         ret = 0;
 593
 594 out:
 595 #endif
 596
 597         return ret;
 598 }
 599
 600 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 601                                 int optlen)
 602 {
 603         int ret = -ENOPROTOOPT;
 604 #ifdef CONFIG_NETDEVICES
 605         struct net *net = sock_net(sk);
 606         char devname[IFNAMSIZ];
 607         int index;
 608
 609         ret = -EINVAL;
 610         if (optlen < 0)
 611                 goto out;
 612
 613         /* Bind this socket to a particular device like "eth0",
 614          * as specified in the passed interface name. If the
 615          * name is "" or the option length is zero the socket
 616          * is not bound.
 617          */
 618         if (optlen > IFNAMSIZ - 1)
 619                 optlen = IFNAMSIZ - 1;
 620         memset(devname, 0, sizeof(devname));
 621
 622         ret = -EFAULT;
 623         if (copy_from_user(devname, optval, optlen))
 624                 goto out;
 625
 626         index = 0;
 627         if (devname[0] != '\0') {
 628                 struct net_device *dev;
 629
 630                 rcu_read_lock();
 631                 dev = dev_get_by_name_rcu(net, devname);
 632                 if (dev)
 633                         index = dev->ifindex;
 634                 rcu_read_unlock();
 635                 ret = -ENODEV;
 636                 if (!dev)
 637                         goto out;
 638         }
 639
 640         lock_sock(sk);
 641         ret = sock_setbindtodevice_locked(sk, index);
 642         release_sock(sk);
 643
 644 out:
 645 #endif
 646
 647         return ret;
 648 }
 649
 650 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 651                                 int __user *optlen, int len)
 652 {
 653         int ret = -ENOPROTOOPT;
 654 #ifdef CONFIG_NETDEVICES
 655         struct net *net = sock_net(sk);
 656         char devname[IFNAMSIZ];
 657
 658         if (sk->sk_bound_dev_if == 0) {
 659                 len = 0;
 660                 goto zero;
 661         }
 662
 663         ret = -EINVAL;
 664         if (len < IFNAMSIZ)
 665                 goto out;
 666
 667         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 668         if (ret)
 669                 goto out;
 670
 671         len = strlen(devname) + 1;
 672
 673         ret = -EFAULT;
 674         if (copy_to_user(optval, devname, len))
 675                 goto out;
 676
 677 zero:
 678         ret = -EFAULT;
 679         if (put_user(len, optlen))
 680                 goto out;
 681
 682         ret = 0;
 683
 684 out:
 685 #endif
 686
 687         return ret;
 688 }
 689
 690 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 691 {
 692         if (valbool)
 693                 sock_set_flag(sk, bit);
 694         else
 695                 sock_reset_flag(sk, bit);
 696 }
 697
 698 bool sk_mc_loop(struct sock *sk)
 699 {
 700         if (dev_recursion_level())
 701                 return false;
 702         if (!sk)
 703                 return true;
 704         /* IPV6_ADDRFORM can change sk->sk_family under us. */
 705         switch (READ_ONCE(sk->sk_family)) {
 706         case AF_INET:
 707                 return inet_sk(sk)->mc_loop;
 708 #if IS_ENABLED(CONFIG_IPV6)
 709         case AF_INET6:
 710                 return inet6_sk(sk)->mc_loop;
 711 #endif
 712         }
 713         WARN_ON_ONCE(1);
 714         return true;
 715 }
 716 EXPORT_SYMBOL(sk_mc_loop);
 717
 718 /*
 719  *      This is meant for all protocols to use and covers goings on
 720  *      at the socket level. Everything here is generic.
 721  */
 722
 723 int sock_setsockopt(struct socket *sock, int level, int optname,
 724                     char __user *optval, unsigned int optlen)
 725 {
 726         struct sock_txtime sk_txtime;
 727         struct sock *sk = sock->sk;
 728         int val;
 729         int valbool;
 730         struct linger ling;
 731         int ret = 0;
 732
 733         /*
 734          *      Options without arguments
 735          */
 736
 737         if (optname == SO_BINDTODEVICE)
 738                 return sock_setbindtodevice(sk, optval, optlen);
 739
 740         if (optlen < sizeof(int))
 741                 return -EINVAL;
 742
 743         if (get_user(val, (int __user *)optval))
 744                 return -EFAULT;
 745
 746         valbool = val ? 1 : 0;
 747
 748         lock_sock(sk);
 749
 750         switch (optname) {
 751         case SO_DEBUG:
 752                 if (val && !capable(CAP_NET_ADMIN))
 753                         ret = -EACCES;
 754                 else
 755                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 756                 break;
 757         case SO_REUSEADDR:
 758                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 759                 break;
 760         case SO_REUSEPORT:
 761                 sk->sk_reuseport = valbool;
 762                 break;
 763         case SO_TYPE:
 764         case SO_PROTOCOL:
 765         case SO_DOMAIN:
 766         case SO_ERROR:
 767                 ret = -ENOPROTOOPT;
 768                 break;
 769         case SO_DONTROUTE:
 770                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 771                 sk_dst_reset(sk);
 772                 break;
 773         case SO_BROADCAST:
 774                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 775                 break;
 776         case SO_SNDBUF:
 777                 /* Don't error on this BSD doesn't and if you think
 778                  * about it this is right. Otherwise apps have to
 779                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 780                  * are treated in BSD as hints
 781                  */
 782                 val = min_t(u32, val, sysctl_wmem_max);
 783 set_sndbuf:
 784                 /* Ensure val * 2 fits into an int, to prevent max_t()
 785                  * from treating it as a negative value.
 786                  */
 787                 val = min_t(int, val, INT_MAX / 2);
 788                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 789                 WRITE_ONCE(sk->sk_sndbuf,
 790                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
 791                 /* Wake up sending tasks if we upped the value. */
 792                 sk->sk_write_space(sk);
 793                 break;
 794
 795         case SO_SNDBUFFORCE:
 796                 if (!capable(CAP_NET_ADMIN)) {
 797                         ret = -EPERM;
 798                         break;
 799                 }
 800
 801                 /* No negative values (to prevent underflow, as val will be
 802                  * multiplied by 2).
 803                  */
 804                 if (val < 0)
 805                         val = 0;
 806                 goto set_sndbuf;
 807
 808         case SO_RCVBUF:
 809                 /* Don't error on this BSD doesn't and if you think
 810                  * about it this is right. Otherwise apps have to
 811                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 812                  * are treated in BSD as hints
 813                  */
 814                 val = min_t(u32, val, sysctl_rmem_max);
 815 set_rcvbuf:
 816                 /* Ensure val * 2 fits into an int, to prevent max_t()
 817                  * from treating it as a negative value.
 818                  */
 819                 val = min_t(int, val, INT_MAX / 2);
 820                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 821                 /*
 822                  * We double it on the way in to account for
 823                  * "struct sk_buff" etc. overhead.   Applications
 824                  * assume that the SO_RCVBUF setting they make will
 825                  * allow that much actual data to be received on that
 826                  * socket.
 827                  *
 828                  * Applications are unaware that "struct sk_buff" and
 829                  * other overheads allocate from the receive buffer
 830                  * during socket buffer allocation.
 831                  *
 832                  * And after considering the possible alternatives,
 833                  * returning the value we actually used in getsockopt
 834                  * is the most desirable behavior.
 835                  */
 836                 WRITE_ONCE(sk->sk_rcvbuf,
 837                            max_t(int, val * 2, SOCK_MIN_RCVBUF));
 838                 break;
 839
 840         case SO_RCVBUFFORCE:
 841                 if (!capable(CAP_NET_ADMIN)) {
 842                         ret = -EPERM;
 843                         break;
 844                 }
 845
 846                 /* No negative values (to prevent underflow, as val will be
 847                  * multiplied by 2).
 848                  */
 849                 if (val < 0)
 850                         val = 0;
 851                 goto set_rcvbuf;
 852
 853         case SO_KEEPALIVE:
 854                 if (sk->sk_prot->keepalive)
 855                         sk->sk_prot->keepalive(sk, valbool);
 856                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 857                 break;
 858
 859         case SO_OOBINLINE:
 860                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 861                 break;
 862
 863         case SO_NO_CHECK:
 864                 sk->sk_no_check_tx = valbool;
 865                 break;
 866
 867         case SO_PRIORITY:
 868                 if ((val >= 0 && val <= 6) ||
 869                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 870                         sk->sk_priority = val;
 871                 else
 872                         ret = -EPERM;
 873                 break;
 874
 875         case SO_LINGER:
 876                 if (optlen < sizeof(ling)) {
 877                         ret = -EINVAL;  /* 1003.1g */
 878                         break;
 879                 }
 880                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 881                         ret = -EFAULT;
 882                         break;
 883                 }
 884                 if (!ling.l_onoff)
 885                         sock_reset_flag(sk, SOCK_LINGER);
 886                 else {
 887 #if (BITS_PER_LONG == 32)
 888                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 889                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 890                         else
 891 #endif
 892                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 893                         sock_set_flag(sk, SOCK_LINGER);
 894                 }
 895                 break;
 896
 897         case SO_BSDCOMPAT:
 898                 sock_warn_obsolete_bsdism("setsockopt");
 899                 break;
 900
 901         case SO_PASSCRED:
 902                 if (valbool)
 903                         set_bit(SOCK_PASSCRED, &sock->flags);
 904                 else
 905                         clear_bit(SOCK_PASSCRED, &sock->flags);
 906                 break;
 907
 908         case SO_TIMESTAMP_OLD:
 909         case SO_TIMESTAMP_NEW:
 910         case SO_TIMESTAMPNS_OLD:
 911         case SO_TIMESTAMPNS_NEW:
 912                 if (valbool)  {
 913                         if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
 914                                 sock_set_flag(sk, SOCK_TSTAMP_NEW);
 915                         else
 916                                 sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 917
 918                         if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
 919                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 920                         else
 921                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 922                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 923                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 924                 } else {
 925                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 926                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 927                 }
 928                 break;
 929
 930         case SO_TIMESTAMPING_NEW:
 931         case SO_TIMESTAMPING_OLD:
 932                 if (val & ~SOF_TIMESTAMPING_MASK) {
 933                         ret = -EINVAL;
 934                         break;
 935                 }
 936
 937                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 938                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 939                         if (sk->sk_protocol == IPPROTO_TCP &&
 940                             sk->sk_type == SOCK_STREAM) {
 941                                 if ((1 << sk->sk_state) &
 942                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 943                                         ret = -EINVAL;
 944                                         break;
 945                                 }
 946                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 947                         } else {
 948                                 sk->sk_tskey = 0;
 949                         }
 950                 }
 951
 952                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 953                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 954                         ret = -EINVAL;
 955                         break;
 956                 }
 957
 958                 sk->sk_tsflags = val;
 959                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 960
 961                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 962                         sock_enable_timestamp(sk,
 963                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 964                 else
 965                         sock_disable_timestamp(sk,
 966                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 967                 break;
 968
 969         case SO_RCVLOWAT:
 970                 if (val < 0)
 971                         val = INT_MAX;
 972                 if (sock->ops->set_rcvlowat)
 973                         ret = sock->ops->set_rcvlowat(sk, val);
 974                 else
 975                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
 976                 break;
 977
 978         case SO_RCVTIMEO_OLD:
 979         case SO_RCVTIMEO_NEW:
 980                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
 981                 break;
 982
 983         case SO_SNDTIMEO_OLD:
 984         case SO_SNDTIMEO_NEW:
 985                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
 986                 break;
 987
 988         case SO_ATTACH_FILTER:
 989                 ret = -EINVAL;
 990                 if (optlen == sizeof(struct sock_fprog)) {
 991                         struct sock_fprog fprog;
 992
 993                         ret = -EFAULT;
 994                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 995                                 break;
 996
 997                         ret = sk_attach_filter(&fprog, sk);
 998                 }
 999                 break;
1000
1001         case SO_ATTACH_BPF:
1002                 ret = -EINVAL;
1003                 if (optlen == sizeof(u32)) {
1004                         u32 ufd;
1005
1006                         ret = -EFAULT;
1007                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
1008                                 break;
1009
1010                         ret = sk_attach_bpf(ufd, sk);
1011                 }
1012                 break;
1013
1014         case SO_ATTACH_REUSEPORT_CBPF:
1015                 ret = -EINVAL;
1016                 if (optlen == sizeof(struct sock_fprog)) {
1017                         struct sock_fprog fprog;
1018
1019                         ret = -EFAULT;
1020                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
1021                                 break;
1022
1023                         ret = sk_reuseport_attach_filter(&fprog, sk);
1024                 }
1025                 break;
1026
1027         case SO_ATTACH_REUSEPORT_EBPF:
1028                 ret = -EINVAL;
1029                 if (optlen == sizeof(u32)) {
1030                         u32 ufd;
1031
1032                         ret = -EFAULT;
1033                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
1034                                 break;
1035
1036                         ret = sk_reuseport_attach_bpf(ufd, sk);
1037                 }
1038                 break;
1039
1040         case SO_DETACH_REUSEPORT_BPF:
1041                 ret = reuseport_detach_prog(sk);
1042                 break;
1043
1044         case SO_DETACH_FILTER:
1045                 ret = sk_detach_filter(sk);
1046                 break;
1047
1048         case SO_LOCK_FILTER:
1049                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1050                         ret = -EPERM;
1051                 else
1052                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1053                 break;
1054
1055         case SO_PASSSEC:
1056                 if (valbool)
1057                         set_bit(SOCK_PASSSEC, &sock->flags);
1058                 else
1059                         clear_bit(SOCK_PASSSEC, &sock->flags);
1060                 break;
1061         case SO_MARK:
1062                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1063                         ret = -EPERM;
1064                 } else if (val != sk->sk_mark) {
1065                         sk->sk_mark = val;
1066                         sk_dst_reset(sk);
1067                 }
1068                 break;
1069
1070         case SO_RXQ_OVFL:
1071                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1072                 break;
1073
1074         case SO_WIFI_STATUS:
1075                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1076                 break;
1077
1078         case SO_PEEK_OFF:
1079                 if (sock->ops->set_peek_off)
1080                         ret = sock->ops->set_peek_off(sk, val);
1081                 else
1082                         ret = -EOPNOTSUPP;
1083                 break;
1084
1085         case SO_NOFCS:
1086                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1087                 break;
1088
1089         case SO_SELECT_ERR_QUEUE:
1090                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1091                 break;
1092
1093 #ifdef CONFIG_NET_RX_BUSY_POLL
1094         case SO_BUSY_POLL:
1095                 /* allow unprivileged users to decrease the value */
1096                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1097                         ret = -EPERM;
1098                 else {
1099                         if (val < 0)
1100                                 ret = -EINVAL;
1101                         else
1102                                 WRITE_ONCE(sk->sk_ll_usec, val);
1103                 }
1104                 break;
1105 #endif
1106
1107         case SO_MAX_PACING_RATE:
1108                 {
1109                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1110
1111                 if (sizeof(ulval) != sizeof(val) &&
1112                     optlen >= sizeof(ulval) &&
1113                     get_user(ulval, (unsigned long __user *)optval)) {
1114                         ret = -EFAULT;
1115                         break;
1116                 }
1117                 if (ulval != ~0UL)
1118                         cmpxchg(&sk->sk_pacing_status,
1119                                 SK_PACING_NONE,
1120                                 SK_PACING_NEEDED);
1121                 /* Pairs with READ_ONCE() from sk_getsockopt() */
1122                 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1123                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1124                 break;
1125                 }
1126         case SO_INCOMING_CPU:
1127                 WRITE_ONCE(sk->sk_incoming_cpu, val);
1128                 break;
1129
1130         case SO_CNX_ADVICE:
1131                 if (val == 1)
1132                         dst_negative_advice(sk);
1133                 break;
1134
1135         case SO_ZEROCOPY:
1136                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1137                         if (!((sk->sk_type == SOCK_STREAM &&
1138                                sk->sk_protocol == IPPROTO_TCP) ||
1139                               (sk->sk_type == SOCK_DGRAM &&
1140                                sk->sk_protocol == IPPROTO_UDP)))
1141                                 ret = -ENOTSUPP;
1142                 } else if (sk->sk_family != PF_RDS) {
1143                         ret = -ENOTSUPP;
1144                 }
1145                 if (!ret) {
1146                         if (val < 0 || val > 1)
1147                                 ret = -EINVAL;
1148                         else
1149                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1150                 }
1151                 break;
1152
1153         case SO_TXTIME:
1154                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1155                         ret = -EPERM;
1156                 } else if (optlen != sizeof(struct sock_txtime)) {
1157                         ret = -EINVAL;
1158                 } else if (copy_from_user(&sk_txtime, optval,
1159                            sizeof(struct sock_txtime))) {
1160                         ret = -EFAULT;
1161                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1162                         ret = -EINVAL;
1163                 } else {
1164                         sock_valbool_flag(sk, SOCK_TXTIME, true);
1165                         sk->sk_clockid = sk_txtime.clockid;
1166                         sk->sk_txtime_deadline_mode =
1167                                 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1168                         sk->sk_txtime_report_errors =
1169                                 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1170                 }
1171                 break;
1172
1173         case SO_BINDTOIFINDEX:
1174                 ret = sock_setbindtodevice_locked(sk, val);
1175                 break;
1176
1177         default:
1178                 ret = -ENOPROTOOPT;
1179                 break;
1180         }
1181         release_sock(sk);
1182         return ret;
1183 }
1184 EXPORT_SYMBOL(sock_setsockopt);
1185
1186 static const struct cred *sk_get_peer_cred(struct sock *sk)
1187 {
1188         const struct cred *cred;
1189
1190         spin_lock(&sk->sk_peer_lock);
1191         cred = get_cred(sk->sk_peer_cred);
1192         spin_unlock(&sk->sk_peer_lock);
1193
1194         return cred;
1195 }
1196
1197 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1198                           struct ucred *ucred)
1199 {
1200         ucred->pid = pid_vnr(pid);
1201         ucred->uid = ucred->gid = -1;
1202         if (cred) {
1203                 struct user_namespace *current_ns = current_user_ns();
1204
1205                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1206                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1207         }
1208 }
1209
1210 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1211 {
1212         struct user_namespace *user_ns = current_user_ns();
1213         int i;
1214
1215         for (i = 0; i < src->ngroups; i++)
1216                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1217                         return -EFAULT;
1218
1219         return 0;
1220 }
1221
1222 int sock_getsockopt(struct socket *sock, int level, int optname,
1223                     char __user *optval, int __user *optlen)
1224 {
1225         struct sock *sk = sock->sk;
1226
1227         union {
1228                 int val;
1229                 u64 val64;
1230                 unsigned long ulval;
1231                 struct linger ling;
1232                 struct old_timeval32 tm32;
1233                 struct __kernel_old_timeval tm;
1234                 struct  __kernel_sock_timeval stm;
1235                 struct sock_txtime txtime;
1236         } v;
1237
1238         int lv = sizeof(int);
1239         int len;
1240
1241         if (get_user(len, optlen))
1242                 return -EFAULT;
1243         if (len < 0)
1244                 return -EINVAL;
1245
1246         memset(&v, 0, sizeof(v));
1247
1248         switch (optname) {
1249         case SO_DEBUG:
1250                 v.val = sock_flag(sk, SOCK_DBG);
1251                 break;
1252
1253         case SO_DONTROUTE:
1254                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1255                 break;
1256
1257         case SO_BROADCAST:
1258                 v.val = sock_flag(sk, SOCK_BROADCAST);
1259                 break;
1260
1261         case SO_SNDBUF:
1262                 v.val = READ_ONCE(sk->sk_sndbuf);
1263                 break;
1264
1265         case SO_RCVBUF:
1266                 v.val = READ_ONCE(sk->sk_rcvbuf);
1267                 break;
1268
1269         case SO_REUSEADDR:
1270                 v.val = sk->sk_reuse;
1271                 break;
1272
1273         case SO_REUSEPORT:
1274                 v.val = sk->sk_reuseport;
1275                 break;
1276
1277         case SO_KEEPALIVE:
1278                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1279                 break;
1280
1281         case SO_TYPE:
1282                 v.val = sk->sk_type;
1283                 break;
1284
1285         case SO_PROTOCOL:
1286                 v.val = sk->sk_protocol;
1287                 break;
1288
1289         case SO_DOMAIN:
1290                 v.val = sk->sk_family;
1291                 break;
1292
1293         case SO_ERROR:
1294                 v.val = -sock_error(sk);
1295                 if (v.val == 0)
1296                         v.val = xchg(&sk->sk_err_soft, 0);
1297                 break;
1298
1299         case SO_OOBINLINE:
1300                 v.val = sock_flag(sk, SOCK_URGINLINE);
1301                 break;
1302
1303         case SO_NO_CHECK:
1304                 v.val = sk->sk_no_check_tx;
1305                 break;
1306
1307         case SO_PRIORITY:
1308                 v.val = sk->sk_priority;
1309                 break;
1310
1311         case SO_LINGER:
1312                 lv              = sizeof(v.ling);
1313                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1314                 v.ling.l_linger = sk->sk_lingertime / HZ;
1315                 break;
1316
1317         case SO_BSDCOMPAT:
1318                 sock_warn_obsolete_bsdism("getsockopt");
1319                 break;
1320
1321         case SO_TIMESTAMP_OLD:
1322                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1323                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1324                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1325                 break;
1326
1327         case SO_TIMESTAMPNS_OLD:
1328                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1329                 break;
1330
1331         case SO_TIMESTAMP_NEW:
1332                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1333                 break;
1334
1335         case SO_TIMESTAMPNS_NEW:
1336                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1337                 break;
1338
1339         case SO_TIMESTAMPING_OLD:
1340                 v.val = sk->sk_tsflags;
1341                 break;
1342
1343         case SO_RCVTIMEO_OLD:
1344         case SO_RCVTIMEO_NEW:
1345                 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1346                 break;
1347
1348         case SO_SNDTIMEO_OLD:
1349         case SO_SNDTIMEO_NEW:
1350                 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1351                 break;
1352
1353         case SO_RCVLOWAT:
1354                 v.val = READ_ONCE(sk->sk_rcvlowat);
1355                 break;
1356
1357         case SO_SNDLOWAT:
1358                 v.val = 1;
1359                 break;
1360
1361         case SO_PASSCRED:
1362                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1363                 break;
1364
1365         case SO_PEERCRED:
1366         {
1367                 struct ucred peercred;
1368                 if (len > sizeof(peercred))
1369                         len = sizeof(peercred);
1370
1371                 spin_lock(&sk->sk_peer_lock);
1372                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1373                 spin_unlock(&sk->sk_peer_lock);
1374
1375                 if (copy_to_user(optval, &peercred, len))
1376                         return -EFAULT;
1377                 goto lenout;
1378         }
1379
1380         case SO_PEERGROUPS:
1381         {
1382                 const struct cred *cred;
1383                 int ret, n;
1384
1385                 cred = sk_get_peer_cred(sk);
1386                 if (!cred)
1387                         return -ENODATA;
1388
1389                 n = cred->group_info->ngroups;
1390                 if (len < n * sizeof(gid_t)) {
1391                         len = n * sizeof(gid_t);
1392                         put_cred(cred);
1393                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1394                 }
1395                 len = n * sizeof(gid_t);
1396
1397                 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1398                 put_cred(cred);
1399                 if (ret)
1400                         return ret;
1401                 goto lenout;
1402         }
1403
1404         case SO_PEERNAME:
1405         {
1406                 char address[128];
1407
1408                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1409                 if (lv < 0)
1410                         return -ENOTCONN;
1411                 if (lv < len)
1412                         return -EINVAL;
1413                 if (copy_to_user(optval, address, len))
1414                         return -EFAULT;
1415                 goto lenout;
1416         }
1417
1418         /* Dubious BSD thing... Probably nobody even uses it, but
1419          * the UNIX standard wants it for whatever reason... -DaveM
1420          */
1421         case SO_ACCEPTCONN:
1422                 v.val = sk->sk_state == TCP_LISTEN;
1423                 break;
1424
1425         case SO_PASSSEC:
1426                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1427                 break;
1428
1429         case SO_PEERSEC:
1430                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1431
1432         case SO_MARK:
1433                 v.val = sk->sk_mark;
1434                 break;
1435
1436         case SO_RXQ_OVFL:
1437                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1438                 break;
1439
1440         case SO_WIFI_STATUS:
1441                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1442                 break;
1443
1444         case SO_PEEK_OFF:
1445                 if (!sock->ops->set_peek_off)
1446                         return -EOPNOTSUPP;
1447
1448                 v.val = READ_ONCE(sk->sk_peek_off);
1449                 break;
1450         case SO_NOFCS:
1451                 v.val = sock_flag(sk, SOCK_NOFCS);
1452                 break;
1453
1454         case SO_BINDTODEVICE:
1455                 return sock_getbindtodevice(sk, optval, optlen, len);
1456
1457         case SO_GET_FILTER:
1458                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1459                 if (len < 0)
1460                         return len;
1461
1462                 goto lenout;
1463
1464         case SO_LOCK_FILTER:
1465                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1466                 break;
1467
1468         case SO_BPF_EXTENSIONS:
1469                 v.val = bpf_tell_extensions();
1470                 break;
1471
1472         case SO_SELECT_ERR_QUEUE:
1473                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1474                 break;
1475
1476 #ifdef CONFIG_NET_RX_BUSY_POLL
1477         case SO_BUSY_POLL:
1478                 v.val = READ_ONCE(sk->sk_ll_usec);
1479                 break;
1480 #endif
1481
1482         case SO_MAX_PACING_RATE:
1483                 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1484                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1485                         lv = sizeof(v.ulval);
1486                         v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1487                 } else {
1488                         /* 32bit version */
1489                         v.val = min_t(unsigned long, ~0U,
1490                                       READ_ONCE(sk->sk_max_pacing_rate));
1491                 }
1492                 break;
1493
1494         case SO_INCOMING_CPU:
1495                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1496                 break;
1497
1498         case SO_MEMINFO:
1499         {
1500                 u32 meminfo[SK_MEMINFO_VARS];
1501
1502                 sk_get_meminfo(sk, meminfo);
1503
1504                 len = min_t(unsigned int, len, sizeof(meminfo));
1505                 if (copy_to_user(optval, &meminfo, len))
1506                         return -EFAULT;
1507
1508                 goto lenout;
1509         }
1510
1511 #ifdef CONFIG_NET_RX_BUSY_POLL
1512         case SO_INCOMING_NAPI_ID:
1513                 v.val = READ_ONCE(sk->sk_napi_id);
1514
1515                 /* aggregate non-NAPI IDs down to 0 */
1516                 if (v.val < MIN_NAPI_ID)
1517                         v.val = 0;
1518
1519                 break;
1520 #endif
1521
1522         case SO_COOKIE:
1523                 lv = sizeof(u64);
1524                 if (len < lv)
1525                         return -EINVAL;
1526                 v.val64 = sock_gen_cookie(sk);
1527                 break;
1528
1529         case SO_ZEROCOPY:
1530                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1531                 break;
1532
1533         case SO_TXTIME:
1534                 lv = sizeof(v.txtime);
1535                 v.txtime.clockid = sk->sk_clockid;
1536                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1537                                   SOF_TXTIME_DEADLINE_MODE : 0;
1538                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1539                                   SOF_TXTIME_REPORT_ERRORS : 0;
1540                 break;
1541
1542         case SO_BINDTOIFINDEX:
1543                 v.val = sk->sk_bound_dev_if;
1544                 break;
1545
1546         default:
1547                 /* We implement the SO_SNDLOWAT etc to not be settable
1548                  * (1003.1g 7).
1549                  */
1550                 return -ENOPROTOOPT;
1551         }
1552
1553         if (len > lv)
1554                 len = lv;
1555         if (copy_to_user(optval, &v, len))
1556                 return -EFAULT;
1557 lenout:
1558         if (put_user(len, optlen))
1559                 return -EFAULT;
1560         return 0;
1561 }
1562
1563 /*
1564  * Initialize an sk_lock.
1565  *
1566  * (We also register the sk_lock with the lock validator.)
1567  */
1568 static inline void sock_lock_init(struct sock *sk)
1569 {
1570         if (sk->sk_kern_sock)
1571                 sock_lock_init_class_and_name(
1572                         sk,
1573                         af_family_kern_slock_key_strings[sk->sk_family],
1574                         af_family_kern_slock_keys + sk->sk_family,
1575                         af_family_kern_key_strings[sk->sk_family],
1576                         af_family_kern_keys + sk->sk_family);
1577         else
1578                 sock_lock_init_class_and_name(
1579                         sk,
1580                         af_family_slock_key_strings[sk->sk_family],
1581                         af_family_slock_keys + sk->sk_family,
1582                         af_family_key_strings[sk->sk_family],
1583                         af_family_keys + sk->sk_family);
1584 }
1585
1586 /*
1587  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1588  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1589  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1590  */
1591 static void sock_copy(struct sock *nsk, const struct sock *osk)
1592 {
1593 #ifdef CONFIG_SECURITY_NETWORK
1594         void *sptr = nsk->sk_security;
1595 #endif
1596         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1597
1598         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1599                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1600
1601 #ifdef CONFIG_SECURITY_NETWORK
1602         nsk->sk_security = sptr;
1603         security_sk_clone(osk, nsk);
1604 #endif
1605 }
1606
1607 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1608                 int family)
1609 {
1610         struct sock *sk;
1611         struct kmem_cache *slab;
1612
1613         slab = prot->slab;
1614         if (slab != NULL) {
1615                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1616                 if (!sk)
1617                         return sk;
1618                 if (want_init_on_alloc(priority))
1619                         sk_prot_clear_nulls(sk, prot->obj_size);
1620         } else
1621                 sk = kmalloc(prot->obj_size, priority);
1622
1623         if (sk != NULL) {
1624                 if (security_sk_alloc(sk, family, priority))
1625                         goto out_free;
1626
1627                 if (!try_module_get(prot->owner))
1628                         goto out_free_sec;
1629                 sk_tx_queue_clear(sk);
1630         }
1631
1632         return sk;
1633
1634 out_free_sec:
1635         security_sk_free(sk);
1636 out_free:
1637         if (slab != NULL)
1638                 kmem_cache_free(slab, sk);
1639         else
1640                 kfree(sk);
1641         return NULL;
1642 }
1643
1644 static void sk_prot_free(struct proto *prot, struct sock *sk)
1645 {
1646         struct kmem_cache *slab;
1647         struct module *owner;
1648
1649         owner = prot->owner;
1650         slab = prot->slab;
1651
1652         cgroup_sk_free(&sk->sk_cgrp_data);
1653         mem_cgroup_sk_free(sk);
1654         security_sk_free(sk);
1655         if (slab != NULL)
1656                 kmem_cache_free(slab, sk);
1657         else
1658                 kfree(sk);
1659         module_put(owner);
1660 }
1661
1662 /**
1663  *      sk_alloc - All socket objects are allocated here
1664  *      @net: the applicable net namespace
1665  *      @family: protocol family
1666  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1667  *      @prot: struct proto associated with this new sock instance
1668  *      @kern: is this to be a kernel socket?
1669  */
1670 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1671                       struct proto *prot, int kern)
1672 {
1673         struct sock *sk;
1674
1675         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1676         if (sk) {
1677                 sk->sk_family = family;
1678                 /*
1679                  * See comment in struct sock definition to understand
1680                  * why we need sk_prot_creator -acme
1681                  */
1682                 sk->sk_prot = sk->sk_prot_creator = prot;
1683                 sk->sk_kern_sock = kern;
1684                 sock_lock_init(sk);
1685                 sk->sk_net_refcnt = kern ? 0 : 1;
1686                 if (likely(sk->sk_net_refcnt)) {
1687                         get_net(net);
1688                         sock_inuse_add(net, 1);
1689                 }
1690
1691                 sock_net_set(sk, net);
1692                 refcount_set(&sk->sk_wmem_alloc, 1);
1693
1694                 mem_cgroup_sk_alloc(sk);
1695                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1696                 sock_update_classid(&sk->sk_cgrp_data);
1697                 sock_update_netprioidx(&sk->sk_cgrp_data);
1698                 sk_tx_queue_clear(sk);
1699         }
1700
1701         return sk;
1702 }
1703 EXPORT_SYMBOL(sk_alloc);
1704
1705 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1706  * grace period. This is the case for UDP sockets and TCP listeners.
1707  */
1708 static void __sk_destruct(struct rcu_head *head)
1709 {
1710         struct sock *sk = container_of(head, struct sock, sk_rcu);
1711         struct sk_filter *filter;
1712
1713         if (sk->sk_destruct)
1714                 sk->sk_destruct(sk);
1715
1716         filter = rcu_dereference_check(sk->sk_filter,
1717                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1718         if (filter) {
1719                 sk_filter_uncharge(sk, filter);
1720                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1721         }
1722
1723         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1724
1725 #ifdef CONFIG_BPF_SYSCALL
1726         bpf_sk_storage_free(sk);
1727 #endif
1728
1729         if (atomic_read(&sk->sk_omem_alloc))
1730                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1731                          __func__, atomic_read(&sk->sk_omem_alloc));
1732
1733         if (sk->sk_frag.page) {
1734                 put_page(sk->sk_frag.page);
1735                 sk->sk_frag.page = NULL;
1736         }
1737
1738         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1739         put_cred(sk->sk_peer_cred);
1740         put_pid(sk->sk_peer_pid);
1741
1742         if (likely(sk->sk_net_refcnt))
1743                 put_net(sock_net(sk));
1744         sk_prot_free(sk->sk_prot_creator, sk);
1745 }
1746
1747 void sk_destruct(struct sock *sk)
1748 {
1749         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1750
1751         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1752                 reuseport_detach_sock(sk);
1753                 use_call_rcu = true;
1754         }
1755
1756         if (use_call_rcu)
1757                 call_rcu(&sk->sk_rcu, __sk_destruct);
1758         else
1759                 __sk_destruct(&sk->sk_rcu);
1760 }
1761
1762 static void __sk_free(struct sock *sk)
1763 {
1764         if (likely(sk->sk_net_refcnt))
1765                 sock_inuse_add(sock_net(sk), -1);
1766
1767         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1768                 sock_diag_broadcast_destroy(sk);
1769         else
1770                 sk_destruct(sk);
1771 }
1772
1773 void sk_free(struct sock *sk)
1774 {
1775         /*
1776          * We subtract one from sk_wmem_alloc and can know if
1777          * some packets are still in some tx queue.
1778          * If not null, sock_wfree() will call __sk_free(sk) later
1779          */
1780         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1781                 __sk_free(sk);
1782 }
1783 EXPORT_SYMBOL(sk_free);
1784
1785 static void sk_init_common(struct sock *sk)
1786 {
1787         skb_queue_head_init(&sk->sk_receive_queue);
1788         skb_queue_head_init(&sk->sk_write_queue);
1789         skb_queue_head_init(&sk->sk_error_queue);
1790
1791         rwlock_init(&sk->sk_callback_lock);
1792         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1793                         af_rlock_keys + sk->sk_family,
1794                         af_family_rlock_key_strings[sk->sk_family]);
1795         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1796                         af_wlock_keys + sk->sk_family,
1797                         af_family_wlock_key_strings[sk->sk_family]);
1798         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1799                         af_elock_keys + sk->sk_family,
1800                         af_family_elock_key_strings[sk->sk_family]);
1801         lockdep_set_class_and_name(&sk->sk_callback_lock,
1802                         af_callback_keys + sk->sk_family,
1803                         af_family_clock_key_strings[sk->sk_family]);
1804 }
1805
1806 /**
1807  *      sk_clone_lock - clone a socket, and lock its clone
1808  *      @sk: the socket to clone
1809  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1810  *
1811  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1812  */
1813 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1814 {
1815         struct sock *newsk;
1816         bool is_charged = true;
1817
1818         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1819         if (newsk != NULL) {
1820                 struct sk_filter *filter;
1821
1822                 sock_copy(newsk, sk);
1823
1824                 newsk->sk_prot_creator = sk->sk_prot;
1825
1826                 /* SANITY */
1827                 if (likely(newsk->sk_net_refcnt))
1828                         get_net(sock_net(newsk));
1829                 sk_node_init(&newsk->sk_node);
1830                 sock_lock_init(newsk);
1831                 bh_lock_sock(newsk);
1832                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1833                 newsk->sk_backlog.len = 0;
1834
1835                 atomic_set(&newsk->sk_rmem_alloc, 0);
1836                 /*
1837                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1838                  */
1839                 refcount_set(&newsk->sk_wmem_alloc, 1);
1840                 atomic_set(&newsk->sk_omem_alloc, 0);
1841                 sk_init_common(newsk);
1842
1843                 newsk->sk_dst_cache     = NULL;
1844                 newsk->sk_dst_pending_confirm = 0;
1845                 newsk->sk_wmem_queued   = 0;
1846                 newsk->sk_forward_alloc = 0;
1847                 atomic_set(&newsk->sk_drops, 0);
1848                 newsk->sk_send_head     = NULL;
1849                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1850                 atomic_set(&newsk->sk_zckey, 0);
1851
1852                 sock_reset_flag(newsk, SOCK_DONE);
1853
1854                 /* sk->sk_memcg will be populated at accept() time */
1855                 newsk->sk_memcg = NULL;
1856
1857                 cgroup_sk_clone(&newsk->sk_cgrp_data);
1858
1859                 rcu_read_lock();
1860                 filter = rcu_dereference(sk->sk_filter);
1861                 if (filter != NULL)
1862                         /* though it's an empty new sock, the charging may fail
1863                          * if sysctl_optmem_max was changed between creation of
1864                          * original socket and cloning
1865                          */
1866                         is_charged = sk_filter_charge(newsk, filter);
1867                 RCU_INIT_POINTER(newsk->sk_filter, filter);
1868                 rcu_read_unlock();
1869
1870                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1871                         /* We need to make sure that we don't uncharge the new
1872                          * socket if we couldn't charge it in the first place
1873                          * as otherwise we uncharge the parent's filter.
1874                          */
1875                         if (!is_charged)
1876                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1877                         sk_free_unlock_clone(newsk);
1878                         newsk = NULL;
1879                         goto out;
1880                 }
1881                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1882
1883                 if (bpf_sk_storage_clone(sk, newsk)) {
1884                         sk_free_unlock_clone(newsk);
1885                         newsk = NULL;
1886                         goto out;
1887                 }
1888
1889                 newsk->sk_err      = 0;
1890                 newsk->sk_err_soft = 0;
1891                 newsk->sk_priority = 0;
1892                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1893                 if (likely(newsk->sk_net_refcnt))
1894                         sock_inuse_add(sock_net(newsk), 1);
1895
1896                 /*
1897                  * Before updating sk_refcnt, we must commit prior changes to memory
1898                  * (Documentation/RCU/rculist_nulls.txt for details)
1899                  */
1900                 smp_wmb();
1901                 refcount_set(&newsk->sk_refcnt, 2);
1902
1903                 /*
1904                  * Increment the counter in the same struct proto as the master
1905                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1906                  * is the same as sk->sk_prot->socks, as this field was copied
1907                  * with memcpy).
1908                  *
1909                  * This _changes_ the previous behaviour, where
1910                  * tcp_create_openreq_child always was incrementing the
1911                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1912                  * to be taken into account in all callers. -acme
1913                  */
1914                 sk_refcnt_debug_inc(newsk);
1915                 sk_set_socket(newsk, NULL);
1916                 sk_tx_queue_clear(newsk);
1917                 RCU_INIT_POINTER(newsk->sk_wq, NULL);
1918
1919                 if (newsk->sk_prot->sockets_allocated)
1920                         sk_sockets_allocated_inc(newsk);
1921
1922                 if (sock_needs_netstamp(sk) &&
1923                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1924                         net_enable_timestamp();
1925         }
1926 out:
1927         return newsk;
1928 }
1929 EXPORT_SYMBOL_GPL(sk_clone_lock);
1930
1931 void sk_free_unlock_clone(struct sock *sk)
1932 {
1933         /* It is still raw copy of parent, so invalidate
1934          * destructor and make plain sk_free() */
1935         sk->sk_destruct = NULL;
1936         bh_unlock_sock(sk);
1937         sk_free(sk);
1938 }
1939 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1940
1941 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1942 {
1943         u32 max_segs = 1;
1944
1945         sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1946         if (sk->sk_route_caps & NETIF_F_GSO)
1947                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1948         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1949         if (sk_can_gso(sk)) {
1950                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1951                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1952                 } else {
1953                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1954                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1955                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1956                 }
1957         }
1958         sk->sk_gso_max_segs = max_segs;
1959         sk_dst_set(sk, dst);
1960 }
1961 EXPORT_SYMBOL_GPL(sk_setup_caps);
1962
1963 /*
1964  *      Simple resource managers for sockets.
1965  */
1966
1967
1968 /*
1969  * Write buffer destructor automatically called from kfree_skb.
1970  */
1971 void sock_wfree(struct sk_buff *skb)
1972 {
1973         struct sock *sk = skb->sk;
1974         unsigned int len = skb->truesize;
1975
1976         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1977                 /*
1978                  * Keep a reference on sk_wmem_alloc, this will be released
1979                  * after sk_write_space() call
1980                  */
1981                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1982                 sk->sk_write_space(sk);
1983                 len = 1;
1984         }
1985         /*
1986          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1987          * could not do because of in-flight packets
1988          */
1989         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1990                 __sk_free(sk);
1991 }
1992 EXPORT_SYMBOL(sock_wfree);
1993
1994 /* This variant of sock_wfree() is used by TCP,
1995  * since it sets SOCK_USE_WRITE_QUEUE.
1996  */
1997 void __sock_wfree(struct sk_buff *skb)
1998 {
1999         struct sock *sk = skb->sk;
2000
2001         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2002                 __sk_free(sk);
2003 }
2004
2005 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2006 {
2007         skb_orphan(skb);
2008         skb->sk = sk;
2009 #ifdef CONFIG_INET
2010         if (unlikely(!sk_fullsock(sk))) {
2011                 skb->destructor = sock_edemux;
2012                 sock_hold(sk);
2013                 return;
2014         }
2015 #endif
2016         skb->destructor = sock_wfree;
2017         skb_set_hash_from_sk(skb, sk);
2018         /*
2019          * We used to take a refcount on sk, but following operation
2020          * is enough to guarantee sk_free() wont free this sock until
2021          * all in-flight packets are completed
2022          */
2023         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2024 }
2025 EXPORT_SYMBOL(skb_set_owner_w);
2026
2027 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2028 {
2029 #ifdef CONFIG_TLS_DEVICE
2030         /* Drivers depend on in-order delivery for crypto offload,
2031          * partial orphan breaks out-of-order-OK logic.
2032          */
2033         if (skb->decrypted)
2034                 return false;
2035 #endif
2036         return (skb->destructor == sock_wfree ||
2037                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2038 }
2039
2040 /* This helper is used by netem, as it can hold packets in its
2041  * delay queue. We want to allow the owner socket to send more
2042  * packets, as if they were already TX completed by a typical driver.
2043  * But we also want to keep skb->sk set because some packet schedulers
2044  * rely on it (sch_fq for example).
2045  */
2046 void skb_orphan_partial(struct sk_buff *skb)
2047 {
2048         if (skb_is_tcp_pure_ack(skb))
2049                 return;
2050
2051         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2052                 return;
2053
2054         skb_orphan(skb);
2055 }
2056 EXPORT_SYMBOL(skb_orphan_partial);
2057
2058 /*
2059  * Read buffer destructor automatically called from kfree_skb.
2060  */
2061 void sock_rfree(struct sk_buff *skb)
2062 {
2063         struct sock *sk = skb->sk;
2064         unsigned int len = skb->truesize;
2065
2066         atomic_sub(len, &sk->sk_rmem_alloc);
2067         sk_mem_uncharge(sk, len);
2068 }
2069 EXPORT_SYMBOL(sock_rfree);
2070
2071 /*
2072  * Buffer destructor for skbs that are not used directly in read or write
2073  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2074  */
2075 void sock_efree(struct sk_buff *skb)
2076 {
2077         sock_put(skb->sk);
2078 }
2079 EXPORT_SYMBOL(sock_efree);
2080
2081 kuid_t sock_i_uid(struct sock *sk)
2082 {
2083         kuid_t uid;
2084
2085         read_lock_bh(&sk->sk_callback_lock);
2086         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2087         read_unlock_bh(&sk->sk_callback_lock);
2088         return uid;
2089 }
2090 EXPORT_SYMBOL(sock_i_uid);
2091
2092 unsigned long __sock_i_ino(struct sock *sk)
2093 {
2094         unsigned long ino;
2095
2096         read_lock(&sk->sk_callback_lock);
2097         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2098         read_unlock(&sk->sk_callback_lock);
2099         return ino;
2100 }
2101 EXPORT_SYMBOL(__sock_i_ino);
2102
2103 unsigned long sock_i_ino(struct sock *sk)
2104 {
2105         unsigned long ino;
2106
2107         local_bh_disable();
2108         ino = __sock_i_ino(sk);
2109         local_bh_enable();
2110         return ino;
2111 }
2112 EXPORT_SYMBOL(sock_i_ino);
2113
2114 /*
2115  * Allocate a skb from the socket's send buffer.
2116  */
2117 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2118                              gfp_t priority)
2119 {
2120         if (force ||
2121             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2122                 struct sk_buff *skb = alloc_skb(size, priority);
2123
2124                 if (skb) {
2125                         skb_set_owner_w(skb, sk);
2126                         return skb;
2127                 }
2128         }
2129         return NULL;
2130 }
2131 EXPORT_SYMBOL(sock_wmalloc);
2132
2133 static void sock_ofree(struct sk_buff *skb)
2134 {
2135         struct sock *sk = skb->sk;
2136
2137         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2138 }
2139
2140 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2141                              gfp_t priority)
2142 {
2143         struct sk_buff *skb;
2144
2145         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2146         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2147             sysctl_optmem_max)
2148                 return NULL;
2149
2150         skb = alloc_skb(size, priority);
2151         if (!skb)
2152                 return NULL;
2153
2154         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2155         skb->sk = sk;
2156         skb->destructor = sock_ofree;
2157         return skb;
2158 }
2159
2160 /*
2161  * Allocate a memory block from the socket's option memory buffer.
2162  */
2163 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2164 {
2165         if ((unsigned int)size <= sysctl_optmem_max &&
2166             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2167                 void *mem;
2168                 /* First do the add, to avoid the race if kmalloc
2169                  * might sleep.
2170                  */
2171                 atomic_add(size, &sk->sk_omem_alloc);
2172                 mem = kmalloc(size, priority);
2173                 if (mem)
2174                         return mem;
2175                 atomic_sub(size, &sk->sk_omem_alloc);
2176         }
2177         return NULL;
2178 }
2179 EXPORT_SYMBOL(sock_kmalloc);
2180
2181 /* Free an option memory block. Note, we actually want the inline
2182  * here as this allows gcc to detect the nullify and fold away the
2183  * condition entirely.
2184  */
2185 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2186                                   const bool nullify)
2187 {
2188         if (WARN_ON_ONCE(!mem))
2189                 return;
2190         if (nullify)
2191                 kzfree(mem);
2192         else
2193                 kfree(mem);
2194         atomic_sub(size, &sk->sk_omem_alloc);
2195 }
2196
2197 void sock_kfree_s(struct sock *sk, void *mem, int size)
2198 {
2199         __sock_kfree_s(sk, mem, size, false);
2200 }
2201 EXPORT_SYMBOL(sock_kfree_s);
2202
2203 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2204 {
2205         __sock_kfree_s(sk, mem, size, true);
2206 }
2207 EXPORT_SYMBOL(sock_kzfree_s);
2208
2209 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2210    I think, these locks should be removed for datagram sockets.
2211  */
2212 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2213 {
2214         DEFINE_WAIT(wait);
2215
2216         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2217         for (;;) {
2218                 if (!timeo)
2219                         break;
2220                 if (signal_pending(current))
2221                         break;
2222                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2223                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2224                 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2225                         break;
2226                 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2227                         break;
2228                 if (READ_ONCE(sk->sk_err))
2229                         break;
2230                 timeo = schedule_timeout(timeo);
2231         }
2232         finish_wait(sk_sleep(sk), &wait);
2233         return timeo;
2234 }
2235
2236
2237 /*
2238  *      Generic send/receive buffer handlers
2239  */
2240
2241 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2242                                      unsigned long data_len, int noblock,
2243                                      int *errcode, int max_page_order)
2244 {
2245         struct sk_buff *skb;
2246         long timeo;
2247         int err;
2248
2249         timeo = sock_sndtimeo(sk, noblock);
2250         for (;;) {
2251                 err = sock_error(sk);
2252                 if (err != 0)
2253                         goto failure;
2254
2255                 err = -EPIPE;
2256                 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2257                         goto failure;
2258
2259                 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2260                         break;
2261
2262                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2263                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2264                 err = -EAGAIN;
2265                 if (!timeo)
2266                         goto failure;
2267                 if (signal_pending(current))
2268                         goto interrupted;
2269                 timeo = sock_wait_for_wmem(sk, timeo);
2270         }
2271         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2272                                    errcode, sk->sk_allocation);
2273         if (skb)
2274                 skb_set_owner_w(skb, sk);
2275         return skb;
2276
2277 interrupted:
2278         err = sock_intr_errno(timeo);
2279 failure:
2280         *errcode = err;
2281         return NULL;
2282 }
2283 EXPORT_SYMBOL(sock_alloc_send_pskb);
2284
2285 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2286                                     int noblock, int *errcode)
2287 {
2288         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2289 }
2290 EXPORT_SYMBOL(sock_alloc_send_skb);
2291
2292 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2293                      struct sockcm_cookie *sockc)
2294 {
2295         u32 tsflags;
2296
2297         switch (cmsg->cmsg_type) {
2298         case SO_MARK:
2299                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2300                         return -EPERM;
2301                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2302                         return -EINVAL;
2303                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2304                 break;
2305         case SO_TIMESTAMPING_OLD:
2306                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2307                         return -EINVAL;
2308
2309                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2310                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2311                         return -EINVAL;
2312
2313                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2314                 sockc->tsflags |= tsflags;
2315                 break;
2316         case SCM_TXTIME:
2317                 if (!sock_flag(sk, SOCK_TXTIME))
2318                         return -EINVAL;
2319                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2320                         return -EINVAL;
2321                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2322                 break;
2323         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2324         case SCM_RIGHTS:
2325         case SCM_CREDENTIALS:
2326                 break;
2327         default:
2328                 return -EINVAL;
2329         }
2330         return 0;
2331 }
2332 EXPORT_SYMBOL(__sock_cmsg_send);
2333
2334 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2335                    struct sockcm_cookie *sockc)
2336 {
2337         struct cmsghdr *cmsg;
2338         int ret;
2339
2340         for_each_cmsghdr(cmsg, msg) {
2341                 if (!CMSG_OK(msg, cmsg))
2342                         return -EINVAL;
2343                 if (cmsg->cmsg_level != SOL_SOCKET)
2344                         continue;
2345                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2346                 if (ret)
2347                         return ret;
2348         }
2349         return 0;
2350 }
2351 EXPORT_SYMBOL(sock_cmsg_send);
2352
2353 static void sk_enter_memory_pressure(struct sock *sk)
2354 {
2355         if (!sk->sk_prot->enter_memory_pressure)
2356                 return;
2357
2358         sk->sk_prot->enter_memory_pressure(sk);
2359 }
2360
2361 static void sk_leave_memory_pressure(struct sock *sk)
2362 {
2363         if (sk->sk_prot->leave_memory_pressure) {
2364                 sk->sk_prot->leave_memory_pressure(sk);
2365         } else {
2366                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2367
2368                 if (memory_pressure && READ_ONCE(*memory_pressure))
2369                         WRITE_ONCE(*memory_pressure, 0);
2370         }
2371 }
2372
2373 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2374
2375 /**
2376  * skb_page_frag_refill - check that a page_frag contains enough room
2377  * @sz: minimum size of the fragment we want to get
2378  * @pfrag: pointer to page_frag
2379  * @gfp: priority for memory allocation
2380  *
2381  * Note: While this allocator tries to use high order pages, there is
2382  * no guarantee that allocations succeed. Therefore, @sz MUST be
2383  * less or equal than PAGE_SIZE.
2384  */
2385 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2386 {
2387         if (pfrag->page) {
2388                 if (page_ref_count(pfrag->page) == 1) {
2389                         pfrag->offset = 0;
2390                         return true;
2391                 }
2392                 if (pfrag->offset + sz <= pfrag->size)
2393                         return true;
2394                 put_page(pfrag->page);
2395         }
2396
2397         pfrag->offset = 0;
2398         if (SKB_FRAG_PAGE_ORDER &&
2399             !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2400                 /* Avoid direct reclaim but allow kswapd to wake */
2401                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2402                                           __GFP_COMP | __GFP_NOWARN |
2403                                           __GFP_NORETRY,
2404                                           SKB_FRAG_PAGE_ORDER);
2405                 if (likely(pfrag->page)) {
2406                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2407                         return true;
2408                 }
2409         }
2410         pfrag->page = alloc_page(gfp);
2411         if (likely(pfrag->page)) {
2412                 pfrag->size = PAGE_SIZE;
2413                 return true;
2414         }
2415         return false;
2416 }
2417 EXPORT_SYMBOL(skb_page_frag_refill);
2418
2419 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2420 {
2421         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2422                 return true;
2423
2424         sk_enter_memory_pressure(sk);
2425         sk_stream_moderate_sndbuf(sk);
2426         return false;
2427 }
2428 EXPORT_SYMBOL(sk_page_frag_refill);
2429
2430 static void __lock_sock(struct sock *sk)
2431         __releases(&sk->sk_lock.slock)
2432         __acquires(&sk->sk_lock.slock)
2433 {
2434         DEFINE_WAIT(wait);
2435
2436         for (;;) {
2437                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2438                                         TASK_UNINTERRUPTIBLE);
2439                 spin_unlock_bh(&sk->sk_lock.slock);
2440                 schedule();
2441                 spin_lock_bh(&sk->sk_lock.slock);
2442                 if (!sock_owned_by_user(sk))
2443                         break;
2444         }
2445         finish_wait(&sk->sk_lock.wq, &wait);
2446 }
2447
2448 void __release_sock(struct sock *sk)
2449         __releases(&sk->sk_lock.slock)
2450         __acquires(&sk->sk_lock.slock)
2451 {
2452         struct sk_buff *skb, *next;
2453
2454         while ((skb = sk->sk_backlog.head) != NULL) {
2455                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2456
2457                 spin_unlock_bh(&sk->sk_lock.slock);
2458
2459                 do {
2460                         next = skb->next;
2461                         prefetch(next);
2462                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2463                         skb_mark_not_on_list(skb);
2464                         sk_backlog_rcv(sk, skb);
2465
2466                         cond_resched();
2467
2468                         skb = next;
2469                 } while (skb != NULL);
2470
2471                 spin_lock_bh(&sk->sk_lock.slock);
2472         }
2473
2474         /*
2475          * Doing the zeroing here guarantee we can not loop forever
2476          * while a wild producer attempts to flood us.
2477          */
2478         sk->sk_backlog.len = 0;
2479 }
2480
2481 void __sk_flush_backlog(struct sock *sk)
2482 {
2483         spin_lock_bh(&sk->sk_lock.slock);
2484         __release_sock(sk);
2485         spin_unlock_bh(&sk->sk_lock.slock);
2486 }
2487
2488 /**
2489  * sk_wait_data - wait for data to arrive at sk_receive_queue
2490  * @sk:    sock to wait on
2491  * @timeo: for how long
2492  * @skb:   last skb seen on sk_receive_queue
2493  *
2494  * Now socket state including sk->sk_err is changed only under lock,
2495  * hence we may omit checks after joining wait queue.
2496  * We check receive queue before schedule() only as optimization;
2497  * it is very likely that release_sock() added new data.
2498  */
2499 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2500 {
2501         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2502         int rc;
2503
2504         add_wait_queue(sk_sleep(sk), &wait);
2505         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2506         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2507         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2508         remove_wait_queue(sk_sleep(sk), &wait);
2509         return rc;
2510 }
2511 EXPORT_SYMBOL(sk_wait_data);
2512
2513 /**
2514  *      __sk_mem_raise_allocated - increase memory_allocated
2515  *      @sk: socket
2516  *      @size: memory size to allocate
2517  *      @amt: pages to allocate
2518  *      @kind: allocation type
2519  *
2520  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2521  */
2522 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2523 {
2524         struct proto *prot = sk->sk_prot;
2525         long allocated = sk_memory_allocated_add(sk, amt);
2526         bool charged = true;
2527
2528         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2529             !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2530                 goto suppress_allocation;
2531
2532         /* Under limit. */
2533         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2534                 sk_leave_memory_pressure(sk);
2535                 return 1;
2536         }
2537
2538         /* Under pressure. */
2539         if (allocated > sk_prot_mem_limits(sk, 1))
2540                 sk_enter_memory_pressure(sk);
2541
2542         /* Over hard limit. */
2543         if (allocated > sk_prot_mem_limits(sk, 2))
2544                 goto suppress_allocation;
2545
2546         /* guarantee minimum buffer size under pressure */
2547         if (kind == SK_MEM_RECV) {
2548                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2549                         return 1;
2550
2551         } else { /* SK_MEM_SEND */
2552                 int wmem0 = sk_get_wmem0(sk, prot);
2553
2554                 if (sk->sk_type == SOCK_STREAM) {
2555                         if (sk->sk_wmem_queued < wmem0)
2556                                 return 1;
2557                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2558                                 return 1;
2559                 }
2560         }
2561
2562         if (sk_has_memory_pressure(sk)) {
2563                 u64 alloc;
2564
2565                 if (!sk_under_memory_pressure(sk))
2566                         return 1;
2567                 alloc = sk_sockets_allocated_read_positive(sk);
2568                 if (sk_prot_mem_limits(sk, 2) > alloc *
2569                     sk_mem_pages(sk->sk_wmem_queued +
2570                                  atomic_read(&sk->sk_rmem_alloc) +
2571                                  sk->sk_forward_alloc))
2572                         return 1;
2573         }
2574
2575 suppress_allocation:
2576
2577         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2578                 sk_stream_moderate_sndbuf(sk);
2579
2580                 /* Fail only if socket is _under_ its sndbuf.
2581                  * In this case we cannot block, so that we have to fail.
2582                  */
2583                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2584                         return 1;
2585         }
2586
2587         if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2588                 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2589
2590         sk_memory_allocated_sub(sk, amt);
2591
2592         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2593                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2594
2595         return 0;
2596 }
2597 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2598
2599 /**
2600  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2601  *      @sk: socket
2602  *      @size: memory size to allocate
2603  *      @kind: allocation type
2604  *
2605  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2606  *      rmem allocation. This function assumes that protocols which have
2607  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2608  */
2609 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2610 {
2611         int ret, amt = sk_mem_pages(size);
2612
2613         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2614         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2615         if (!ret)
2616                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2617         return ret;
2618 }
2619 EXPORT_SYMBOL(__sk_mem_schedule);
2620
2621 /**
2622  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2623  *      @sk: socket
2624  *      @amount: number of quanta
2625  *
2626  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2627  */
2628 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2629 {
2630         sk_memory_allocated_sub(sk, amount);
2631
2632         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2633                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2634
2635         if (sk_under_global_memory_pressure(sk) &&
2636             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2637                 sk_leave_memory_pressure(sk);
2638 }
2639 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2640
2641 /**
2642  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2643  *      @sk: socket
2644  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2645  */
2646 void __sk_mem_reclaim(struct sock *sk, int amount)
2647 {
2648         amount >>= SK_MEM_QUANTUM_SHIFT;
2649         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2650         __sk_mem_reduce_allocated(sk, amount);
2651 }
2652 EXPORT_SYMBOL(__sk_mem_reclaim);
2653
2654 int sk_set_peek_off(struct sock *sk, int val)
2655 {
2656         WRITE_ONCE(sk->sk_peek_off, val);
2657         return 0;
2658 }
2659 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2660
2661 /*
2662  * Set of default routines for initialising struct proto_ops when
2663  * the protocol does not support a particular function. In certain
2664  * cases where it makes no sense for a protocol to have a "do nothing"
2665  * function, some default processing is provided.
2666  */
2667
2668 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2669 {
2670         return -EOPNOTSUPP;
2671 }
2672 EXPORT_SYMBOL(sock_no_bind);
2673
2674 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2675                     int len, int flags)
2676 {
2677         return -EOPNOTSUPP;
2678 }
2679 EXPORT_SYMBOL(sock_no_connect);
2680
2681 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2682 {
2683         return -EOPNOTSUPP;
2684 }
2685 EXPORT_SYMBOL(sock_no_socketpair);
2686
2687 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2688                    bool kern)
2689 {
2690         return -EOPNOTSUPP;
2691 }
2692 EXPORT_SYMBOL(sock_no_accept);
2693
2694 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2695                     int peer)
2696 {
2697         return -EOPNOTSUPP;
2698 }
2699 EXPORT_SYMBOL(sock_no_getname);
2700
2701 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2702 {
2703         return -EOPNOTSUPP;
2704 }
2705 EXPORT_SYMBOL(sock_no_ioctl);
2706
2707 int sock_no_listen(struct socket *sock, int backlog)
2708 {
2709         return -EOPNOTSUPP;
2710 }
2711 EXPORT_SYMBOL(sock_no_listen);
2712
2713 int sock_no_shutdown(struct socket *sock, int how)
2714 {
2715         return -EOPNOTSUPP;
2716 }
2717 EXPORT_SYMBOL(sock_no_shutdown);
2718
2719 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2720                     char __user *optval, unsigned int optlen)
2721 {
2722         return -EOPNOTSUPP;
2723 }
2724 EXPORT_SYMBOL(sock_no_setsockopt);
2725
2726 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2727                     char __user *optval, int __user *optlen)
2728 {
2729         return -EOPNOTSUPP;
2730 }
2731 EXPORT_SYMBOL(sock_no_getsockopt);
2732
2733 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2734 {
2735         return -EOPNOTSUPP;
2736 }
2737 EXPORT_SYMBOL(sock_no_sendmsg);
2738
2739 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2740 {
2741         return -EOPNOTSUPP;
2742 }
2743 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2744
2745 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2746                     int flags)
2747 {
2748         return -EOPNOTSUPP;
2749 }
2750 EXPORT_SYMBOL(sock_no_recvmsg);
2751
2752 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2753 {
2754         /* Mirror missing mmap method error code */
2755         return -ENODEV;
2756 }
2757 EXPORT_SYMBOL(sock_no_mmap);
2758
2759 /*
2760  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2761  * various sock-based usage counts.
2762  */
2763 void __receive_sock(struct file *file)
2764 {
2765         struct socket *sock;
2766         int error;
2767
2768         /*
2769          * The resulting value of "error" is ignored here since we only
2770          * need to take action when the file is a socket and testing
2771          * "sock" for NULL is sufficient.
2772          */
2773         sock = sock_from_file(file, &error);
2774         if (sock) {
2775                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2776                 sock_update_classid(&sock->sk->sk_cgrp_data);
2777         }
2778 }
2779
2780 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2781 {
2782         ssize_t res;
2783         struct msghdr msg = {.msg_flags = flags};
2784         struct kvec iov;
2785         char *kaddr = kmap(page);
2786         iov.iov_base = kaddr + offset;
2787         iov.iov_len = size;
2788         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2789         kunmap(page);
2790         return res;
2791 }
2792 EXPORT_SYMBOL(sock_no_sendpage);
2793
2794 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2795                                 int offset, size_t size, int flags)
2796 {
2797         ssize_t res;
2798         struct msghdr msg = {.msg_flags = flags};
2799         struct kvec iov;
2800         char *kaddr = kmap(page);
2801
2802         iov.iov_base = kaddr + offset;
2803         iov.iov_len = size;
2804         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2805         kunmap(page);
2806         return res;
2807 }
2808 EXPORT_SYMBOL(sock_no_sendpage_locked);
2809
2810 /*
2811  *      Default Socket Callbacks
2812  */
2813
2814 static void sock_def_wakeup(struct sock *sk)
2815 {
2816         struct socket_wq *wq;
2817
2818         rcu_read_lock();
2819         wq = rcu_dereference(sk->sk_wq);
2820         if (skwq_has_sleeper(wq))
2821                 wake_up_interruptible_all(&wq->wait);
2822         rcu_read_unlock();
2823 }
2824
2825 static void sock_def_error_report(struct sock *sk)
2826 {
2827         struct socket_wq *wq;
2828
2829         rcu_read_lock();
2830         wq = rcu_dereference(sk->sk_wq);
2831         if (skwq_has_sleeper(wq))
2832                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2833         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2834         rcu_read_unlock();
2835 }
2836
2837 static void sock_def_readable(struct sock *sk)
2838 {
2839         struct socket_wq *wq;
2840
2841         rcu_read_lock();
2842         wq = rcu_dereference(sk->sk_wq);
2843         if (skwq_has_sleeper(wq))
2844                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2845                                                 EPOLLRDNORM | EPOLLRDBAND);
2846         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2847         rcu_read_unlock();
2848 }
2849
2850 static void sock_def_write_space(struct sock *sk)
2851 {
2852         struct socket_wq *wq;
2853
2854         rcu_read_lock();
2855
2856         /* Do not wake up a writer until he can make "significant"
2857          * progress.  --DaveM
2858          */
2859         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2860                 wq = rcu_dereference(sk->sk_wq);
2861                 if (skwq_has_sleeper(wq))
2862                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2863                                                 EPOLLWRNORM | EPOLLWRBAND);
2864
2865                 /* Should agree with poll, otherwise some programs break */
2866                 if (sock_writeable(sk))
2867                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2868         }
2869
2870         rcu_read_unlock();
2871 }
2872
2873 static void sock_def_destruct(struct sock *sk)
2874 {
2875 }
2876
2877 void sk_send_sigurg(struct sock *sk)
2878 {
2879         if (sk->sk_socket && sk->sk_socket->file)
2880                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2881                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2882 }
2883 EXPORT_SYMBOL(sk_send_sigurg);
2884
2885 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2886                     unsigned long expires)
2887 {
2888         if (!mod_timer(timer, expires))
2889                 sock_hold(sk);
2890 }
2891 EXPORT_SYMBOL(sk_reset_timer);
2892
2893 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2894 {
2895         if (del_timer(timer))
2896                 __sock_put(sk);
2897 }
2898 EXPORT_SYMBOL(sk_stop_timer);
2899
2900 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
2901 {
2902         if (del_timer_sync(timer))
2903                 __sock_put(sk);
2904 }
2905 EXPORT_SYMBOL(sk_stop_timer_sync);
2906
2907 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
2908 {
2909         sk_init_common(sk);
2910         sk->sk_send_head        =       NULL;
2911
2912         timer_setup(&sk->sk_timer, NULL, 0);
2913
2914         sk->sk_allocation       =       GFP_KERNEL;
2915         sk->sk_rcvbuf           =       sysctl_rmem_default;
2916         sk->sk_sndbuf           =       sysctl_wmem_default;
2917         sk->sk_state            =       TCP_CLOSE;
2918         sk_set_socket(sk, sock);
2919
2920         sock_set_flag(sk, SOCK_ZAPPED);
2921
2922         if (sock) {
2923                 sk->sk_type     =       sock->type;
2924                 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2925                 sock->sk        =       sk;
2926         } else {
2927                 RCU_INIT_POINTER(sk->sk_wq, NULL);
2928         }
2929         sk->sk_uid      =       uid;
2930
2931         rwlock_init(&sk->sk_callback_lock);
2932         if (sk->sk_kern_sock)
2933                 lockdep_set_class_and_name(
2934                         &sk->sk_callback_lock,
2935                         af_kern_callback_keys + sk->sk_family,
2936                         af_family_kern_clock_key_strings[sk->sk_family]);
2937         else
2938                 lockdep_set_class_and_name(
2939                         &sk->sk_callback_lock,
2940                         af_callback_keys + sk->sk_family,
2941                         af_family_clock_key_strings[sk->sk_family]);
2942
2943         sk->sk_state_change     =       sock_def_wakeup;
2944         sk->sk_data_ready       =       sock_def_readable;
2945         sk->sk_write_space      =       sock_def_write_space;
2946         sk->sk_error_report     =       sock_def_error_report;
2947         sk->sk_destruct         =       sock_def_destruct;
2948
2949         sk->sk_frag.page        =       NULL;
2950         sk->sk_frag.offset      =       0;
2951         sk->sk_peek_off         =       -1;
2952
2953         sk->sk_peer_pid         =       NULL;
2954         sk->sk_peer_cred        =       NULL;
2955         spin_lock_init(&sk->sk_peer_lock);
2956
2957         sk->sk_write_pending    =       0;
2958         sk->sk_rcvlowat         =       1;
2959         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2960         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2961
2962         sk->sk_stamp = SK_DEFAULT_STAMP;
2963 #if BITS_PER_LONG==32
2964         seqlock_init(&sk->sk_stamp_seq);
2965 #endif
2966         atomic_set(&sk->sk_zckey, 0);
2967
2968 #ifdef CONFIG_NET_RX_BUSY_POLL
2969         sk->sk_napi_id          =       0;
2970         sk->sk_ll_usec          =       READ_ONCE(sysctl_net_busy_read);
2971 #endif
2972
2973         sk->sk_max_pacing_rate = ~0UL;
2974         sk->sk_pacing_rate = ~0UL;
2975         WRITE_ONCE(sk->sk_pacing_shift, 10);
2976         sk->sk_incoming_cpu = -1;
2977
2978         sk_rx_queue_clear(sk);
2979         /*
2980          * Before updating sk_refcnt, we must commit prior changes to memory
2981          * (Documentation/RCU/rculist_nulls.txt for details)
2982          */
2983         smp_wmb();
2984         refcount_set(&sk->sk_refcnt, 1);
2985         atomic_set(&sk->sk_drops, 0);
2986 }
2987 EXPORT_SYMBOL(sock_init_data_uid);
2988
2989 void sock_init_data(struct socket *sock, struct sock *sk)
2990 {
2991         kuid_t uid = sock ?
2992                 SOCK_INODE(sock)->i_uid :
2993                 make_kuid(sock_net(sk)->user_ns, 0);
2994
2995         sock_init_data_uid(sock, sk, uid);
2996 }
2997 EXPORT_SYMBOL(sock_init_data);
2998
2999 void lock_sock_nested(struct sock *sk, int subclass)
3000 {
3001         might_sleep();
3002         spin_lock_bh(&sk->sk_lock.slock);
3003         if (sk->sk_lock.owned)
3004                 __lock_sock(sk);
3005         sk->sk_lock.owned = 1;
3006         spin_unlock(&sk->sk_lock.slock);
3007         /*
3008          * The sk_lock has mutex_lock() semantics here:
3009          */
3010         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3011         local_bh_enable();
3012 }
3013 EXPORT_SYMBOL(lock_sock_nested);
3014
3015 void release_sock(struct sock *sk)
3016 {
3017         spin_lock_bh(&sk->sk_lock.slock);
3018         if (sk->sk_backlog.tail)
3019                 __release_sock(sk);
3020
3021         /* Warning : release_cb() might need to release sk ownership,
3022          * ie call sock_release_ownership(sk) before us.
3023          */
3024         if (sk->sk_prot->release_cb)
3025                 sk->sk_prot->release_cb(sk);
3026
3027         sock_release_ownership(sk);
3028         if (waitqueue_active(&sk->sk_lock.wq))
3029                 wake_up(&sk->sk_lock.wq);
3030         spin_unlock_bh(&sk->sk_lock.slock);
3031 }
3032 EXPORT_SYMBOL(release_sock);
3033
3034 /**
3035  * lock_sock_fast - fast version of lock_sock
3036  * @sk: socket
3037  *
3038  * This version should be used for very small section, where process wont block
3039  * return false if fast path is taken:
3040  *
3041  *   sk_lock.slock locked, owned = 0, BH disabled
3042  *
3043  * return true if slow path is taken:
3044  *
3045  *   sk_lock.slock unlocked, owned = 1, BH enabled
3046  */
3047 bool lock_sock_fast(struct sock *sk)
3048 {
3049         might_sleep();
3050         spin_lock_bh(&sk->sk_lock.slock);
3051
3052         if (!sk->sk_lock.owned)
3053                 /*
3054                  * Note : We must disable BH
3055                  */
3056                 return false;
3057
3058         __lock_sock(sk);
3059         sk->sk_lock.owned = 1;
3060         spin_unlock(&sk->sk_lock.slock);
3061         /*
3062          * The sk_lock has mutex_lock() semantics here:
3063          */
3064         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3065         local_bh_enable();
3066         return true;
3067 }
3068 EXPORT_SYMBOL(lock_sock_fast);
3069
3070 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3071                    bool timeval, bool time32)
3072 {
3073         struct sock *sk = sock->sk;
3074         struct timespec64 ts;
3075
3076         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3077         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3078         if (ts.tv_sec == -1)
3079                 return -ENOENT;
3080         if (ts.tv_sec == 0) {
3081                 ktime_t kt = ktime_get_real();
3082                 sock_write_timestamp(sk, kt);;
3083                 ts = ktime_to_timespec64(kt);
3084         }
3085
3086         if (timeval)
3087                 ts.tv_nsec /= 1000;
3088
3089 #ifdef CONFIG_COMPAT_32BIT_TIME
3090         if (time32)
3091                 return put_old_timespec32(&ts, userstamp);
3092 #endif
3093 #ifdef CONFIG_SPARC64
3094         /* beware of padding in sparc64 timeval */
3095         if (timeval && !in_compat_syscall()) {
3096                 struct __kernel_old_timeval __user tv = {
3097                         .tv_sec = ts.tv_sec,
3098                         .tv_usec = ts.tv_nsec,
3099                 };
3100                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3101                         return -EFAULT;
3102                 return 0;
3103         }
3104 #endif
3105         return put_timespec64(&ts, userstamp);
3106 }
3107 EXPORT_SYMBOL(sock_gettstamp);
3108
3109 void sock_enable_timestamp(struct sock *sk, int flag)
3110 {
3111         if (!sock_flag(sk, flag)) {
3112                 unsigned long previous_flags = sk->sk_flags;
3113
3114                 sock_set_flag(sk, flag);
3115                 /*
3116                  * we just set one of the two flags which require net
3117                  * time stamping, but time stamping might have been on
3118                  * already because of the other one
3119                  */
3120                 if (sock_needs_netstamp(sk) &&
3121                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3122                         net_enable_timestamp();
3123         }
3124 }
3125
3126 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3127                        int level, int type)
3128 {
3129         struct sock_exterr_skb *serr;
3130         struct sk_buff *skb;
3131         int copied, err;
3132
3133         err = -EAGAIN;
3134         skb = sock_dequeue_err_skb(sk);
3135         if (skb == NULL)
3136                 goto out;
3137
3138         copied = skb->len;
3139         if (copied > len) {
3140                 msg->msg_flags |= MSG_TRUNC;
3141                 copied = len;
3142         }
3143         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3144         if (err)
3145                 goto out_free_skb;
3146
3147         sock_recv_timestamp(msg, sk, skb);
3148
3149         serr = SKB_EXT_ERR(skb);
3150         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3151
3152         msg->msg_flags |= MSG_ERRQUEUE;
3153         err = copied;
3154
3155 out_free_skb:
3156         kfree_skb(skb);
3157 out:
3158         return err;
3159 }
3160 EXPORT_SYMBOL(sock_recv_errqueue);
3161
3162 /*
3163  *      Get a socket option on an socket.
3164  *
3165  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3166  *      asynchronous errors should be reported by getsockopt. We assume
3167  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3168  */
3169 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3170                            char __user *optval, int __user *optlen)
3171 {
3172         struct sock *sk = sock->sk;
3173
3174         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3175 }
3176 EXPORT_SYMBOL(sock_common_getsockopt);
3177
3178 #ifdef CONFIG_COMPAT
3179 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3180                                   char __user *optval, int __user *optlen)
3181 {
3182         struct sock *sk = sock->sk;
3183
3184         if (sk->sk_prot->compat_getsockopt != NULL)
3185                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3186                                                       optval, optlen);
3187         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3188 }
3189 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3190 #endif
3191
3192 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3193                         int flags)
3194 {
3195         struct sock *sk = sock->sk;
3196         int addr_len = 0;
3197         int err;
3198
3199         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3200                                    flags & ~MSG_DONTWAIT, &addr_len);
3201         if (err >= 0)
3202                 msg->msg_namelen = addr_len;
3203         return err;
3204 }
3205 EXPORT_SYMBOL(sock_common_recvmsg);
3206
3207 /*
3208  *      Set socket options on an inet socket.
3209  */
3210 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3211                            char __user *optval, unsigned int optlen)
3212 {
3213         struct sock *sk = sock->sk;
3214
3215         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3216 }
3217 EXPORT_SYMBOL(sock_common_setsockopt);
3218
3219 #ifdef CONFIG_COMPAT
3220 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3221                                   char __user *optval, unsigned int optlen)
3222 {
3223         struct sock *sk = sock->sk;
3224
3225         if (sk->sk_prot->compat_setsockopt != NULL)
3226                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3227                                                       optval, optlen);
3228         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3229 }
3230 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3231 #endif
3232
3233 void sk_common_release(struct sock *sk)
3234 {
3235         if (sk->sk_prot->destroy)
3236                 sk->sk_prot->destroy(sk);
3237
3238         /*
3239          * Observation: when sock_common_release is called, processes have
3240          * no access to socket. But net still has.
3241          * Step one, detach it from networking:
3242          *
3243          * A. Remove from hash tables.
3244          */
3245
3246         sk->sk_prot->unhash(sk);
3247
3248         /*
3249          * In this point socket cannot receive new packets, but it is possible
3250          * that some packets are in flight because some CPU runs receiver and
3251          * did hash table lookup before we unhashed socket. They will achieve
3252          * receive queue and will be purged by socket destructor.
3253          *
3254          * Also we still have packets pending on receive queue and probably,
3255          * our own packets waiting in device queues. sock_destroy will drain
3256          * receive queue, but transmitted packets will delay socket destruction
3257          * until the last reference will be released.
3258          */
3259
3260         sock_orphan(sk);
3261
3262         xfrm_sk_free_policy(sk);
3263
3264         sk_refcnt_debug_release(sk);
3265
3266         sock_put(sk);
3267 }
3268 EXPORT_SYMBOL(sk_common_release);
3269
3270 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3271 {
3272         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3273
3274         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3275         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3276         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3277         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3278         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3279         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3280         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3281         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3282         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3283 }
3284
3285 #ifdef CONFIG_PROC_FS
3286 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
3287 struct prot_inuse {
3288         int val[PROTO_INUSE_NR];
3289 };
3290
3291 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3292
3293 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3294 {
3295         __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3296 }
3297 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3298
3299 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3300 {
3301         int cpu, idx = prot->inuse_idx;
3302         int res = 0;
3303
3304         for_each_possible_cpu(cpu)
3305                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3306
3307         return res >= 0 ? res : 0;
3308 }
3309 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3310
3311 static void sock_inuse_add(struct net *net, int val)
3312 {
3313         this_cpu_add(*net->core.sock_inuse, val);
3314 }
3315
3316 int sock_inuse_get(struct net *net)
3317 {
3318         int cpu, res = 0;
3319
3320         for_each_possible_cpu(cpu)
3321                 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3322
3323         return res;
3324 }
3325
3326 EXPORT_SYMBOL_GPL(sock_inuse_get);
3327
3328 static int __net_init sock_inuse_init_net(struct net *net)
3329 {
3330         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3331         if (net->core.prot_inuse == NULL)
3332                 return -ENOMEM;
3333
3334         net->core.sock_inuse = alloc_percpu(int);
3335         if (net->core.sock_inuse == NULL)
3336                 goto out;
3337
3338         return 0;
3339
3340 out:
3341         free_percpu(net->core.prot_inuse);
3342         return -ENOMEM;
3343 }
3344
3345 static void __net_exit sock_inuse_exit_net(struct net *net)
3346 {
3347         free_percpu(net->core.prot_inuse);
3348         free_percpu(net->core.sock_inuse);
3349 }
3350
3351 static struct pernet_operations net_inuse_ops = {
3352         .init = sock_inuse_init_net,
3353         .exit = sock_inuse_exit_net,
3354 };
3355
3356 static __init int net_inuse_init(void)
3357 {
3358         if (register_pernet_subsys(&net_inuse_ops))
3359                 panic("Cannot initialize net inuse counters");
3360
3361         return 0;
3362 }
3363
3364 core_initcall(net_inuse_init);
3365
3366 static int assign_proto_idx(struct proto *prot)
3367 {
3368         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3369
3370         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3371                 pr_err("PROTO_INUSE_NR exhausted\n");
3372                 return -ENOSPC;
3373         }
3374
3375         set_bit(prot->inuse_idx, proto_inuse_idx);
3376         return 0;
3377 }
3378
3379 static void release_proto_idx(struct proto *prot)
3380 {
3381         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3382                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3383 }
3384 #else
3385 static inline int assign_proto_idx(struct proto *prot)
3386 {
3387         return 0;
3388 }
3389
3390 static inline void release_proto_idx(struct proto *prot)
3391 {
3392 }
3393
3394 static void sock_inuse_add(struct net *net, int val)
3395 {
3396 }
3397 #endif
3398
3399 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3400 {
3401         if (!twsk_prot)
3402                 return;
3403         kfree(twsk_prot->twsk_slab_name);
3404         twsk_prot->twsk_slab_name = NULL;
3405         kmem_cache_destroy(twsk_prot->twsk_slab);
3406         twsk_prot->twsk_slab = NULL;
3407 }
3408
3409 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3410 {
3411         if (!rsk_prot)
3412                 return;
3413         kfree(rsk_prot->slab_name);
3414         rsk_prot->slab_name = NULL;
3415         kmem_cache_destroy(rsk_prot->slab);
3416         rsk_prot->slab = NULL;
3417 }
3418
3419 static int req_prot_init(const struct proto *prot)
3420 {
3421         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3422
3423         if (!rsk_prot)
3424                 return 0;
3425
3426         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3427                                         prot->name);
3428         if (!rsk_prot->slab_name)
3429                 return -ENOMEM;
3430
3431         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3432                                            rsk_prot->obj_size, 0,
3433                                            SLAB_ACCOUNT | prot->slab_flags,
3434                                            NULL);
3435
3436         if (!rsk_prot->slab) {
3437                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3438                         prot->name);
3439                 return -ENOMEM;
3440         }
3441         return 0;
3442 }
3443
3444 int proto_register(struct proto *prot, int alloc_slab)
3445 {
3446         int ret = -ENOBUFS;
3447
3448         if (alloc_slab) {
3449                 prot->slab = kmem_cache_create_usercopy(prot->name,
3450                                         prot->obj_size, 0,
3451                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3452                                         prot->slab_flags,
3453                                         prot->useroffset, prot->usersize,
3454                                         NULL);
3455
3456                 if (prot->slab == NULL) {
3457                         pr_crit("%s: Can't create sock SLAB cache!\n",
3458                                 prot->name);
3459                         goto out;
3460                 }
3461
3462                 if (req_prot_init(prot))
3463                         goto out_free_request_sock_slab;
3464
3465                 if (prot->twsk_prot != NULL) {
3466                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3467
3468                         if (prot->twsk_prot->twsk_slab_name == NULL)
3469                                 goto out_free_request_sock_slab;
3470
3471                         prot->twsk_prot->twsk_slab =
3472                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3473                                                   prot->twsk_prot->twsk_obj_size,
3474                                                   0,
3475                                                   SLAB_ACCOUNT |
3476                                                   prot->slab_flags,
3477                                                   NULL);
3478                         if (prot->twsk_prot->twsk_slab == NULL)
3479                                 goto out_free_timewait_sock_slab;
3480                 }
3481         }
3482
3483         mutex_lock(&proto_list_mutex);
3484         ret = assign_proto_idx(prot);
3485         if (ret) {
3486                 mutex_unlock(&proto_list_mutex);
3487                 goto out_free_timewait_sock_slab;
3488         }
3489         list_add(&prot->node, &proto_list);
3490         mutex_unlock(&proto_list_mutex);
3491         return ret;
3492
3493 out_free_timewait_sock_slab:
3494         if (alloc_slab && prot->twsk_prot)
3495                 tw_prot_cleanup(prot->twsk_prot);
3496 out_free_request_sock_slab:
3497         if (alloc_slab) {
3498                 req_prot_cleanup(prot->rsk_prot);
3499
3500                 kmem_cache_destroy(prot->slab);
3501                 prot->slab = NULL;
3502         }
3503 out:
3504         return ret;
3505 }
3506 EXPORT_SYMBOL(proto_register);
3507
3508 void proto_unregister(struct proto *prot)
3509 {
3510         mutex_lock(&proto_list_mutex);
3511         release_proto_idx(prot);
3512         list_del(&prot->node);
3513         mutex_unlock(&proto_list_mutex);
3514
3515         kmem_cache_destroy(prot->slab);
3516         prot->slab = NULL;
3517
3518         req_prot_cleanup(prot->rsk_prot);
3519         tw_prot_cleanup(prot->twsk_prot);
3520 }
3521 EXPORT_SYMBOL(proto_unregister);
3522
3523 int sock_load_diag_module(int family, int protocol)
3524 {
3525         if (!protocol) {
3526                 if (!sock_is_registered(family))
3527                         return -ENOENT;
3528
3529                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3530                                       NETLINK_SOCK_DIAG, family);
3531         }
3532
3533 #ifdef CONFIG_INET
3534         if (family == AF_INET &&
3535             protocol != IPPROTO_RAW &&
3536             !rcu_access_pointer(inet_protos[protocol]))
3537                 return -ENOENT;
3538 #endif
3539
3540         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3541                               NETLINK_SOCK_DIAG, family, protocol);
3542 }
3543 EXPORT_SYMBOL(sock_load_diag_module);
3544
3545 #ifdef CONFIG_PROC_FS
3546 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3547         __acquires(proto_list_mutex)
3548 {
3549         mutex_lock(&proto_list_mutex);
3550         return seq_list_start_head(&proto_list, *pos);
3551 }
3552
3553 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3554 {
3555         return seq_list_next(v, &proto_list, pos);
3556 }
3557
3558 static void proto_seq_stop(struct seq_file *seq, void *v)
3559         __releases(proto_list_mutex)
3560 {
3561         mutex_unlock(&proto_list_mutex);
3562 }
3563
3564 static char proto_method_implemented(const void *method)
3565 {
3566         return method == NULL ? 'n' : 'y';
3567 }
3568 static long sock_prot_memory_allocated(struct proto *proto)
3569 {
3570         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3571 }
3572
3573 static const char *sock_prot_memory_pressure(struct proto *proto)
3574 {
3575         return proto->memory_pressure != NULL ?
3576         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3577 }
3578
3579 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3580 {
3581
3582         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3583                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3584                    proto->name,
3585                    proto->obj_size,
3586                    sock_prot_inuse_get(seq_file_net(seq), proto),
3587                    sock_prot_memory_allocated(proto),
3588                    sock_prot_memory_pressure(proto),
3589                    proto->max_header,
3590                    proto->slab == NULL ? "no" : "yes",
3591                    module_name(proto->owner),
3592                    proto_method_implemented(proto->close),
3593                    proto_method_implemented(proto->connect),
3594                    proto_method_implemented(proto->disconnect),
3595                    proto_method_implemented(proto->accept),
3596                    proto_method_implemented(proto->ioctl),
3597                    proto_method_implemented(proto->init),
3598                    proto_method_implemented(proto->destroy),
3599                    proto_method_implemented(proto->shutdown),
3600                    proto_method_implemented(proto->setsockopt),
3601                    proto_method_implemented(proto->getsockopt),
3602                    proto_method_implemented(proto->sendmsg),
3603                    proto_method_implemented(proto->recvmsg),
3604                    proto_method_implemented(proto->sendpage),
3605                    proto_method_implemented(proto->bind),
3606                    proto_method_implemented(proto->backlog_rcv),
3607                    proto_method_implemented(proto->hash),
3608                    proto_method_implemented(proto->unhash),
3609                    proto_method_implemented(proto->get_port),
3610                    proto_method_implemented(proto->enter_memory_pressure));
3611 }
3612
3613 static int proto_seq_show(struct seq_file *seq, void *v)
3614 {
3615         if (v == &proto_list)
3616                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3617                            "protocol",
3618                            "size",
3619                            "sockets",
3620                            "memory",
3621                            "press",
3622                            "maxhdr",
3623                            "slab",
3624                            "module",
3625                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3626         else
3627                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3628         return 0;
3629 }
3630
3631 static const struct seq_operations proto_seq_ops = {
3632         .start  = proto_seq_start,
3633         .next   = proto_seq_next,
3634         .stop   = proto_seq_stop,
3635         .show   = proto_seq_show,
3636 };
3637
3638 static __net_init int proto_init_net(struct net *net)
3639 {
3640         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3641                         sizeof(struct seq_net_private)))
3642                 return -ENOMEM;
3643
3644         return 0;
3645 }
3646
3647 static __net_exit void proto_exit_net(struct net *net)
3648 {
3649         remove_proc_entry("protocols", net->proc_net);
3650 }
3651
3652
3653 static __net_initdata struct pernet_operations proto_net_ops = {
3654         .init = proto_init_net,
3655         .exit = proto_exit_net,
3656 };
3657
3658 static int __init proto_init(void)
3659 {
3660         return register_pernet_subsys(&proto_net_ops);
3661 }
3662
3663 subsys_initcall(proto_init);
3664
3665 #endif /* PROC_FS */
3666
3667 #ifdef CONFIG_NET_RX_BUSY_POLL
3668 bool sk_busy_loop_end(void *p, unsigned long start_time)
3669 {
3670         struct sock *sk = p;
3671
3672         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3673                sk_busy_loop_timeout(sk, start_time);
3674 }
3675 EXPORT_SYMBOL(sk_busy_loop_end);
3676 #endif /* CONFIG_NET_RX_BUSY_POLL */