core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/udp.h>
 111 #include <linux/init.h>
 112 #include <linux/highmem.h>
 113 #include <linux/user_namespace.h>
 114 #include <linux/static_key.h>
 115 #include <linux/memcontrol.h>
 116 #include <linux/prefetch.h>
 117 #include <linux/compat.h>
 118
 119 #include <linux/uaccess.h>
 120
 121 #include <linux/netdevice.h>
 122 #include <net/protocol.h>
 123 #include <linux/skbuff.h>
 124 #include <net/net_namespace.h>
 125 #include <net/request_sock.h>
 126 #include <net/sock.h>
 127 #include <linux/net_tstamp.h>
 128 #include <net/xfrm.h>
 129 #include <linux/ipsec.h>
 130 #include <net/cls_cgroup.h>
 131 #include <net/netprio_cgroup.h>
 132 #include <linux/sock_diag.h>
 133
 134 #include <linux/filter.h>
 135 #include <net/sock_reuseport.h>
 136 #include <net/bpf_sk_storage.h>
 137
 138 #include <trace/events/sock.h>
 139
 140 #include <net/tcp.h>
 141 #include <net/busy_poll.h>
 142
 143 #include <linux/ethtool.h>
 144
 145 #include "dev.h"
 146
/*
 * List of protocols and the mutex declared alongside it.
 * NOTE(review): the users of both are outside this part of the file;
 * presumably the mutex guards the list - confirm against the callers.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/* Forward declarations; the definitions are not in this part of the file. */
static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);
 152
 153 /**
 154  * sk_ns_capable - General socket capability test
 155  * @sk: Socket to use a capability on or through
 156  * @user_ns: The user namespace of the capability to use
 157  * @cap: The capability to use
 158  *
 159  * Test to see if the opener of the socket had when the socket was
 160  * created and the current process has the capability @cap in the user
 161  * namespace @user_ns.
 162  */
 163 bool sk_ns_capable(const struct sock *sk,
 164                    struct user_namespace *user_ns, int cap)
 165 {
 166         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 167                 ns_capable(user_ns, cap);
 168 }
 169 EXPORT_SYMBOL(sk_ns_capable);
 170
 171 /**
 172  * sk_capable - Socket global capability test
 173  * @sk: Socket to use a capability on or through
 174  * @cap: The global capability to use
 175  *
 176  * Test to see if the opener of the socket had when the socket was
 177  * created and the current process has the capability @cap in all user
 178  * namespaces.
 179  */
 180 bool sk_capable(const struct sock *sk, int cap)
 181 {
 182         return sk_ns_capable(sk, &init_user_ns, cap);
 183 }
 184 EXPORT_SYMBOL(sk_capable);
 185
 186 /**
 187  * sk_net_capable - Network namespace socket capability test
 188  * @sk: Socket to use a capability on or through
 189  * @cap: The capability to use
 190  *
 191  * Test to see if the opener of the socket had when the socket was created
 192  * and the current process has the capability @cap over the network namespace
 193  * the socket is a member of.
 194  */
 195 bool sk_net_capable(const struct sock *sk, int cap)
 196 {
 197         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 198 }
 199 EXPORT_SYMBOL(sk_net_capable);
 200
/*
 * Each address family might have different locking rules, so we have
 * one lock class key per address family for sk_lock and for slock, with
 * separate key arrays for internal (kernel) and userspace sockets.
 * Every array is indexed by address family and has AF_MAX entries.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 210
/*
 * Make lock validator output more readable. (We pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast.)
 *
 * _sock_locks(x) expands to a comma-separated list of string literals,
 * one per address family plus a trailing "AF_MAX" entry, each formed by
 * pasting the prefix literal @x in front of the family name. The tables
 * below instantiate it once per lock-name prefix.
 */

#define _sock_locks(x)                                            \
  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
  x "AF_MCTP"  , \
  x "AF_MAX"

/* Names for the userspace-socket lock classes. */
static const char *const af_family_key_strings[AF_MAX+1] = {
        _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
        _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
        _sock_locks("clock-")
};

/* Names for the kernel-socket lock classes ("k-" prefix). */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
        _sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
        _sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
        _sock_locks("k-clock-")
};
/* Names with "rlock-", "wlock-" and "elock-" prefixes. */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
        _sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
        _sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
        _sock_locks("elock-")
};
 264
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key. Each array below has
 * one entry per address family (AF_MAX entries).
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
 273 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 274
/*
 * Run time adjustable parameters.
 * The max and default buffer sizes all start out at SK_WMEM_MAX /
 * SK_RMEM_MAX respectively.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Enabled (1) by default. */
int sysctl_tstamp_allow_data __read_mostly = 1;

/*
 * Static key, initially false; incremented by sk_set_memalloc() and
 * decremented by sk_clear_memalloc().
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
 292
 293 /**
 294  * sk_set_memalloc - sets %SOCK_MEMALLOC
 295  * @sk: socket to set it on
 296  *
 297  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 298  * It's the responsibility of the admin to adjust min_free_kbytes
 299  * to meet the requirements
 300  */
 301 void sk_set_memalloc(struct sock *sk)
 302 {
 303         sock_set_flag(sk, SOCK_MEMALLOC);
 304         sk->sk_allocation |= __GFP_MEMALLOC;
 305         static_branch_inc(&memalloc_socks_key);
 306 }
 307 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 308
 309 void sk_clear_memalloc(struct sock *sk)
 310 {
 311         sock_reset_flag(sk, SOCK_MEMALLOC);
 312         sk->sk_allocation &= ~__GFP_MEMALLOC;
 313         static_branch_dec(&memalloc_socks_key);
 314
 315         /*
 316          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 317          * progress of swapping. SOCK_MEMALLOC may be cleared while
 318          * it has rmem allocations due to the last swapfile being deactivated
 319          * but there is a risk that the socket is unusable due to exceeding
 320          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 321          */
 322         sk_mem_reclaim(sk);
 323 }
 324 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 325
 326 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 327 {
 328         int ret;
 329         unsigned int noreclaim_flag;
 330
 331         /* these should have been dropped before queueing */
 332         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 333
 334         noreclaim_flag = memalloc_noreclaim_save();
 335         ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 336                                  tcp_v6_do_rcv,
 337                                  tcp_v4_do_rcv,
 338                                  sk, skb);
 339         memalloc_noreclaim_restore(noreclaim_flag);
 340
 341         return ret;
 342 }
 343 EXPORT_SYMBOL(__sk_backlog_rcv);
 344
 345 void sk_error_report(struct sock *sk)
 346 {
 347         sk->sk_error_report(sk);
 348
 349         switch (sk->sk_family) {
 350         case AF_INET:
 351                 fallthrough;
 352         case AF_INET6:
 353                 trace_inet_sk_error_report(sk);
 354                 break;
 355         default:
 356                 break;
 357         }
 358 }
 359 EXPORT_SYMBOL(sk_error_report);
 360
 361 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 362 {
 363         struct __kernel_sock_timeval tv;
 364
 365         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 366                 tv.tv_sec = 0;
 367                 tv.tv_usec = 0;
 368         } else {
 369                 tv.tv_sec = timeo / HZ;
 370                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 371         }
 372
 373         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 374                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 375                 *(struct old_timeval32 *)optval = tv32;
 376                 return sizeof(tv32);
 377         }
 378
 379         if (old_timeval) {
 380                 struct __kernel_old_timeval old_tv;
 381                 old_tv.tv_sec = tv.tv_sec;
 382                 old_tv.tv_usec = tv.tv_usec;
 383                 *(struct __kernel_old_timeval *)optval = old_tv;
 384                 return sizeof(old_tv);
 385         }
 386
 387         *(struct __kernel_sock_timeval *)optval = tv;
 388         return sizeof(tv);
 389 }
 390 EXPORT_SYMBOL(sock_get_timeout);
 391
 392 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 393                            sockptr_t optval, int optlen, bool old_timeval)
 394 {
 395         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 396                 struct old_timeval32 tv32;
 397
 398                 if (optlen < sizeof(tv32))
 399                         return -EINVAL;
 400
 401                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 402                         return -EFAULT;
 403                 tv->tv_sec = tv32.tv_sec;
 404                 tv->tv_usec = tv32.tv_usec;
 405         } else if (old_timeval) {
 406                 struct __kernel_old_timeval old_tv;
 407
 408                 if (optlen < sizeof(old_tv))
 409                         return -EINVAL;
 410                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 411                         return -EFAULT;
 412                 tv->tv_sec = old_tv.tv_sec;
 413                 tv->tv_usec = old_tv.tv_usec;
 414         } else {
 415                 if (optlen < sizeof(*tv))
 416                         return -EINVAL;
 417                 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 418                         return -EFAULT;
 419         }
 420
 421         return 0;
 422 }
 423 EXPORT_SYMBOL(sock_copy_user_timeval);
 424
 425 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 426                             bool old_timeval)
 427 {
 428         struct __kernel_sock_timeval tv;
 429         int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 430         long val;
 431
 432         if (err)
 433                 return err;
 434
 435         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 436                 return -EDOM;
 437
 438         if (tv.tv_sec < 0) {
 439                 static int warned __read_mostly;
 440
 441                 WRITE_ONCE(*timeo_p, 0);
 442                 if (warned < 10 && net_ratelimit()) {
 443                         warned++;
 444                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 445                                 __func__, current->comm, task_pid_nr(current));
 446                 }
 447                 return 0;
 448         }
 449         val = MAX_SCHEDULE_TIMEOUT;
 450         if ((tv.tv_sec || tv.tv_usec) &&
 451             (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
 452                 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
 453                                                     USEC_PER_SEC / HZ);
 454         WRITE_ONCE(*timeo_p, val);
 455         return 0;
 456 }
 457
 458 static bool sock_needs_netstamp(const struct sock *sk)
 459 {
 460         switch (sk->sk_family) {
 461         case AF_UNSPEC:
 462         case AF_UNIX:
 463                 return false;
 464         default:
 465                 return true;
 466         }
 467 }
 468
 469 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 470 {
 471         if (sk->sk_flags & flags) {
 472                 sk->sk_flags &= ~flags;
 473                 if (sock_needs_netstamp(sk) &&
 474                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 475                         net_disable_timestamp();
 476         }
 477 }
 478
 479
 480 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 481 {
 482         unsigned long flags;
 483         struct sk_buff_head *list = &sk->sk_receive_queue;
 484
 485         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 486                 atomic_inc(&sk->sk_drops);
 487                 trace_sock_rcvqueue_full(sk, skb);
 488                 return -ENOMEM;
 489         }
 490
 491         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 492                 atomic_inc(&sk->sk_drops);
 493                 return -ENOBUFS;
 494         }
 495
 496         skb->dev = NULL;
 497         skb_set_owner_r(skb, sk);
 498
 499         /* we escape from rcu protected region, make sure we dont leak
 500          * a norefcounted dst
 501          */
 502         skb_dst_force(skb);
 503
 504         spin_lock_irqsave(&list->lock, flags);
 505         sock_skb_set_dropcount(sk, skb);
 506         __skb_queue_tail(list, skb);
 507         spin_unlock_irqrestore(&list->lock, flags);
 508
 509         if (!sock_flag(sk, SOCK_DEAD))
 510                 sk->sk_data_ready(sk);
 511         return 0;
 512 }
 513 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 514
 515 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 516                               enum skb_drop_reason *reason)
 517 {
 518         enum skb_drop_reason drop_reason;
 519         int err;
 520
 521         err = sk_filter(sk, skb);
 522         if (err) {
 523                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 524                 goto out;
 525         }
 526         err = __sock_queue_rcv_skb(sk, skb);
 527         switch (err) {
 528         case -ENOMEM:
 529                 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 530                 break;
 531         case -ENOBUFS:
 532                 drop_reason = SKB_DROP_REASON_PROTO_MEM;
 533                 break;
 534         default:
 535                 drop_reason = SKB_NOT_DROPPED_YET;
 536                 break;
 537         }
 538 out:
 539         if (reason)
 540                 *reason = drop_reason;
 541         return err;
 542 }
 543 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 544
 545 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 546                      const int nested, unsigned int trim_cap, bool refcounted)
 547 {
 548         int rc = NET_RX_SUCCESS;
 549
 550         if (sk_filter_trim_cap(sk, skb, trim_cap))
 551                 goto discard_and_relse;
 552
 553         skb->dev = NULL;
 554
 555         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 556                 atomic_inc(&sk->sk_drops);
 557                 goto discard_and_relse;
 558         }
 559         if (nested)
 560                 bh_lock_sock_nested(sk);
 561         else
 562                 bh_lock_sock(sk);
 563         if (!sock_owned_by_user(sk)) {
 564                 /*
 565                  * trylock + unlock semantics:
 566                  */
 567                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 568
 569                 rc = sk_backlog_rcv(sk, skb);
 570
 571                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 572         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 573                 bh_unlock_sock(sk);
 574                 atomic_inc(&sk->sk_drops);
 575                 goto discard_and_relse;
 576         }
 577
 578         bh_unlock_sock(sk);
 579 out:
 580         if (refcounted)
 581                 sock_put(sk);
 582         return rc;
 583 discard_and_relse:
 584         kfree_skb(skb);
 585         goto out;
 586 }
 587 EXPORT_SYMBOL(__sk_receive_skb);
 588
 589 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 590                                                           u32));
 591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 592                                                            u32));
 593 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 594 {
 595         struct dst_entry *dst = __sk_dst_get(sk);
 596
 597         if (dst && dst->obsolete &&
 598             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 599                                dst, cookie) == NULL) {
 600                 sk_tx_queue_clear(sk);
 601                 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
 602                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 603                 dst_release(dst);
 604                 return NULL;
 605         }
 606
 607         return dst;
 608 }
 609 EXPORT_SYMBOL(__sk_dst_check);
 610
 611 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 612 {
 613         struct dst_entry *dst = sk_dst_get(sk);
 614
 615         if (dst && dst->obsolete &&
 616             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 617                                dst, cookie) == NULL) {
 618                 sk_dst_reset(sk);
 619                 dst_release(dst);
 620                 return NULL;
 621         }
 622
 623         return dst;
 624 }
 625 EXPORT_SYMBOL(sk_dst_check);
 626
 627 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 628 {
 629         int ret = -ENOPROTOOPT;
 630 #ifdef CONFIG_NETDEVICES
 631         struct net *net = sock_net(sk);
 632
 633         /* Sorry... */
 634         ret = -EPERM;
 635         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 636                 goto out;
 637
 638         ret = -EINVAL;
 639         if (ifindex < 0)
 640                 goto out;
 641
 642         /* Paired with all READ_ONCE() done locklessly. */
 643         WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 644
 645         if (sk->sk_prot->rehash)
 646                 sk->sk_prot->rehash(sk);
 647         sk_dst_reset(sk);
 648
 649         ret = 0;
 650
 651 out:
 652 #endif
 653
 654         return ret;
 655 }
 656
 657 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 658 {
 659         int ret;
 660
 661         if (lock_sk)
 662                 lock_sock(sk);
 663         ret = sock_bindtoindex_locked(sk, ifindex);
 664         if (lock_sk)
 665                 release_sock(sk);
 666
 667         return ret;
 668 }
 669 EXPORT_SYMBOL(sock_bindtoindex);
 670
 671 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 672 {
 673         int ret = -ENOPROTOOPT;
 674 #ifdef CONFIG_NETDEVICES
 675         struct net *net = sock_net(sk);
 676         char devname[IFNAMSIZ];
 677         int index;
 678
 679         ret = -EINVAL;
 680         if (optlen < 0)
 681                 goto out;
 682
 683         /* Bind this socket to a particular device like "eth0",
 684          * as specified in the passed interface name. If the
 685          * name is "" or the option length is zero the socket
 686          * is not bound.
 687          */
 688         if (optlen > IFNAMSIZ - 1)
 689                 optlen = IFNAMSIZ - 1;
 690         memset(devname, 0, sizeof(devname));
 691
 692         ret = -EFAULT;
 693         if (copy_from_sockptr(devname, optval, optlen))
 694                 goto out;
 695
 696         index = 0;
 697         if (devname[0] != '\0') {
 698                 struct net_device *dev;
 699
 700                 rcu_read_lock();
 701                 dev = dev_get_by_name_rcu(net, devname);
 702                 if (dev)
 703                         index = dev->ifindex;
 704                 rcu_read_unlock();
 705                 ret = -ENODEV;
 706                 if (!dev)
 707                         goto out;
 708         }
 709
 710         sockopt_lock_sock(sk);
 711         ret = sock_bindtoindex_locked(sk, index);
 712         sockopt_release_sock(sk);
 713 out:
 714 #endif
 715
 716         return ret;
 717 }
 718
 719 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
 720                                 sockptr_t optlen, int len)
 721 {
 722         int ret = -ENOPROTOOPT;
 723 #ifdef CONFIG_NETDEVICES
 724         int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 725         struct net *net = sock_net(sk);
 726         char devname[IFNAMSIZ];
 727
 728         if (bound_dev_if == 0) {
 729                 len = 0;
 730                 goto zero;
 731         }
 732
 733         ret = -EINVAL;
 734         if (len < IFNAMSIZ)
 735                 goto out;
 736
 737         ret = netdev_get_name(net, devname, bound_dev_if);
 738         if (ret)
 739                 goto out;
 740
 741         len = strlen(devname) + 1;
 742
 743         ret = -EFAULT;
 744         if (copy_to_sockptr(optval, devname, len))
 745                 goto out;
 746
 747 zero:
 748         ret = -EFAULT;
 749         if (copy_to_sockptr(optlen, &len, sizeof(int)))
 750                 goto out;
 751
 752         ret = 0;
 753
 754 out:
 755 #endif
 756
 757         return ret;
 758 }
 759
 760 bool sk_mc_loop(struct sock *sk)
 761 {
 762         if (dev_recursion_level())
 763                 return false;
 764         if (!sk)
 765                 return true;
 766         /* IPV6_ADDRFORM can change sk->sk_family under us. */
 767         switch (READ_ONCE(sk->sk_family)) {
 768         case AF_INET:
 769                 return inet_sk(sk)->mc_loop;
 770 #if IS_ENABLED(CONFIG_IPV6)
 771         case AF_INET6:
 772                 return inet6_sk(sk)->mc_loop;
 773 #endif
 774         }
 775         WARN_ON_ONCE(1);
 776         return true;
 777 }
 778 EXPORT_SYMBOL(sk_mc_loop);
 779
 780 void sock_set_reuseaddr(struct sock *sk)
 781 {
 782         lock_sock(sk);
 783         sk->sk_reuse = SK_CAN_REUSE;
 784         release_sock(sk);
 785 }
 786 EXPORT_SYMBOL(sock_set_reuseaddr);
 787
 788 void sock_set_reuseport(struct sock *sk)
 789 {
 790         lock_sock(sk);
 791         sk->sk_reuseport = true;
 792         release_sock(sk);
 793 }
 794 EXPORT_SYMBOL(sock_set_reuseport);
 795
 796 void sock_no_linger(struct sock *sk)
 797 {
 798         lock_sock(sk);
 799         WRITE_ONCE(sk->sk_lingertime, 0);
 800         sock_set_flag(sk, SOCK_LINGER);
 801         release_sock(sk);
 802 }
 803 EXPORT_SYMBOL(sock_no_linger);
 804
 805 void sock_set_priority(struct sock *sk, u32 priority)
 806 {
 807         lock_sock(sk);
 808         WRITE_ONCE(sk->sk_priority, priority);
 809         release_sock(sk);
 810 }
 811 EXPORT_SYMBOL(sock_set_priority);
 812
 813 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 814 {
 815         lock_sock(sk);
 816         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 817                 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
 818         else
 819                 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
 820         release_sock(sk);
 821 }
 822 EXPORT_SYMBOL(sock_set_sndtimeo);
 823
 824 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 825 {
 826         if (val)  {
 827                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 828                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 829                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 830                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 831         } else {
 832                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 833                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 834         }
 835 }
 836
 837 void sock_enable_timestamps(struct sock *sk)
 838 {
 839         lock_sock(sk);
 840         __sock_set_timestamps(sk, true, false, true);
 841         release_sock(sk);
 842 }
 843 EXPORT_SYMBOL(sock_enable_timestamps);
 844
 845 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 846 {
 847         switch (optname) {
 848         case SO_TIMESTAMP_OLD:
 849                 __sock_set_timestamps(sk, valbool, false, false);
 850                 break;
 851         case SO_TIMESTAMP_NEW:
 852                 __sock_set_timestamps(sk, valbool, true, false);
 853                 break;
 854         case SO_TIMESTAMPNS_OLD:
 855                 __sock_set_timestamps(sk, valbool, false, true);
 856                 break;
 857         case SO_TIMESTAMPNS_NEW:
 858                 __sock_set_timestamps(sk, valbool, true, true);
 859                 break;
 860         }
 861 }
 862
 863 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 864 {
 865         struct net *net = sock_net(sk);
 866         struct net_device *dev = NULL;
 867         bool match = false;
 868         int *vclock_index;
 869         int i, num;
 870
 871         if (sk->sk_bound_dev_if)
 872                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 873
 874         if (!dev) {
 875                 pr_err("%s: sock not bind to device\n", __func__);
 876                 return -EOPNOTSUPP;
 877         }
 878
 879         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 880         dev_put(dev);
 881
 882         for (i = 0; i < num; i++) {
 883                 if (*(vclock_index + i) == phc_index) {
 884                         match = true;
 885                         break;
 886                 }
 887         }
 888
 889         if (num > 0)
 890                 kfree(vclock_index);
 891
 892         if (!match)
 893                 return -EINVAL;
 894
 895         WRITE_ONCE(sk->sk_bind_phc, phc_index);
 896
 897         return 0;
 898 }
 899
 900 int sock_set_timestamping(struct sock *sk, int optname,
 901                           struct so_timestamping timestamping)
 902 {
 903         int val = timestamping.flags;
 904         int ret;
 905
 906         if (val & ~SOF_TIMESTAMPING_MASK)
 907                 return -EINVAL;
 908
 909         if (val & SOF_TIMESTAMPING_OPT_ID &&
 910             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 911                 if (sk_is_tcp(sk)) {
 912                         if ((1 << sk->sk_state) &
 913                             (TCPF_CLOSE | TCPF_LISTEN))
 914                                 return -EINVAL;
 915                         atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 916                 } else {
 917                         atomic_set(&sk->sk_tskey, 0);
 918                 }
 919         }
 920
 921         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 922             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 923                 return -EINVAL;
 924
 925         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 926                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 927                 if (ret)
 928                         return ret;
 929         }
 930
 931         WRITE_ONCE(sk->sk_tsflags, val);
 932         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 933
 934         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 935                 sock_enable_timestamp(sk,
 936                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 937         else
 938                 sock_disable_timestamp(sk,
 939                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 940         return 0;
 941 }
 942
 943 void sock_set_keepalive(struct sock *sk)
 944 {
 945         lock_sock(sk);
 946         if (sk->sk_prot->keepalive)
 947                 sk->sk_prot->keepalive(sk, true);
 948         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 949         release_sock(sk);
 950 }
 951 EXPORT_SYMBOL(sock_set_keepalive);
 952
 953 static void __sock_set_rcvbuf(struct sock *sk, int val)
 954 {
 955         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 956          * as a negative value.
 957          */
 958         val = min_t(int, val, INT_MAX / 2);
 959         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 960
 961         /* We double it on the way in to account for "struct sk_buff" etc.
 962          * overhead.   Applications assume that the SO_RCVBUF setting they make
 963          * will allow that much actual data to be received on that socket.
 964          *
 965          * Applications are unaware that "struct sk_buff" and other overheads
 966          * allocate from the receive buffer during socket buffer allocation.
 967          *
 968          * And after considering the possible alternatives, returning the value
 969          * we actually used in getsockopt is the most desirable behavior.
 970          */
 971         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 972 }
 973
 /**
  * sock_set_rcvbuf - set a socket's receive buffer size
  * @sk: socket to update
  * @val: requested size, passed unchanged to __sock_set_rcvbuf()
  *
  * Locked wrapper around __sock_set_rcvbuf().  No sysctl_rmem_max clamp
  * is applied here, unlike the SO_RCVBUF case of sk_setsockopt().
  */
 void sock_set_rcvbuf(struct sock *sk, int val)
 {
         lock_sock(sk);
         __sock_set_rcvbuf(sk, val);
         release_sock(sk);
 }
 EXPORT_SYMBOL(sock_set_rcvbuf);
 981
 982 static void __sock_set_mark(struct sock *sk, u32 val)
 983 {
 984         if (val != sk->sk_mark) {
 985                 WRITE_ONCE(sk->sk_mark, val);
 986                 sk_dst_reset(sk);
 987         }
 988 }
 989
 990 void sock_set_mark(struct sock *sk, u32 val)
 991 {
 992         lock_sock(sk);
 993         __sock_set_mark(sk, val);
 994         release_sock(sk);
 995 }
 996 EXPORT_SYMBOL(sock_set_mark);
 997
 998 static void sock_release_reserved_memory(struct sock *sk, int bytes)
 999 {
1000         /* Round down bytes to multiple of pages */
1001         bytes = round_down(bytes, PAGE_SIZE);
1002
1003         WARN_ON(bytes > sk->sk_reserved_mem);
1004         WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1005         sk_mem_reclaim(sk);
1006 }
1007
1008 static int sock_reserve_memory(struct sock *sk, int bytes)
1009 {
1010         long allocated;
1011         bool charged;
1012         int pages;
1013
1014         if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1015                 return -EOPNOTSUPP;
1016
1017         if (!bytes)
1018                 return 0;
1019
1020         pages = sk_mem_pages(bytes);
1021
1022         /* pre-charge to memcg */
1023         charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1024                                           GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1025         if (!charged)
1026                 return -ENOMEM;
1027
1028         /* pre-charge to forward_alloc */
1029         sk_memory_allocated_add(sk, pages);
1030         allocated = sk_memory_allocated(sk);
1031         /* If the system goes into memory pressure with this
1032          * precharge, give up and return error.
1033          */
1034         if (allocated > sk_prot_mem_limits(sk, 1)) {
1035                 sk_memory_allocated_sub(sk, pages);
1036                 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1037                 return -ENOMEM;
1038         }
1039         sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1040
1041         WRITE_ONCE(sk->sk_reserved_mem,
1042                    sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1043
1044         return 0;
1045 }
1046
/**
 * sockopt_lock_sock - take the socket lock for a setsockopt-style operation
 * @sk: socket to lock
 *
 * When has_current_bpf_ctx() is true the call comes from a bpf program,
 * and bpf has already made sure the socket lock is held before calling
 * setsockopt(), so nothing is done.  Otherwise lock_sock() is called.
 */
void sockopt_lock_sock(struct sock *sk)
{
        if (!has_current_bpf_ctx())
                lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1059
/**
 * sockopt_release_sock - counterpart of sockopt_lock_sock()
 * @sk: socket to unlock
 *
 * Calls release_sock() unless has_current_bpf_ctx() is true, in which
 * case sockopt_lock_sock() did not take the lock either.
 */
void sockopt_release_sock(struct sock *sk)
{
        if (!has_current_bpf_ctx())
                release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1068
1069 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1070 {
1071         return has_current_bpf_ctx() || ns_capable(ns, cap);
1072 }
1073 EXPORT_SYMBOL(sockopt_ns_capable);
1074
1075 bool sockopt_capable(int cap)
1076 {
1077         return has_current_bpf_ctx() || capable(cap);
1078 }
1079 EXPORT_SYMBOL(sockopt_capable);
1080
1081 /*
1082  *      This is meant for all protocols to use and covers goings on
1083  *      at the socket level. Everything here is generic.
      *
      *      sk_setsockopt() sets the option @optname on @sk from the value
      *      at @optval (@optlen bytes).  @level is not examined here.
      *
      *      SO_BINDTODEVICE is passed to sock_setbindtodevice() before any
      *      length check or locking.  Every other option needs @optlen to be
      *      at least sizeof(int); the leading int is copied into 'val'
      *      ('valbool' is its truth value) and the switch runs between
      *      sockopt_lock_sock() and sockopt_release_sock().  Options with a
      *      larger payload copy it again from @optval inside their case.
      *
      *      Returns 0 on success or a negative errno: -EINVAL for a short
      *      @optlen, -EFAULT if the int cannot be copied, -ENOPROTOOPT for
      *      an option not handled here, otherwise whatever the selected
      *      case leaves in 'ret'.
1084  */
1085
1086 int sk_setsockopt(struct sock *sk, int level, int optname,
1087                   sockptr_t optval, unsigned int optlen)
1088 {
1089         struct so_timestamping timestamping;
1090         struct socket *sock = sk->sk_socket;
1091         struct sock_txtime sk_txtime;
1092         int val;
1093         int valbool;
1094         struct linger ling;
1095         int ret = 0;
1096
1097         /*
1098          *      Options without arguments
1099          */
1100
1101         if (optname == SO_BINDTODEVICE)
1102                 return sock_setbindtodevice(sk, optval, optlen);
1103
             /* Every remaining option starts with (at least) an int. */
1104         if (optlen < sizeof(int))
1105                 return -EINVAL;
1106
1107         if (copy_from_sockptr(&val, optval, sizeof(val)))
1108                 return -EFAULT;
1109
1110         valbool = val ? 1 : 0;
1111
1112         sockopt_lock_sock(sk);
1113
1114         switch (optname) {
1115         case SO_DEBUG:
1116                 if (val && !sockopt_capable(CAP_NET_ADMIN))
1117                         ret = -EACCES;
1118                 else
1119                         sock_valbool_flag(sk, SOCK_DBG, valbool);
1120                 break;
1121         case SO_REUSEADDR:
1122                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1123                 break;
1124         case SO_REUSEPORT:
1125                 sk->sk_reuseport = valbool;
1126                 break;
             /* These four are answered by sk_getsockopt() but cannot be set. */
1127         case SO_TYPE:
1128         case SO_PROTOCOL:
1129         case SO_DOMAIN:
1130         case SO_ERROR:
1131                 ret = -ENOPROTOOPT;
1132                 break;
1133         case SO_DONTROUTE:
1134                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1135                 sk_dst_reset(sk);
1136                 break;
1137         case SO_BROADCAST:
1138                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1139                 break;
1140         case SO_SNDBUF:
1141                 /* Don't error on this BSD doesn't and if you think
1142                  * about it this is right. Otherwise apps have to
1143                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1144                  * are treated in BSD as hints
1145                  */
                     /* Compared as u32, so a negative val is clamped to
                      * sysctl_wmem_max rather than kept.
                      */
1146                 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1147 set_sndbuf:
1148                 /* Ensure val * 2 fits into an int, to prevent max_t()
1149                  * from treating it as a negative value.
1150                  */
1151                 val = min_t(int, val, INT_MAX / 2);
1152                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1153                 WRITE_ONCE(sk->sk_sndbuf,
1154                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
1155                 /* Wake up sending tasks if we upped the value. */
1156                 sk->sk_write_space(sk);
1157                 break;
1158
1159         case SO_SNDBUFFORCE:
1160                 if (!sockopt_capable(CAP_NET_ADMIN)) {
1161                         ret = -EPERM;
1162                         break;
1163                 }
1164
1165                 /* No negative values (to prevent underflow, as val will be
1166                  * multiplied by 2).
1167                  */
1168                 if (val < 0)
1169                         val = 0;
                     /* Joins SO_SNDBUF after its sysctl_wmem_max clamp. */
1170                 goto set_sndbuf;
1171
1172         case SO_RCVBUF:
1173                 /* Don't error on this BSD doesn't and if you think
1174                  * about it this is right. Otherwise apps have to
1175                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1176                  * are treated in BSD as hints
1177                  */
1178                 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1179                 break;
1180
1181         case SO_RCVBUFFORCE:
1182                 if (!sockopt_capable(CAP_NET_ADMIN)) {
1183                         ret = -EPERM;
1184                         break;
1185                 }
1186
1187                 /* No negative values (to prevent underflow, as val will be
1188                  * multiplied by 2).
1189                  */
1190                 __sock_set_rcvbuf(sk, max(val, 0));
1191                 break;
1192
1193         case SO_KEEPALIVE:
1194                 if (sk->sk_prot->keepalive)
1195                         sk->sk_prot->keepalive(sk, valbool);
1196                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1197                 break;
1198
1199         case SO_OOBINLINE:
1200                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1201                 break;
1202
1203         case SO_NO_CHECK:
1204                 sk->sk_no_check_tx = valbool;
1205                 break;
1206
1207         case SO_PRIORITY:
                     /* 0..6 is open to everyone; anything else needs
                      * CAP_NET_RAW or CAP_NET_ADMIN in the user namespace
                      * of the socket's netns.
                      */
1208                 if ((val >= 0 && val <= 6) ||
1209                     sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1210                     sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1211                         WRITE_ONCE(sk->sk_priority, val);
1212                 else
1213                         ret = -EPERM;
1214                 break;
1215
1216         case SO_LINGER:
1217                 if (optlen < sizeof(ling)) {
1218                         ret = -EINVAL;  /* 1003.1g */
1219                         break;
1220                 }
1221                 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1222                         ret = -EFAULT;
1223                         break;
1224                 }
1225                 if (!ling.l_onoff) {
1226                         sock_reset_flag(sk, SOCK_LINGER);
1227                 } else {
1228                         unsigned long t_sec = ling.l_linger;
1229
                             /* Seconds are scaled by HZ, saturating at
                              * MAX_SCHEDULE_TIMEOUT.
                              */
1230                         if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1231                                 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1232                         else
1233                                 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1234                         sock_set_flag(sk, SOCK_LINGER);
1235                 }
1236                 break;
1237
1238         case SO_BSDCOMPAT:
                     /* Accepted, but has no effect. */
1239                 break;
1240
1241         case SO_PASSCRED:
                     /* NOTE(review): 'sock' (sk->sk_socket) is dereferenced
                      * unchecked here and in SO_PASSSEC/SO_PEEK_OFF, while
                      * SO_RCVLOWAT tests it for NULL - confirm it cannot be
                      * NULL on these paths.
                      */
1242                 if (valbool)
1243                         set_bit(SOCK_PASSCRED, &sock->flags);
1244                 else
1245                         clear_bit(SOCK_PASSCRED, &sock->flags);
1246                 break;
1247
1248         case SO_TIMESTAMP_OLD:
1249         case SO_TIMESTAMP_NEW:
1250         case SO_TIMESTAMPNS_OLD:
1251         case SO_TIMESTAMPNS_NEW:
1252                 sock_set_timestamp(sk, optname, valbool);
1253                 break;
1254
1255         case SO_TIMESTAMPING_NEW:
1256         case SO_TIMESTAMPING_OLD:
                     /* Exactly sizeof(struct so_timestamping) bytes: read the
                      * whole struct.  Any other length: treat the int as the
                      * flags and leave bind_phc zero.
                      */
1257                 if (optlen == sizeof(timestamping)) {
1258                         if (copy_from_sockptr(&timestamping, optval,
1259                                               sizeof(timestamping))) {
1260                                 ret = -EFAULT;
1261                                 break;
1262                         }
1263                 } else {
1264                         memset(&timestamping, 0, sizeof(timestamping));
1265                         timestamping.flags = val;
1266                 }
1267                 ret = sock_set_timestamping(sk, optname, timestamping);
1268                 break;
1269
1270         case SO_RCVLOWAT:
                     /* Negative is turned into INT_MAX.  Without a
                      * ->set_rcvlowat handler, zero is stored as 1.
                      */
1271                 if (val < 0)
1272                         val = INT_MAX;
1273                 if (sock && sock->ops->set_rcvlowat)
1274                         ret = sock->ops->set_rcvlowat(sk, val);
1275                 else
1276                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1277                 break;
1278
1279         case SO_RCVTIMEO_OLD:
1280         case SO_RCVTIMEO_NEW:
1281                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1282                                        optlen, optname == SO_RCVTIMEO_OLD);
1283                 break;
1284
1285         case SO_SNDTIMEO_OLD:
1286         case SO_SNDTIMEO_NEW:
1287                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1288                                        optlen, optname == SO_SNDTIMEO_OLD);
1289                 break;
1290
1291         case SO_ATTACH_FILTER: {
1292                 struct sock_fprog fprog;
1293
1294                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1295                 if (!ret)
1296                         ret = sk_attach_filter(&fprog, sk);
1297                 break;
1298         }
1299         case SO_ATTACH_BPF:
                     /* The argument must be exactly a u32 (passed on as 'ufd'). */
1300                 ret = -EINVAL;
1301                 if (optlen == sizeof(u32)) {
1302                         u32 ufd;
1303
1304                         ret = -EFAULT;
1305                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1306                                 break;
1307
1308                         ret = sk_attach_bpf(ufd, sk);
1309                 }
1310                 break;
1311
1312         case SO_ATTACH_REUSEPORT_CBPF: {
1313                 struct sock_fprog fprog;
1314
1315                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1316                 if (!ret)
1317                         ret = sk_reuseport_attach_filter(&fprog, sk);
1318                 break;
1319         }
1320         case SO_ATTACH_REUSEPORT_EBPF:
1321                 ret = -EINVAL;
1322                 if (optlen == sizeof(u32)) {
1323                         u32 ufd;
1324
1325                         ret = -EFAULT;
1326                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1327                                 break;
1328
1329                         ret = sk_reuseport_attach_bpf(ufd, sk);
1330                 }
1331                 break;
1332
1333         case SO_DETACH_REUSEPORT_BPF:
1334                 ret = reuseport_detach_prog(sk);
1335                 break;
1336
1337         case SO_DETACH_FILTER:
1338                 ret = sk_detach_filter(sk);
1339                 break;
1340
1341         case SO_LOCK_FILTER:
                     /* Once set, the lock flag cannot be cleared again. */
1342                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1343                         ret = -EPERM;
1344                 else
1345                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1346                 break;
1347
1348         case SO_PASSSEC:
1349                 if (valbool)
1350                         set_bit(SOCK_PASSSEC, &sock->flags);
1351                 else
1352                         clear_bit(SOCK_PASSSEC, &sock->flags);
1353                 break;
1354         case SO_MARK:
1355                 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1356                     !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1357                         ret = -EPERM;
1358                         break;
1359                 }
1360
1361                 __sock_set_mark(sk, val);
1362                 break;
1363         case SO_RCVMARK:
1364                 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1365                 break;
1366
1367         case SO_RXQ_OVFL:
1368                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1369                 break;
1370
1371         case SO_WIFI_STATUS:
1372                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1373                 break;
1374
1375         case SO_PEEK_OFF:
1376                 if (sock->ops->set_peek_off)
1377                         ret = sock->ops->set_peek_off(sk, val);
1378                 else
1379                         ret = -EOPNOTSUPP;
1380                 break;
1381
1382         case SO_NOFCS:
1383                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1384                 break;
1385
1386         case SO_SELECT_ERR_QUEUE:
1387                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1388                 break;
1389
1390 #ifdef CONFIG_NET_RX_BUSY_POLL
1391         case SO_BUSY_POLL:
1392                 /* allow unprivileged users to decrease the value */
1393                 if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN))
1394                         ret = -EPERM;
1395                 else {
1396                         if (val < 0)
1397                                 ret = -EINVAL;
1398                         else
1399                                 WRITE_ONCE(sk->sk_ll_usec, val);
1400                 }
1401                 break;
1402         case SO_PREFER_BUSY_POLL:
1403                 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1404                         ret = -EPERM;
1405                 else
1406                         WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1407                 break;
1408         case SO_BUSY_POLL_BUDGET:
                     /* Raising the budget needs CAP_NET_ADMIN; the value
                      * itself must lie in 0..U16_MAX.
                      */
1409                 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1410                         ret = -EPERM;
1411                 } else {
1412                         if (val < 0 || val > U16_MAX)
1413                                 ret = -EINVAL;
1414                         else
1415                                 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1416                 }
1417                 break;
1418 #endif
1419
1420         case SO_MAX_PACING_RATE:
1421                 {
                     /* An all-ones int is widened to an all-ones unsigned
                      * long.  Where unsigned long is wider than int and the
                      * caller supplied that many bytes, the full-width value
                      * is read instead.
                      */
1422                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1423
1424                 if (sizeof(ulval) != sizeof(val) &&
1425                     optlen >= sizeof(ulval) &&
1426                     copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1427                         ret = -EFAULT;
1428                         break;
1429                 }
1430                 if (ulval != ~0UL)
1431                         cmpxchg(&sk->sk_pacing_status,
1432                                 SK_PACING_NONE,
1433                                 SK_PACING_NEEDED);
1434                 /* Pairs with READ_ONCE() from sk_getsockopt() */
1435                 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1436                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1437                 break;
1438                 }
1439         case SO_INCOMING_CPU:
1440                 reuseport_update_incoming_cpu(sk, val);
1441                 break;
1442
1443         case SO_CNX_ADVICE:
1444                 if (val == 1)
1445                         dst_negative_advice(sk);
1446                 break;
1447
1448         case SO_ZEROCOPY:
                     /* Accepted only for TCP and UDP sockets of family
                      * PF_INET/PF_INET6, and for PF_RDS sockets; val must
                      * be 0 or 1.
                      */
1449                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1450                         if (!(sk_is_tcp(sk) ||
1451                               (sk->sk_type == SOCK_DGRAM &&
1452                                sk->sk_protocol == IPPROTO_UDP)))
1453                                 ret = -EOPNOTSUPP;
1454                 } else if (sk->sk_family != PF_RDS) {
1455                         ret = -EOPNOTSUPP;
1456                 }
1457                 if (!ret) {
1458                         if (val < 0 || val > 1)
1459                                 ret = -EINVAL;
1460                         else
1461                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1462                 }
1463                 break;
1464
1465         case SO_TXTIME:
1466                 if (optlen != sizeof(struct sock_txtime)) {
1467                         ret = -EINVAL;
1468                         break;
1469                 } else if (copy_from_sockptr(&sk_txtime, optval,
1470                            sizeof(struct sock_txtime))) {
1471                         ret = -EFAULT;
1472                         break;
1473                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1474                         ret = -EINVAL;
1475                         break;
1476                 }
1477                 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1478                  * scheduler has enough safe guards.
1479                  */
1480                 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1481                     !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1482                         ret = -EPERM;
1483                         break;
1484                 }
1485                 sock_valbool_flag(sk, SOCK_TXTIME, true);
1486                 sk->sk_clockid = sk_txtime.clockid;
1487                 sk->sk_txtime_deadline_mode =
1488                         !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1489                 sk->sk_txtime_report_errors =
1490                         !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1491                 break;
1492
1493         case SO_BINDTOIFINDEX:
1494                 ret = sock_bindtoindex_locked(sk, val);
1495                 break;
1496
1497         case SO_BUF_LOCK:
1498                 if (val & ~SOCK_BUF_LOCK_MASK) {
1499                         ret = -EINVAL;
1500                         break;
1501                 }
                     /* Replace only the SOCK_BUF_LOCK_MASK bits; the other
                      * userlocks bits are kept.
                      */
1502                 sk->sk_userlocks = val | (sk->sk_userlocks &
1503                                           ~SOCK_BUF_LOCK_MASK);
1504                 break;
1505
1506         case SO_RESERVE_MEM:
1507         {
1508                 int delta;
1509
1510                 if (val < 0) {
1511                         ret = -EINVAL;
1512                         break;
1513                 }
1514
                     /* val is the requested total; only the difference from
                      * the current reservation is released or reserved.
                      */
1515                 delta = val - sk->sk_reserved_mem;
1516                 if (delta < 0)
1517                         sock_release_reserved_memory(sk, -delta);
1518                 else
1519                         ret = sock_reserve_memory(sk, delta);
1520                 break;
1521         }
1522
1523         case SO_TXREHASH:
1524                 if (val < -1 || val > 1) {
1525                         ret = -EINVAL;
1526                         break;
1527                 }
                     /* Presumably -1 is what matches SOCK_TXREHASH_DEFAULT
                      * after the u8 cast (TODO confirm); the per-netns sysctl
                      * value is then used instead.
                      */
1528                 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1529                         val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1530                 /* Paired with READ_ONCE() in tcp_rtx_synack()
1531                  * and sk_getsockopt().
1532                  */
1533                 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1534                 break;
1535
1536         default:
1537                 ret = -ENOPROTOOPT;
1538                 break;
1539         }
1540         sockopt_release_sock(sk);
1541         return ret;
1542 }
1543
1544 int sock_setsockopt(struct socket *sock, int level, int optname,
1545                     sockptr_t optval, unsigned int optlen)
1546 {
1547         return sk_setsockopt(sock->sk, level, optname,
1548                              optval, optlen);
1549 }
1550 EXPORT_SYMBOL(sock_setsockopt);
1551
1552 static const struct cred *sk_get_peer_cred(struct sock *sk)
1553 {
1554         const struct cred *cred;
1555
1556         spin_lock(&sk->sk_peer_lock);
1557         cred = get_cred(sk->sk_peer_cred);
1558         spin_unlock(&sk->sk_peer_lock);
1559
1560         return cred;
1561 }
1562
1563 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1564                           struct ucred *ucred)
1565 {
1566         ucred->pid = pid_vnr(pid);
1567         ucred->uid = ucred->gid = -1;
1568         if (cred) {
1569                 struct user_namespace *current_ns = current_user_ns();
1570
1571                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1572                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1573         }
1574 }
1575
1576 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1577 {
1578         struct user_namespace *user_ns = current_user_ns();
1579         int i;
1580
1581         for (i = 0; i < src->ngroups; i++) {
1582                 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1583
1584                 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1585                         return -EFAULT;
1586         }
1587
1588         return 0;
1589 }
1590
1591 int sk_getsockopt(struct sock *sk, int level, int optname,
1592                   sockptr_t optval, sockptr_t optlen)
1593 {
1594         struct socket *sock = sk->sk_socket;
1595
1596         union {
1597                 int val;
1598                 u64 val64;
1599                 unsigned long ulval;
1600                 struct linger ling;
1601                 struct old_timeval32 tm32;
1602                 struct __kernel_old_timeval tm;
1603                 struct  __kernel_sock_timeval stm;
1604                 struct sock_txtime txtime;
1605                 struct so_timestamping timestamping;
1606         } v;
1607
1608         int lv = sizeof(int);
1609         int len;
1610
1611         if (copy_from_sockptr(&len, optlen, sizeof(int)))
1612                 return -EFAULT;
1613         if (len < 0)
1614                 return -EINVAL;
1615
1616         memset(&v, 0, sizeof(v));
1617
1618         switch (optname) {
1619         case SO_DEBUG:
1620                 v.val = sock_flag(sk, SOCK_DBG);
1621                 break;
1622
1623         case SO_DONTROUTE:
1624                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1625                 break;
1626
1627         case SO_BROADCAST:
1628                 v.val = sock_flag(sk, SOCK_BROADCAST);
1629                 break;
1630
1631         case SO_SNDBUF:
1632                 v.val = READ_ONCE(sk->sk_sndbuf);
1633                 break;
1634
1635         case SO_RCVBUF:
1636                 v.val = READ_ONCE(sk->sk_rcvbuf);
1637                 break;
1638
1639         case SO_REUSEADDR:
1640                 v.val = sk->sk_reuse;
1641                 break;
1642
1643         case SO_REUSEPORT:
1644                 v.val = sk->sk_reuseport;
1645                 break;
1646
1647         case SO_KEEPALIVE:
1648                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1649                 break;
1650
1651         case SO_TYPE:
1652                 v.val = sk->sk_type;
1653                 break;
1654
1655         case SO_PROTOCOL:
1656                 v.val = sk->sk_protocol;
1657                 break;
1658
1659         case SO_DOMAIN:
1660                 v.val = sk->sk_family;
1661                 break;
1662
1663         case SO_ERROR:
1664                 v.val = -sock_error(sk);
1665                 if (v.val == 0)
1666                         v.val = xchg(&sk->sk_err_soft, 0);
1667                 break;
1668
1669         case SO_OOBINLINE:
1670                 v.val = sock_flag(sk, SOCK_URGINLINE);
1671                 break;
1672
1673         case SO_NO_CHECK:
1674                 v.val = sk->sk_no_check_tx;
1675                 break;
1676
1677         case SO_PRIORITY:
1678                 v.val = READ_ONCE(sk->sk_priority);
1679                 break;
1680
1681         case SO_LINGER:
1682                 lv              = sizeof(v.ling);
1683                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1684                 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1685                 break;
1686
1687         case SO_BSDCOMPAT:
1688                 break;
1689
1690         case SO_TIMESTAMP_OLD:
1691                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1692                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1693                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1694                 break;
1695
1696         case SO_TIMESTAMPNS_OLD:
1697                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1698                 break;
1699
1700         case SO_TIMESTAMP_NEW:
1701                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1702                 break;
1703
1704         case SO_TIMESTAMPNS_NEW:
1705                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1706                 break;
1707
1708         case SO_TIMESTAMPING_OLD:
1709         case SO_TIMESTAMPING_NEW:
1710                 lv = sizeof(v.timestamping);
1711                 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1712                  * returning the flags when they were set through the same option.
1713                  * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1714                  */
1715                 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1716                         v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1717                         v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1718                 }
1719                 break;
1720
1721         case SO_RCVTIMEO_OLD:
1722         case SO_RCVTIMEO_NEW:
1723                 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1724                                       SO_RCVTIMEO_OLD == optname);
1725                 break;
1726
1727         case SO_SNDTIMEO_OLD:
1728         case SO_SNDTIMEO_NEW:
1729                 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1730                                       SO_SNDTIMEO_OLD == optname);
1731                 break;
1732
1733         case SO_RCVLOWAT:
1734                 v.val = READ_ONCE(sk->sk_rcvlowat);
1735                 break;
1736
1737         case SO_SNDLOWAT:
1738                 v.val = 1;
1739                 break;
1740
1741         case SO_PASSCRED:
1742                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1743                 break;
1744
1745         case SO_PEERCRED:
1746         {
1747                 struct ucred peercred;
1748                 if (len > sizeof(peercred))
1749                         len = sizeof(peercred);
1750
1751                 spin_lock(&sk->sk_peer_lock);
1752                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1753                 spin_unlock(&sk->sk_peer_lock);
1754
1755                 if (copy_to_sockptr(optval, &peercred, len))
1756                         return -EFAULT;
1757                 goto lenout;
1758         }
1759
1760         case SO_PEERGROUPS:
1761         {
1762                 const struct cred *cred;
1763                 int ret, n;
1764
1765                 cred = sk_get_peer_cred(sk);
1766                 if (!cred)
1767                         return -ENODATA;
1768
1769                 n = cred->group_info->ngroups;
1770                 if (len < n * sizeof(gid_t)) {
1771                         len = n * sizeof(gid_t);
1772                         put_cred(cred);
1773                         return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1774                 }
1775                 len = n * sizeof(gid_t);
1776
1777                 ret = groups_to_user(optval, cred->group_info);
1778                 put_cred(cred);
1779                 if (ret)
1780                         return ret;
1781                 goto lenout;
1782         }
1783
1784         case SO_PEERNAME:
1785         {
1786                 struct sockaddr_storage address;
1787
1788                 lv = sock->ops->getname(sock, (struct sockaddr *)&address, 2);
1789                 if (lv < 0)
1790                         return -ENOTCONN;
1791                 if (lv < len)
1792                         return -EINVAL;
1793                 if (copy_to_sockptr(optval, &address, len))
1794                         return -EFAULT;
1795                 goto lenout;
1796         }
1797
1798         /* Dubious BSD thing... Probably nobody even uses it, but
1799          * the UNIX standard wants it for whatever reason... -DaveM
1800          */
1801         case SO_ACCEPTCONN:
1802                 v.val = sk->sk_state == TCP_LISTEN;
1803                 break;
1804
1805         case SO_PASSSEC:
1806                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1807                 break;
1808
1809         case SO_PEERSEC:
1810                 return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len);
1811
1812         case SO_MARK:
1813                 v.val = READ_ONCE(sk->sk_mark);
1814                 break;
1815
1816         case SO_RCVMARK:
1817                 v.val = sock_flag(sk, SOCK_RCVMARK);
1818                 break;
1819
1820         case SO_RXQ_OVFL:
1821                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1822                 break;
1823
1824         case SO_WIFI_STATUS:
1825                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1826                 break;
1827
1828         case SO_PEEK_OFF:
1829                 if (!sock->ops->set_peek_off)
1830                         return -EOPNOTSUPP;
1831
1832                 v.val = READ_ONCE(sk->sk_peek_off);
1833                 break;
1834         case SO_NOFCS:
1835                 v.val = sock_flag(sk, SOCK_NOFCS);
1836                 break;
1837
1838         case SO_BINDTODEVICE:
1839                 return sock_getbindtodevice(sk, optval, optlen, len);
1840
1841         case SO_GET_FILTER:
1842                 len = sk_get_filter(sk, optval, len);
1843                 if (len < 0)
1844                         return len;
1845
1846                 goto lenout;
1847
1848         case SO_LOCK_FILTER:
1849                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1850                 break;
1851
1852         case SO_BPF_EXTENSIONS:
1853                 v.val = bpf_tell_extensions();
1854                 break;
1855
1856         case SO_SELECT_ERR_QUEUE:
1857                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1858                 break;
1859
1860 #ifdef CONFIG_NET_RX_BUSY_POLL
1861         case SO_BUSY_POLL:
1862                 v.val = READ_ONCE(sk->sk_ll_usec);
1863                 break;
1864         case SO_PREFER_BUSY_POLL:
1865                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1866                 break;
1867 #endif
1868
1869         case SO_MAX_PACING_RATE:
1870                 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1871                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1872                         lv = sizeof(v.ulval);
1873                         v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1874                 } else {
1875                         /* 32bit version */
1876                         v.val = min_t(unsigned long, ~0U,
1877                                       READ_ONCE(sk->sk_max_pacing_rate));
1878                 }
1879                 break;
1880
1881         case SO_INCOMING_CPU:
1882                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1883                 break;
1884
1885         case SO_MEMINFO:
1886         {
1887                 u32 meminfo[SK_MEMINFO_VARS];
1888
1889                 sk_get_meminfo(sk, meminfo);
1890
1891                 len = min_t(unsigned int, len, sizeof(meminfo));
1892                 if (copy_to_sockptr(optval, &meminfo, len))
1893                         return -EFAULT;
1894
1895                 goto lenout;
1896         }
1897
1898 #ifdef CONFIG_NET_RX_BUSY_POLL
1899         case SO_INCOMING_NAPI_ID:
1900                 v.val = READ_ONCE(sk->sk_napi_id);
1901
1902                 /* aggregate non-NAPI IDs down to 0 */
1903                 if (v.val < MIN_NAPI_ID)
1904                         v.val = 0;
1905
1906                 break;
1907 #endif
1908
1909         case SO_COOKIE:
1910                 lv = sizeof(u64);
1911                 if (len < lv)
1912                         return -EINVAL;
1913                 v.val64 = sock_gen_cookie(sk);
1914                 break;
1915
1916         case SO_ZEROCOPY:
1917                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1918                 break;
1919
1920         case SO_TXTIME:
1921                 lv = sizeof(v.txtime);
1922                 v.txtime.clockid = sk->sk_clockid;
1923                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1924                                   SOF_TXTIME_DEADLINE_MODE : 0;
1925                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1926                                   SOF_TXTIME_REPORT_ERRORS : 0;
1927                 break;
1928
1929         case SO_BINDTOIFINDEX:
1930                 v.val = READ_ONCE(sk->sk_bound_dev_if);
1931                 break;
1932
1933         case SO_NETNS_COOKIE:
1934                 lv = sizeof(u64);
1935                 if (len != lv)
1936                         return -EINVAL;
1937                 v.val64 = sock_net(sk)->net_cookie;
1938                 break;
1939
1940         case SO_BUF_LOCK:
1941                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1942                 break;
1943
1944         case SO_RESERVE_MEM:
1945                 v.val = READ_ONCE(sk->sk_reserved_mem);
1946                 break;
1947
1948         case SO_TXREHASH:
1949                 /* Paired with WRITE_ONCE() in sk_setsockopt() */
1950                 v.val = READ_ONCE(sk->sk_txrehash);
1951                 break;
1952
1953         default:
1954                 /* We implement the SO_SNDLOWAT etc to not be settable
1955                  * (1003.1g 7).
1956                  */
1957                 return -ENOPROTOOPT;
1958         }
1959
1960         if (len > lv)
1961                 len = lv;
1962         if (copy_to_sockptr(optval, &v, len))
1963                 return -EFAULT;
1964 lenout:
1965         if (copy_to_sockptr(optlen, &len, sizeof(int)))
1966                 return -EFAULT;
1967         return 0;
1968 }
1969
1970 int sock_getsockopt(struct socket *sock, int level, int optname,
1971                     char __user *optval, int __user *optlen)
1972 {
1973         return sk_getsockopt(sock->sk, level, optname,
1974                              USER_SOCKPTR(optval),
1975                              USER_SOCKPTR(optlen));
1976 }
1977
1978 /*
1979  * Initialize an sk_lock.
1980  *
1981  * (We also register the sk_lock with the lock validator.)
1982  */
1983 static inline void sock_lock_init(struct sock *sk)
1984 {
1985         if (sk->sk_kern_sock)
1986                 sock_lock_init_class_and_name(
1987                         sk,
1988                         af_family_kern_slock_key_strings[sk->sk_family],
1989                         af_family_kern_slock_keys + sk->sk_family,
1990                         af_family_kern_key_strings[sk->sk_family],
1991                         af_family_kern_keys + sk->sk_family);
1992         else
1993                 sock_lock_init_class_and_name(
1994                         sk,
1995                         af_family_slock_key_strings[sk->sk_family],
1996                         af_family_slock_keys + sk->sk_family,
1997                         af_family_key_strings[sk->sk_family],
1998                         af_family_keys + sk->sk_family);
1999 }
2000
2001 /*
2002  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2003  * even temporarly, because of RCU lookups. sk_node should also be left as is.
2004  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2005  */
2006 static void sock_copy(struct sock *nsk, const struct sock *osk)
2007 {
2008         const struct proto *prot = READ_ONCE(osk->sk_prot);
2009 #ifdef CONFIG_SECURITY_NETWORK
2010         void *sptr = nsk->sk_security;
2011 #endif
2012
2013         /* If we move sk_tx_queue_mapping out of the private section,
2014          * we must check if sk_tx_queue_clear() is called after
2015          * sock_copy() in sk_clone_lock().
2016          */
2017         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2018                      offsetof(struct sock, sk_dontcopy_begin) ||
2019                      offsetof(struct sock, sk_tx_queue_mapping) >=
2020                      offsetof(struct sock, sk_dontcopy_end));
2021
2022         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2023
2024         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2025                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2026
2027 #ifdef CONFIG_SECURITY_NETWORK
2028         nsk->sk_security = sptr;
2029         security_sk_clone(osk, nsk);
2030 #endif
2031 }
2032
2033 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2034                 int family)
2035 {
2036         struct sock *sk;
2037         struct kmem_cache *slab;
2038
2039         slab = prot->slab;
2040         if (slab != NULL) {
2041                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2042                 if (!sk)
2043                         return sk;
2044                 if (want_init_on_alloc(priority))
2045                         sk_prot_clear_nulls(sk, prot->obj_size);
2046         } else
2047                 sk = kmalloc(prot->obj_size, priority);
2048
2049         if (sk != NULL) {
2050                 if (security_sk_alloc(sk, family, priority))
2051                         goto out_free;
2052
2053                 if (!try_module_get(prot->owner))
2054                         goto out_free_sec;
2055         }
2056
2057         return sk;
2058
2059 out_free_sec:
2060         security_sk_free(sk);
2061 out_free:
2062         if (slab != NULL)
2063                 kmem_cache_free(slab, sk);
2064         else
2065                 kfree(sk);
2066         return NULL;
2067 }
2068
2069 static void sk_prot_free(struct proto *prot, struct sock *sk)
2070 {
2071         struct kmem_cache *slab;
2072         struct module *owner;
2073
2074         owner = prot->owner;
2075         slab = prot->slab;
2076
2077         cgroup_sk_free(&sk->sk_cgrp_data);
2078         mem_cgroup_sk_free(sk);
2079         security_sk_free(sk);
2080         if (slab != NULL)
2081                 kmem_cache_free(slab, sk);
2082         else
2083                 kfree(sk);
2084         module_put(owner);
2085 }
2086
2087 /**
2088  *      sk_alloc - All socket objects are allocated here
2089  *      @net: the applicable net namespace
2090  *      @family: protocol family
2091  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2092  *      @prot: struct proto associated with this new sock instance
2093  *      @kern: is this to be a kernel socket?
2094  */
2095 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2096                       struct proto *prot, int kern)
2097 {
2098         struct sock *sk;
2099
2100         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2101         if (sk) {
2102                 sk->sk_family = family;
2103                 /*
2104                  * See comment in struct sock definition to understand
2105                  * why we need sk_prot_creator -acme
2106                  */
2107                 sk->sk_prot = sk->sk_prot_creator = prot;
2108                 sk->sk_kern_sock = kern;
2109                 sock_lock_init(sk);
2110                 sk->sk_net_refcnt = kern ? 0 : 1;
2111                 if (likely(sk->sk_net_refcnt)) {
2112                         get_net_track(net, &sk->ns_tracker, priority);
2113                         sock_inuse_add(net, 1);
2114                 }
2115
2116                 sock_net_set(sk, net);
2117                 refcount_set(&sk->sk_wmem_alloc, 1);
2118
2119                 mem_cgroup_sk_alloc(sk);
2120                 cgroup_sk_alloc(&sk->sk_cgrp_data);
2121                 sock_update_classid(&sk->sk_cgrp_data);
2122                 sock_update_netprioidx(&sk->sk_cgrp_data);
2123                 sk_tx_queue_clear(sk);
2124         }
2125
2126         return sk;
2127 }
2128 EXPORT_SYMBOL(sk_alloc);
2129
2130 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2131  * grace period. This is the case for UDP sockets and TCP listeners.
2132  */
2133 static void __sk_destruct(struct rcu_head *head)
2134 {
2135         struct sock *sk = container_of(head, struct sock, sk_rcu);
2136         struct sk_filter *filter;
2137
2138         if (sk->sk_destruct)
2139                 sk->sk_destruct(sk);
2140
2141         filter = rcu_dereference_check(sk->sk_filter,
2142                                        refcount_read(&sk->sk_wmem_alloc) == 0);
2143         if (filter) {
2144                 sk_filter_uncharge(sk, filter);
2145                 RCU_INIT_POINTER(sk->sk_filter, NULL);
2146         }
2147
2148         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2149
2150 #ifdef CONFIG_BPF_SYSCALL
2151         bpf_sk_storage_free(sk);
2152 #endif
2153
2154         if (atomic_read(&sk->sk_omem_alloc))
2155                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2156                          __func__, atomic_read(&sk->sk_omem_alloc));
2157
2158         if (sk->sk_frag.page) {
2159                 put_page(sk->sk_frag.page);
2160                 sk->sk_frag.page = NULL;
2161         }
2162
2163         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2164         put_cred(sk->sk_peer_cred);
2165         put_pid(sk->sk_peer_pid);
2166
2167         if (likely(sk->sk_net_refcnt))
2168                 put_net_track(sock_net(sk), &sk->ns_tracker);
2169         sk_prot_free(sk->sk_prot_creator, sk);
2170 }
2171
2172 void sk_destruct(struct sock *sk)
2173 {
2174         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2175
2176         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2177                 reuseport_detach_sock(sk);
2178                 use_call_rcu = true;
2179         }
2180
2181         if (use_call_rcu)
2182                 call_rcu(&sk->sk_rcu, __sk_destruct);
2183         else
2184                 __sk_destruct(&sk->sk_rcu);
2185 }
2186
2187 static void __sk_free(struct sock *sk)
2188 {
2189         if (likely(sk->sk_net_refcnt))
2190                 sock_inuse_add(sock_net(sk), -1);
2191
2192         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2193                 sock_diag_broadcast_destroy(sk);
2194         else
2195                 sk_destruct(sk);
2196 }
2197
2198 void sk_free(struct sock *sk)
2199 {
2200         /*
2201          * We subtract one from sk_wmem_alloc and can know if
2202          * some packets are still in some tx queue.
2203          * If not null, sock_wfree() will call __sk_free(sk) later
2204          */
2205         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2206                 __sk_free(sk);
2207 }
2208 EXPORT_SYMBOL(sk_free);
2209
2210 static void sk_init_common(struct sock *sk)
2211 {
2212         skb_queue_head_init(&sk->sk_receive_queue);
2213         skb_queue_head_init(&sk->sk_write_queue);
2214         skb_queue_head_init(&sk->sk_error_queue);
2215
2216         rwlock_init(&sk->sk_callback_lock);
2217         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2218                         af_rlock_keys + sk->sk_family,
2219                         af_family_rlock_key_strings[sk->sk_family]);
2220         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2221                         af_wlock_keys + sk->sk_family,
2222                         af_family_wlock_key_strings[sk->sk_family]);
2223         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2224                         af_elock_keys + sk->sk_family,
2225                         af_family_elock_key_strings[sk->sk_family]);
2226         lockdep_set_class_and_name(&sk->sk_callback_lock,
2227                         af_callback_keys + sk->sk_family,
2228                         af_family_clock_key_strings[sk->sk_family]);
2229 }
2230
2231 /**
2232  *      sk_clone_lock - clone a socket, and lock its clone
2233  *      @sk: the socket to clone
2234  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2235  *
2236  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2237  */
2238 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2239 {
2240         struct proto *prot = READ_ONCE(sk->sk_prot);
2241         struct sk_filter *filter;
2242         bool is_charged = true;
2243         struct sock *newsk;
2244
2245         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2246         if (!newsk)
2247                 goto out;
2248
2249         sock_copy(newsk, sk);
2250
2251         newsk->sk_prot_creator = prot;
2252
2253         /* SANITY */
2254         if (likely(newsk->sk_net_refcnt)) {
2255                 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2256                 sock_inuse_add(sock_net(newsk), 1);
2257         }
2258         sk_node_init(&newsk->sk_node);
2259         sock_lock_init(newsk);
2260         bh_lock_sock(newsk);
2261         newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2262         newsk->sk_backlog.len = 0;
2263
2264         atomic_set(&newsk->sk_rmem_alloc, 0);
2265
2266         /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2267         refcount_set(&newsk->sk_wmem_alloc, 1);
2268
2269         atomic_set(&newsk->sk_omem_alloc, 0);
2270         sk_init_common(newsk);
2271
2272         newsk->sk_dst_cache     = NULL;
2273         newsk->sk_dst_pending_confirm = 0;
2274         newsk->sk_wmem_queued   = 0;
2275         newsk->sk_forward_alloc = 0;
2276         newsk->sk_reserved_mem  = 0;
2277         atomic_set(&newsk->sk_drops, 0);
2278         newsk->sk_send_head     = NULL;
2279         newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2280         atomic_set(&newsk->sk_zckey, 0);
2281
2282         sock_reset_flag(newsk, SOCK_DONE);
2283
2284         /* sk->sk_memcg will be populated at accept() time */
2285         newsk->sk_memcg = NULL;
2286
2287         cgroup_sk_clone(&newsk->sk_cgrp_data);
2288
2289         rcu_read_lock();
2290         filter = rcu_dereference(sk->sk_filter);
2291         if (filter != NULL)
2292                 /* though it's an empty new sock, the charging may fail
2293                  * if sysctl_optmem_max was changed between creation of
2294                  * original socket and cloning
2295                  */
2296                 is_charged = sk_filter_charge(newsk, filter);
2297         RCU_INIT_POINTER(newsk->sk_filter, filter);
2298         rcu_read_unlock();
2299
2300         if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2301                 /* We need to make sure that we don't uncharge the new
2302                  * socket if we couldn't charge it in the first place
2303                  * as otherwise we uncharge the parent's filter.
2304                  */
2305                 if (!is_charged)
2306                         RCU_INIT_POINTER(newsk->sk_filter, NULL);
2307                 sk_free_unlock_clone(newsk);
2308                 newsk = NULL;
2309                 goto out;
2310         }
2311         RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2312
2313         if (bpf_sk_storage_clone(sk, newsk)) {
2314                 sk_free_unlock_clone(newsk);
2315                 newsk = NULL;
2316                 goto out;
2317         }
2318
2319         /* Clear sk_user_data if parent had the pointer tagged
2320          * as not suitable for copying when cloning.
2321          */
2322         if (sk_user_data_is_nocopy(newsk))
2323                 newsk->sk_user_data = NULL;
2324
2325         newsk->sk_err      = 0;
2326         newsk->sk_err_soft = 0;
2327         newsk->sk_priority = 0;
2328         newsk->sk_incoming_cpu = raw_smp_processor_id();
2329
2330         /* Before updating sk_refcnt, we must commit prior changes to memory
2331          * (Documentation/RCU/rculist_nulls.rst for details)
2332          */
2333         smp_wmb();
2334         refcount_set(&newsk->sk_refcnt, 2);
2335
2336         /* Increment the counter in the same struct proto as the master
2337          * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2338          * is the same as sk->sk_prot->socks, as this field was copied
2339          * with memcpy).
2340          *
2341          * This _changes_ the previous behaviour, where
2342          * tcp_create_openreq_child always was incrementing the
2343          * equivalent to tcp_prot->socks (inet_sock_nr), so this have
2344          * to be taken into account in all callers. -acme
2345          */
2346         sk_refcnt_debug_inc(newsk);
2347         sk_set_socket(newsk, NULL);
2348         sk_tx_queue_clear(newsk);
2349         RCU_INIT_POINTER(newsk->sk_wq, NULL);
2350
2351         if (newsk->sk_prot->sockets_allocated)
2352                 sk_sockets_allocated_inc(newsk);
2353
2354         if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2355                 net_enable_timestamp();
2356 out:
2357         return newsk;
2358 }
2359 EXPORT_SYMBOL_GPL(sk_clone_lock);
2360
2361 void sk_free_unlock_clone(struct sock *sk)
2362 {
2363         /* It is still raw copy of parent, so invalidate
2364          * destructor and make plain sk_free() */
2365         sk->sk_destruct = NULL;
2366         bh_unlock_sock(sk);
2367         sk_free(sk);
2368 }
2369 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2370
2371 static void sk_trim_gso_size(struct sock *sk)
2372 {
2373         if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
2374                 return;
2375 #if IS_ENABLED(CONFIG_IPV6)
2376         if (sk->sk_family == AF_INET6 &&
2377             sk_is_tcp(sk) &&
2378             !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
2379                 return;
2380 #endif
2381         sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
2382 }
2383
2384 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2385 {
2386         u32 max_segs = 1;
2387
2388         sk->sk_route_caps = dst->dev->features;
2389         if (sk_is_tcp(sk))
2390                 sk->sk_route_caps |= NETIF_F_GSO;
2391         if (sk->sk_route_caps & NETIF_F_GSO)
2392                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2393         if (unlikely(sk->sk_gso_disabled))
2394                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2395         if (sk_can_gso(sk)) {
2396                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2397                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2398                 } else {
2399                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2400                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2401                         sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2402                         sk_trim_gso_size(sk);
2403                         sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
2404                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2405                         max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2406                 }
2407         }
2408         sk->sk_gso_max_segs = max_segs;
2409         sk_dst_set(sk, dst);
2410 }
2411 EXPORT_SYMBOL_GPL(sk_setup_caps);
2412
2413 /*
2414  *      Simple resource managers for sockets.
2415  */
2416
2417
2418 /*
2419  * Write buffer destructor automatically called from kfree_skb.
2420  */
2421 void sock_wfree(struct sk_buff *skb)
2422 {
2423         struct sock *sk = skb->sk;
2424         unsigned int len = skb->truesize;
2425         bool free;
2426
2427         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2428                 if (sock_flag(sk, SOCK_RCU_FREE) &&
2429                     sk->sk_write_space == sock_def_write_space) {
2430                         rcu_read_lock();
2431                         free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2432                         sock_def_write_space_wfree(sk);
2433                         rcu_read_unlock();
2434                         if (unlikely(free))
2435                                 __sk_free(sk);
2436                         return;
2437                 }
2438
2439                 /*
2440                  * Keep a reference on sk_wmem_alloc, this will be released
2441                  * after sk_write_space() call
2442                  */
2443                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2444                 sk->sk_write_space(sk);
2445                 len = 1;
2446         }
2447         /*
2448          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2449          * could not do because of in-flight packets
2450          */
2451         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2452                 __sk_free(sk);
2453 }
2454 EXPORT_SYMBOL(sock_wfree);
2455
2456 /* This variant of sock_wfree() is used by TCP,
2457  * since it sets SOCK_USE_WRITE_QUEUE.
2458  */
2459 void __sock_wfree(struct sk_buff *skb)
2460 {
2461         struct sock *sk = skb->sk;
2462
2463         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2464                 __sk_free(sk);
2465 }
2466
2467 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2468 {
2469         skb_orphan(skb);
2470         skb->sk = sk;
2471 #ifdef CONFIG_INET
2472         if (unlikely(!sk_fullsock(sk))) {
2473                 skb->destructor = sock_edemux;
2474                 sock_hold(sk);
2475                 return;
2476         }
2477 #endif
2478         skb->destructor = sock_wfree;
2479         skb_set_hash_from_sk(skb, sk);
2480         /*
2481          * We used to take a refcount on sk, but following operation
2482          * is enough to guarantee sk_free() wont free this sock until
2483          * all in-flight packets are completed
2484          */
2485         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2486 }
2487 EXPORT_SYMBOL(skb_set_owner_w);
2488
2489 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2490 {
2491 #ifdef CONFIG_TLS_DEVICE
2492         /* Drivers depend on in-order delivery for crypto offload,
2493          * partial orphan breaks out-of-order-OK logic.
2494          */
2495         if (skb->decrypted)
2496                 return false;
2497 #endif
2498         return (skb->destructor == sock_wfree ||
2499                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2500 }
2501
2502 /* This helper is used by netem, as it can hold packets in its
2503  * delay queue. We want to allow the owner socket to send more
2504  * packets, as if they were already TX completed by a typical driver.
2505  * But we also want to keep skb->sk set because some packet schedulers
2506  * rely on it (sch_fq for example).
2507  */
2508 void skb_orphan_partial(struct sk_buff *skb)
2509 {
2510         if (skb_is_tcp_pure_ack(skb))
2511                 return;
2512
2513         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2514                 return;
2515
2516         skb_orphan(skb);
2517 }
2518 EXPORT_SYMBOL(skb_orphan_partial);
2519
2520 /*
2521  * Read buffer destructor automatically called from kfree_skb.
2522  */
2523 void sock_rfree(struct sk_buff *skb)
2524 {
2525         struct sock *sk = skb->sk;
2526         unsigned int len = skb->truesize;
2527
2528         atomic_sub(len, &sk->sk_rmem_alloc);
2529         sk_mem_uncharge(sk, len);
2530 }
2531 EXPORT_SYMBOL(sock_rfree);
2532
2533 /*
2534  * Buffer destructor for skbs that are not used directly in read or write
2535  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2536  */
2537 void sock_efree(struct sk_buff *skb)
2538 {
2539         sock_put(skb->sk);
2540 }
2541 EXPORT_SYMBOL(sock_efree);
2542
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets. The sock is only put when
 * sk_is_refcounted() reports that a reference is held.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2554
2555 kuid_t sock_i_uid(struct sock *sk)
2556 {
2557         kuid_t uid;
2558
2559         read_lock_bh(&sk->sk_callback_lock);
2560         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2561         read_unlock_bh(&sk->sk_callback_lock);
2562         return uid;
2563 }
2564 EXPORT_SYMBOL(sock_i_uid);
2565
2566 unsigned long __sock_i_ino(struct sock *sk)
2567 {
2568         unsigned long ino;
2569
2570         read_lock(&sk->sk_callback_lock);
2571         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2572         read_unlock(&sk->sk_callback_lock);
2573         return ino;
2574 }
2575 EXPORT_SYMBOL(__sock_i_ino);
2576
/* Wrapper around __sock_i_ino() that runs it with bottom halves
 * disabled and returns its result.
 */
unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long inode_nr;

        local_bh_disable();
        inode_nr = __sock_i_ino(sk);
        local_bh_enable();

        return inode_nr;
}
EXPORT_SYMBOL(sock_i_ino);
2587
2588 /*
2589  * Allocate a skb from the socket's send buffer.
2590  */
2591 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2592                              gfp_t priority)
2593 {
2594         if (force ||
2595             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2596                 struct sk_buff *skb = alloc_skb(size, priority);
2597
2598                 if (skb) {
2599                         skb_set_owner_w(skb, sk);
2600                         return skb;
2601                 }
2602         }
2603         return NULL;
2604 }
2605 EXPORT_SYMBOL(sock_wmalloc);
2606
2607 static void sock_ofree(struct sk_buff *skb)
2608 {
2609         struct sock *sk = skb->sk;
2610
2611         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2612 }
2613
2614 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2615                              gfp_t priority)
2616 {
2617         struct sk_buff *skb;
2618
2619         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2620         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2621             READ_ONCE(sysctl_optmem_max))
2622                 return NULL;
2623
2624         skb = alloc_skb(size, priority);
2625         if (!skb)
2626                 return NULL;
2627
2628         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2629         skb->sk = sk;
2630         skb->destructor = sock_ofree;
2631         return skb;
2632 }
2633
2634 /*
2635  * Allocate a memory block from the socket's option memory buffer.
2636  */
2637 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2638 {
2639         int optmem_max = READ_ONCE(sysctl_optmem_max);
2640
2641         if ((unsigned int)size <= optmem_max &&
2642             atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2643                 void *mem;
2644                 /* First do the add, to avoid the race if kmalloc
2645                  * might sleep.
2646                  */
2647                 atomic_add(size, &sk->sk_omem_alloc);
2648                 mem = kmalloc(size, priority);
2649                 if (mem)
2650                         return mem;
2651                 atomic_sub(size, &sk->sk_omem_alloc);
2652         }
2653         return NULL;
2654 }
2655 EXPORT_SYMBOL(sock_kmalloc);
2656
2657 /* Free an option memory block. Note, we actually want the inline
2658  * here as this allows gcc to detect the nullify and fold away the
2659  * condition entirely.
2660  */
2661 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2662                                   const bool nullify)
2663 {
2664         if (WARN_ON_ONCE(!mem))
2665                 return;
2666         if (nullify)
2667                 kfree_sensitive(mem);
2668         else
2669                 kfree(mem);
2670         atomic_sub(size, &sk->sk_omem_alloc);
2671 }
2672
2673 void sock_kfree_s(struct sock *sk, void *mem, int size)
2674 {
2675         __sock_kfree_s(sk, mem, size, false);
2676 }
2677 EXPORT_SYMBOL(sock_kfree_s);
2678
2679 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2680 {
2681         __sock_kfree_s(sk, mem, size, true);
2682 }
2683 EXPORT_SYMBOL(sock_kzfree_s);
2684
2685 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2686    I think, these locks should be removed for datagram sockets.
2687  */
2688 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2689 {
2690         DEFINE_WAIT(wait);
2691
2692         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2693         for (;;) {
2694                 if (!timeo)
2695                         break;
2696                 if (signal_pending(current))
2697                         break;
2698                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2699                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2700                 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2701                         break;
2702                 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2703                         break;
2704                 if (READ_ONCE(sk->sk_err))
2705                         break;
2706                 timeo = schedule_timeout(timeo);
2707         }
2708         finish_wait(sk_sleep(sk), &wait);
2709         return timeo;
2710 }
2711
2712
2713 /*
2714  *      Generic send/receive buffer handlers
2715  */
2716
2717 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2718                                      unsigned long data_len, int noblock,
2719                                      int *errcode, int max_page_order)
2720 {
2721         struct sk_buff *skb;
2722         long timeo;
2723         int err;
2724
2725         timeo = sock_sndtimeo(sk, noblock);
2726         for (;;) {
2727                 err = sock_error(sk);
2728                 if (err != 0)
2729                         goto failure;
2730
2731                 err = -EPIPE;
2732                 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2733                         goto failure;
2734
2735                 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2736                         break;
2737
2738                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2739                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2740                 err = -EAGAIN;
2741                 if (!timeo)
2742                         goto failure;
2743                 if (signal_pending(current))
2744                         goto interrupted;
2745                 timeo = sock_wait_for_wmem(sk, timeo);
2746         }
2747         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2748                                    errcode, sk->sk_allocation);
2749         if (skb)
2750                 skb_set_owner_w(skb, sk);
2751         return skb;
2752
2753 interrupted:
2754         err = sock_intr_errno(timeo);
2755 failure:
2756         *errcode = err;
2757         return NULL;
2758 }
2759 EXPORT_SYMBOL(sock_alloc_send_pskb);
2760
2761 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2762                      struct sockcm_cookie *sockc)
2763 {
2764         u32 tsflags;
2765
2766         switch (cmsg->cmsg_type) {
2767         case SO_MARK:
2768                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2769                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2770                         return -EPERM;
2771                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2772                         return -EINVAL;
2773                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2774                 break;
2775         case SO_TIMESTAMPING_OLD:
2776         case SO_TIMESTAMPING_NEW:
2777                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2778                         return -EINVAL;
2779
2780                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2781                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2782                         return -EINVAL;
2783
2784                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2785                 sockc->tsflags |= tsflags;
2786                 break;
2787         case SCM_TXTIME:
2788                 if (!sock_flag(sk, SOCK_TXTIME))
2789                         return -EINVAL;
2790                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2791                         return -EINVAL;
2792                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2793                 break;
2794         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2795         case SCM_RIGHTS:
2796         case SCM_CREDENTIALS:
2797                 break;
2798         default:
2799                 return -EINVAL;
2800         }
2801         return 0;
2802 }
2803 EXPORT_SYMBOL(__sock_cmsg_send);
2804
2805 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2806                    struct sockcm_cookie *sockc)
2807 {
2808         struct cmsghdr *cmsg;
2809         int ret;
2810
2811         for_each_cmsghdr(cmsg, msg) {
2812                 if (!CMSG_OK(msg, cmsg))
2813                         return -EINVAL;
2814                 if (cmsg->cmsg_level != SOL_SOCKET)
2815                         continue;
2816                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2817                 if (ret)
2818                         return ret;
2819         }
2820         return 0;
2821 }
2822 EXPORT_SYMBOL(sock_cmsg_send);
2823
2824 static void sk_enter_memory_pressure(struct sock *sk)
2825 {
2826         if (!sk->sk_prot->enter_memory_pressure)
2827                 return;
2828
2829         sk->sk_prot->enter_memory_pressure(sk);
2830 }
2831
2832 static void sk_leave_memory_pressure(struct sock *sk)
2833 {
2834         if (sk->sk_prot->leave_memory_pressure) {
2835                 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2836                                      tcp_leave_memory_pressure, sk);
2837         } else {
2838                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2839
2840                 if (memory_pressure && READ_ONCE(*memory_pressure))
2841                         WRITE_ONCE(*memory_pressure, 0);
2842         }
2843 }
2844
/* Static key, false by default. When enabled, skb_page_frag_refill() skips
 * its high-order page allocation attempt and goes straight to a single page.
 */
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2846
2847 /**
2848  * skb_page_frag_refill - check that a page_frag contains enough room
2849  * @sz: minimum size of the fragment we want to get
2850  * @pfrag: pointer to page_frag
2851  * @gfp: priority for memory allocation
2852  *
2853  * Note: While this allocator tries to use high order pages, there is
2854  * no guarantee that allocations succeed. Therefore, @sz MUST be
2855  * less or equal than PAGE_SIZE.
2856  */
2857 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2858 {
2859         if (pfrag->page) {
2860                 if (page_ref_count(pfrag->page) == 1) {
2861                         pfrag->offset = 0;
2862                         return true;
2863                 }
2864                 if (pfrag->offset + sz <= pfrag->size)
2865                         return true;
2866                 put_page(pfrag->page);
2867         }
2868
2869         pfrag->offset = 0;
2870         if (SKB_FRAG_PAGE_ORDER &&
2871             !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2872                 /* Avoid direct reclaim but allow kswapd to wake */
2873                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2874                                           __GFP_COMP | __GFP_NOWARN |
2875                                           __GFP_NORETRY,
2876                                           SKB_FRAG_PAGE_ORDER);
2877                 if (likely(pfrag->page)) {
2878                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2879                         return true;
2880                 }
2881         }
2882         pfrag->page = alloc_page(gfp);
2883         if (likely(pfrag->page)) {
2884                 pfrag->size = PAGE_SIZE;
2885                 return true;
2886         }
2887         return false;
2888 }
2889 EXPORT_SYMBOL(skb_page_frag_refill);
2890
2891 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2892 {
2893         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2894                 return true;
2895
2896         sk_enter_memory_pressure(sk);
2897         sk_stream_moderate_sndbuf(sk);
2898         return false;
2899 }
2900 EXPORT_SYMBOL(sk_page_frag_refill);
2901
2902 void __lock_sock(struct sock *sk)
2903         __releases(&sk->sk_lock.slock)
2904         __acquires(&sk->sk_lock.slock)
2905 {
2906         DEFINE_WAIT(wait);
2907
2908         for (;;) {
2909                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2910                                         TASK_UNINTERRUPTIBLE);
2911                 spin_unlock_bh(&sk->sk_lock.slock);
2912                 schedule();
2913                 spin_lock_bh(&sk->sk_lock.slock);
2914                 if (!sock_owned_by_user(sk))
2915                         break;
2916         }
2917         finish_wait(&sk->sk_lock.wq, &wait);
2918 }
2919
2920 void __release_sock(struct sock *sk)
2921         __releases(&sk->sk_lock.slock)
2922         __acquires(&sk->sk_lock.slock)
2923 {
2924         struct sk_buff *skb, *next;
2925
2926         while ((skb = sk->sk_backlog.head) != NULL) {
2927                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2928
2929                 spin_unlock_bh(&sk->sk_lock.slock);
2930
2931                 do {
2932                         next = skb->next;
2933                         prefetch(next);
2934                         DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2935                         skb_mark_not_on_list(skb);
2936                         sk_backlog_rcv(sk, skb);
2937
2938                         cond_resched();
2939
2940                         skb = next;
2941                 } while (skb != NULL);
2942
2943                 spin_lock_bh(&sk->sk_lock.slock);
2944         }
2945
2946         /*
2947          * Doing the zeroing here guarantee we can not loop forever
2948          * while a wild producer attempts to flood us.
2949          */
2950         sk->sk_backlog.len = 0;
2951 }
2952
2953 void __sk_flush_backlog(struct sock *sk)
2954 {
2955         spin_lock_bh(&sk->sk_lock.slock);
2956         __release_sock(sk);
2957         spin_unlock_bh(&sk->sk_lock.slock);
2958 }
2959 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
2960
2961 /**
2962  * sk_wait_data - wait for data to arrive at sk_receive_queue
2963  * @sk:    sock to wait on
2964  * @timeo: for how long
2965  * @skb:   last skb seen on sk_receive_queue
2966  *
2967  * Now socket state including sk->sk_err is changed only under lock,
2968  * hence we may omit checks after joining wait queue.
2969  * We check receive queue before schedule() only as optimization;
2970  * it is very likely that release_sock() added new data.
2971  */
2972 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2973 {
2974         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2975         int rc;
2976
2977         add_wait_queue(sk_sleep(sk), &wait);
2978         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2979         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2980         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2981         remove_wait_queue(sk_sleep(sk), &wait);
2982         return rc;
2983 }
2984 EXPORT_SYMBOL(sk_wait_data);
2985
/**
 *      __sk_mem_raise_allocated - increase memory_allocated
 *      @sk: socket
 *      @size: memory size to allocate
 *      @amt: pages to allocate
 *      @kind: allocation type
 *
 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *      Return: 1 when the allocation is accepted, 0 when it is suppressed.
 *      On suppression the memory_allocated increase, and the memcg charge
 *      if one succeeded, are undone before returning.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
        bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
        struct proto *prot = sk->sk_prot;
        bool charged = true;
        long allocated;

        /* Account the pages first, then decide whether to keep them. */
        sk_memory_allocated_add(sk, amt);
        allocated = sk_memory_allocated(sk);
        /* A failed memcg charge goes straight to suppression with
         * charged == false.
         */
        if (memcg_charge &&
            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
                                                gfp_memcg_charge())))
                goto suppress_allocation;

        /* Under limit. */
        if (allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
                return 1;
        }

        /* Under pressure. */
        if (allocated > sk_prot_mem_limits(sk, 1))
                sk_enter_memory_pressure(sk);

        /* Over hard limit. */
        if (allocated > sk_prot_mem_limits(sk, 2))
                goto suppress_allocation;

        /* guarantee minimum buffer size under pressure */
        if (kind == SK_MEM_RECV) {
                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
                        return 1;

        } else { /* SK_MEM_SEND */
                int wmem0 = sk_get_wmem0(sk, prot);

                /* Stream sockets are measured by sk_wmem_queued, the
                 * others by sk_wmem_alloc.
                 */
                if (sk->sk_type == SOCK_STREAM) {
                        if (sk->sk_wmem_queued < wmem0)
                                return 1;
                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
                                return 1;
                }
        }

        if (sk_has_memory_pressure(sk)) {
                u64 alloc;

                if (!sk_under_memory_pressure(sk))
                        return 1;
                /* Accept if the hard limit exceeds this socket's page
                 * usage multiplied by the number of allocated sockets.
                 */
                alloc = sk_sockets_allocated_read_positive(sk);
                if (sk_prot_mem_limits(sk, 2) > alloc *
                    sk_mem_pages(sk->sk_wmem_queued +
                                 atomic_read(&sk->sk_rmem_alloc) +
                                 sk->sk_forward_alloc))
                        return 1;
        }

suppress_allocation:

        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
                sk_stream_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so that we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
                        /* Force charge with __GFP_NOFAIL */
                        if (memcg_charge && !charged) {
                                mem_cgroup_charge_skmem(sk->sk_memcg, amt,
                                        gfp_memcg_charge() | __GFP_NOFAIL);
                        }
                        return 1;
                }
        }

        /* Trace send-side failures always, receive-side ones only when
         * the memcg charge had succeeded.
         */
        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

        /* Roll back the accounting done at the top of the function. */
        sk_memory_allocated_sub(sk, amt);

        if (memcg_charge && charged)
                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

        return 0;
}
3080
3081 /**
3082  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3083  *      @sk: socket
3084  *      @size: memory size to allocate
3085  *      @kind: allocation type
3086  *
3087  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3088  *      rmem allocation. This function assumes that protocols which have
3089  *      memory_pressure use sk_wmem_queued as write buffer accounting.
3090  */
3091 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3092 {
3093         int ret, amt = sk_mem_pages(size);
3094
3095         sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3096         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3097         if (!ret)
3098                 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3099         return ret;
3100 }
3101 EXPORT_SYMBOL(__sk_mem_schedule);
3102
3103 /**
3104  *      __sk_mem_reduce_allocated - reclaim memory_allocated
3105  *      @sk: socket
3106  *      @amount: number of quanta
3107  *
3108  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3109  */
3110 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3111 {
3112         sk_memory_allocated_sub(sk, amount);
3113
3114         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3115                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3116
3117         if (sk_under_global_memory_pressure(sk) &&
3118             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3119                 sk_leave_memory_pressure(sk);
3120 }
3121
3122 /**
3123  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3124  *      @sk: socket
3125  *      @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3126  */
3127 void __sk_mem_reclaim(struct sock *sk, int amount)
3128 {
3129         amount >>= PAGE_SHIFT;
3130         sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3131         __sk_mem_reduce_allocated(sk, amount);
3132 }
3133 EXPORT_SYMBOL(__sk_mem_reclaim);
3134
3135 int sk_set_peek_off(struct sock *sk, int val)
3136 {
3137         WRITE_ONCE(sk->sk_peek_off, val);
3138         return 0;
3139 }
3140 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3141
3142 /*
3143  * Set of default routines for initialising struct proto_ops when
3144  * the protocol does not support a particular function. In certain
3145  * cases where it makes no sense for a protocol to have a "do nothing"
3146  * function, some default processing is provided.
3147  */
3148
3149 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3150 {
3151         return -EOPNOTSUPP;
3152 }
3153 EXPORT_SYMBOL(sock_no_bind);
3154
3155 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3156                     int len, int flags)
3157 {
3158         return -EOPNOTSUPP;
3159 }
3160 EXPORT_SYMBOL(sock_no_connect);
3161
3162 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3163 {
3164         return -EOPNOTSUPP;
3165 }
3166 EXPORT_SYMBOL(sock_no_socketpair);
3167
3168 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3169                    bool kern)
3170 {
3171         return -EOPNOTSUPP;
3172 }
3173 EXPORT_SYMBOL(sock_no_accept);
3174
3175 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3176                     int peer)
3177 {
3178         return -EOPNOTSUPP;
3179 }
3180 EXPORT_SYMBOL(sock_no_getname);
3181
3182 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3183 {
3184         return -EOPNOTSUPP;
3185 }
3186 EXPORT_SYMBOL(sock_no_ioctl);
3187
3188 int sock_no_listen(struct socket *sock, int backlog)
3189 {
3190         return -EOPNOTSUPP;
3191 }
3192 EXPORT_SYMBOL(sock_no_listen);
3193
3194 int sock_no_shutdown(struct socket *sock, int how)
3195 {
3196         return -EOPNOTSUPP;
3197 }
3198 EXPORT_SYMBOL(sock_no_shutdown);
3199
3200 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3201 {
3202         return -EOPNOTSUPP;
3203 }
3204 EXPORT_SYMBOL(sock_no_sendmsg);
3205
3206 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3207 {
3208         return -EOPNOTSUPP;
3209 }
3210 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3211
3212 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3213                     int flags)
3214 {
3215         return -EOPNOTSUPP;
3216 }
3217 EXPORT_SYMBOL(sock_no_recvmsg);
3218
3219 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3220 {
3221         /* Mirror missing mmap method error code */
3222         return -ENODEV;
3223 }
3224 EXPORT_SYMBOL(sock_no_mmap);
3225
3226 /*
3227  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3228  * various sock-based usage counts.
3229  */
3230 void __receive_sock(struct file *file)
3231 {
3232         struct socket *sock;
3233
3234         sock = sock_from_file(file);
3235         if (sock) {
3236                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3237                 sock_update_classid(&sock->sk->sk_cgrp_data);
3238         }
3239 }
3240
3241 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3242 {
3243         ssize_t res;
3244         struct msghdr msg = {.msg_flags = flags};
3245         struct kvec iov;
3246         char *kaddr = kmap(page);
3247         iov.iov_base = kaddr + offset;
3248         iov.iov_len = size;
3249         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3250         kunmap(page);
3251         return res;
3252 }
3253 EXPORT_SYMBOL(sock_no_sendpage);
3254
3255 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3256                                 int offset, size_t size, int flags)
3257 {
3258         ssize_t res;
3259         struct msghdr msg = {.msg_flags = flags};
3260         struct kvec iov;
3261         char *kaddr = kmap(page);
3262
3263         iov.iov_base = kaddr + offset;
3264         iov.iov_len = size;
3265         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3266         kunmap(page);
3267         return res;
3268 }
3269 EXPORT_SYMBOL(sock_no_sendpage_locked);
3270
3271 /*
3272  *      Default Socket Callbacks
3273  */
3274
3275 static void sock_def_wakeup(struct sock *sk)
3276 {
3277         struct socket_wq *wq;
3278
3279         rcu_read_lock();
3280         wq = rcu_dereference(sk->sk_wq);
3281         if (skwq_has_sleeper(wq))
3282                 wake_up_interruptible_all(&wq->wait);
3283         rcu_read_unlock();
3284 }
3285
3286 static void sock_def_error_report(struct sock *sk)
3287 {
3288         struct socket_wq *wq;
3289
3290         rcu_read_lock();
3291         wq = rcu_dereference(sk->sk_wq);
3292         if (skwq_has_sleeper(wq))
3293                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3294         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3295         rcu_read_unlock();
3296 }
3297
3298 void sock_def_readable(struct sock *sk)
3299 {
3300         struct socket_wq *wq;
3301
3302         rcu_read_lock();
3303         wq = rcu_dereference(sk->sk_wq);
3304         if (skwq_has_sleeper(wq))
3305                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3306                                                 EPOLLRDNORM | EPOLLRDBAND);
3307         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3308         rcu_read_unlock();
3309 }
3310
3311 static void sock_def_write_space(struct sock *sk)
3312 {
3313         struct socket_wq *wq;
3314
3315         rcu_read_lock();
3316
3317         /* Do not wake up a writer until he can make "significant"
3318          * progress.  --DaveM
3319          */
3320         if (sock_writeable(sk)) {
3321                 wq = rcu_dereference(sk->sk_wq);
3322                 if (skwq_has_sleeper(wq))
3323                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3324                                                 EPOLLWRNORM | EPOLLWRBAND);
3325
3326                 /* Should agree with poll, otherwise some programs break */
3327                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3328         }
3329
3330         rcu_read_unlock();
3331 }
3332
3333 /* An optimised version of sock_def_write_space(), should only be called
3334  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3335  * ->sk_wmem_alloc.
3336  */
3337 static void sock_def_write_space_wfree(struct sock *sk)
3338 {
3339         /* Do not wake up a writer until he can make "significant"
3340          * progress.  --DaveM
3341          */
3342         if (sock_writeable(sk)) {
3343                 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3344
3345                 /* rely on refcount_sub from sock_wfree() */
3346                 smp_mb__after_atomic();
3347                 if (wq && waitqueue_active(&wq->wait))
3348                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3349                                                 EPOLLWRNORM | EPOLLWRBAND);
3350
3351                 /* Should agree with poll, otherwise some programs break */
3352                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3353         }
3354 }
3355
/* Default sk_destruct callback, installed by sock_init_data_uid():
 * intentionally does nothing.
 */
static void sock_def_destruct(struct sock *sk)
{
}
3359
3360 void sk_send_sigurg(struct sock *sk)
3361 {
3362         if (sk->sk_socket && sk->sk_socket->file)
3363                 if (send_sigurg(&sk->sk_socket->file->f_owner))
3364                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3365 }
3366 EXPORT_SYMBOL(sk_send_sigurg);
3367
/* (Re)arm @timer to fire at @expires. When mod_timer() returns zero,
 * meaning per the timer API that the timer was not already pending, a
 * reference is taken on @sk with sock_hold().
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires)
{
        if (!mod_timer(timer, expires))
                sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
3375
/* Cancel @timer with del_timer(). When del_timer() returns non-zero,
 * meaning per the timer API that a pending timer was deactivated, one
 * reference on @sk is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
        if (del_timer(timer))
                __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
3382
/* Same as sk_stop_timer(), but cancels through del_timer_sync(). One
 * reference on @sk is dropped with __sock_put() when that call returns
 * non-zero.
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
        int was_pending = del_timer_sync(timer);

        if (was_pending)
                __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
3389
3390 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3391 {
3392         sk_init_common(sk);
3393         sk->sk_send_head        =       NULL;
3394
3395         timer_setup(&sk->sk_timer, NULL, 0);
3396
3397         sk->sk_allocation       =       GFP_KERNEL;
3398         sk->sk_rcvbuf           =       READ_ONCE(sysctl_rmem_default);
3399         sk->sk_sndbuf           =       READ_ONCE(sysctl_wmem_default);
3400         sk->sk_state            =       TCP_CLOSE;
3401         sk_set_socket(sk, sock);
3402
3403         sock_set_flag(sk, SOCK_ZAPPED);
3404
3405         if (sock) {
3406                 sk->sk_type     =       sock->type;
3407                 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3408                 sock->sk        =       sk;
3409         } else {
3410                 RCU_INIT_POINTER(sk->sk_wq, NULL);
3411         }
3412         sk->sk_uid      =       uid;
3413
3414         rwlock_init(&sk->sk_callback_lock);
3415         if (sk->sk_kern_sock)
3416                 lockdep_set_class_and_name(
3417                         &sk->sk_callback_lock,
3418                         af_kern_callback_keys + sk->sk_family,
3419                         af_family_kern_clock_key_strings[sk->sk_family]);
3420         else
3421                 lockdep_set_class_and_name(
3422                         &sk->sk_callback_lock,
3423                         af_callback_keys + sk->sk_family,
3424                         af_family_clock_key_strings[sk->sk_family]);
3425
3426         sk->sk_state_change     =       sock_def_wakeup;
3427         sk->sk_data_ready       =       sock_def_readable;
3428         sk->sk_write_space      =       sock_def_write_space;
3429         sk->sk_error_report     =       sock_def_error_report;
3430         sk->sk_destruct         =       sock_def_destruct;
3431
3432         sk->sk_frag.page        =       NULL;
3433         sk->sk_frag.offset      =       0;
3434         sk->sk_peek_off         =       -1;
3435
3436         sk->sk_peer_pid         =       NULL;
3437         sk->sk_peer_cred        =       NULL;
3438         spin_lock_init(&sk->sk_peer_lock);
3439
3440         sk->sk_write_pending    =       0;
3441         sk->sk_rcvlowat         =       1;
3442         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3443         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3444
3445         sk->sk_stamp = SK_DEFAULT_STAMP;
3446 #if BITS_PER_LONG==32
3447         seqlock_init(&sk->sk_stamp_seq);
3448 #endif
3449         atomic_set(&sk->sk_zckey, 0);
3450
3451 #ifdef CONFIG_NET_RX_BUSY_POLL
3452         sk->sk_napi_id          =       0;
3453         sk->sk_ll_usec          =       READ_ONCE(sysctl_net_busy_read);
3454 #endif
3455
3456         sk->sk_max_pacing_rate = ~0UL;
3457         sk->sk_pacing_rate = ~0UL;
3458         WRITE_ONCE(sk->sk_pacing_shift, 10);
3459         sk->sk_incoming_cpu = -1;
3460
3461         sk_rx_queue_clear(sk);
3462         /*
3463          * Before updating sk_refcnt, we must commit prior changes to memory
3464          * (Documentation/RCU/rculist_nulls.rst for details)
3465          */
3466         smp_wmb();
3467         refcount_set(&sk->sk_refcnt, 1);
3468         atomic_set(&sk->sk_drops, 0);
3469 }
3470 EXPORT_SYMBOL(sock_init_data_uid);
3471
3472 void sock_init_data(struct socket *sock, struct sock *sk)
3473 {
3474         kuid_t uid = sock ?
3475                 SOCK_INODE(sock)->i_uid :
3476                 make_kuid(sock_net(sk)->user_ns, 0);
3477
3478         sock_init_data_uid(sock, sk, uid);
3479 }
3480 EXPORT_SYMBOL(sock_init_data);
3481
/*
 * lock_sock_nested - take ownership of a socket's sk_lock
 * @sk:       socket to lock
 * @subclass: lockdep subclass handed to mutex_acquire()
 *
 * Must be called from a context that may sleep (see might_sleep()).
 * With sk_lock.slock held and bottom halves disabled, __lock_sock() is
 * invoked if the socket is already marked as owned; the socket is then
 * marked as owned before the spinlock is dropped again.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	/* The sk_lock has mutex_lock() semantics here. */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	/* presumably __lock_sock() waits for the current owner -- confirm */
	if (sock_owned_by_user_nocheck(sk))
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock_bh(&sk->sk_lock.slock);
}
3494 EXPORT_SYMBOL(lock_sock_nested);
3495
/*
 * release_sock - give up ownership of a socket's sk_lock
 * @sk: socket to release
 *
 * Everything below runs with sk_lock.slock held and bottom halves
 * disabled: __release_sock() is called when the backlog has a tail entry,
 * then the protocol's optional release_cb() hook, then ownership is
 * dropped and any waiter on sk_lock.wq is woken.
 */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	/* A non-NULL tail means something was queued on the backlog. */
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	/* Only pay for the wake-up when somebody is actually waiting. */
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
3513 EXPORT_SYMBOL(release_sock);
3514
/*
 * __lock_sock_fast - lock a socket, avoiding the ownership hand-over if idle
 * @sk: socket to lock
 *
 * Must be called from a context that may sleep (see might_sleep()).
 *
 * Return: false on the fast path, in which case sk_lock.slock is still held
 * and bottom halves are still disabled when the function returns; true on
 * the slow path, in which case the socket has been marked as owned and
 * sk_lock.slock has been released.
 */
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sock_owned_by_user_nocheck(sk)) {
		/*
		 * Fast path return with bottom halves disabled and
		 * sock::sk_lock.slock held.
		 *
		 * The 'mutex' is not contended and holding
		 * sock::sk_lock.slock prevents all other lockers to
		 * proceed so the corresponding unlock_sock_fast() can
		 * avoid the slow path of release_sock() completely and
		 * just release slock.
		 *
		 * From a semantical POV this is equivalent to 'acquiring'
		 * the 'mutex', hence the corresponding lockdep
		 * mutex_release() has to happen in the fast path of
		 * unlock_sock_fast().
		 */
		return false;
	}

	/* Slow path: the socket is owned by somebody else. */
	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	/*
	 * NOTE(review): looks like a static-checker annotation that keeps
	 * the __acquires() on this function balanced even though slock is
	 * dropped right below -- confirm.
	 */
	__acquire(&sk->sk_lock.slock);
	spin_unlock_bh(&sk->sk_lock.slock);
	return true;
}
3545 EXPORT_SYMBOL(__lock_sock_fast);
3546
3547 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3548                    bool timeval, bool time32)
3549 {
3550         struct sock *sk = sock->sk;
3551         struct timespec64 ts;
3552
3553         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3554         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3555         if (ts.tv_sec == -1)
3556                 return -ENOENT;
3557         if (ts.tv_sec == 0) {
3558                 ktime_t kt = ktime_get_real();
3559                 sock_write_timestamp(sk, kt);
3560                 ts = ktime_to_timespec64(kt);
3561         }
3562
3563         if (timeval)
3564                 ts.tv_nsec /= 1000;
3565
3566 #ifdef CONFIG_COMPAT_32BIT_TIME
3567         if (time32)
3568                 return put_old_timespec32(&ts, userstamp);
3569 #endif
3570 #ifdef CONFIG_SPARC64
3571         /* beware of padding in sparc64 timeval */
3572         if (timeval && !in_compat_syscall()) {
3573                 struct __kernel_old_timeval __user tv = {
3574                         .tv_sec = ts.tv_sec,
3575                         .tv_usec = ts.tv_nsec,
3576                 };
3577                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3578                         return -EFAULT;
3579                 return 0;
3580         }
3581 #endif
3582         return put_timespec64(&ts, userstamp);
3583 }
3584 EXPORT_SYMBOL(sock_gettstamp);
3585
3586 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3587 {
3588         if (!sock_flag(sk, flag)) {
3589                 unsigned long previous_flags = sk->sk_flags;
3590
3591                 sock_set_flag(sk, flag);
3592                 /*
3593                  * we just set one of the two flags which require net
3594                  * time stamping, but time stamping might have been on
3595                  * already because of the other one
3596                  */
3597                 if (sock_needs_netstamp(sk) &&
3598                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3599                         net_enable_timestamp();
3600         }
3601 }
3602
3603 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3604                        int level, int type)
3605 {
3606         struct sock_exterr_skb *serr;
3607         struct sk_buff *skb;
3608         int copied, err;
3609
3610         err = -EAGAIN;
3611         skb = sock_dequeue_err_skb(sk);
3612         if (skb == NULL)
3613                 goto out;
3614
3615         copied = skb->len;
3616         if (copied > len) {
3617                 msg->msg_flags |= MSG_TRUNC;
3618                 copied = len;
3619         }
3620         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3621         if (err)
3622                 goto out_free_skb;
3623
3624         sock_recv_timestamp(msg, sk, skb);
3625
3626         serr = SKB_EXT_ERR(skb);
3627         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3628
3629         msg->msg_flags |= MSG_ERRQUEUE;
3630         err = copied;
3631
3632 out_free_skb:
3633         kfree_skb(skb);
3634 out:
3635         return err;
3636 }
3637 EXPORT_SYMBOL(sock_recv_errqueue);
3638
3639 /*
3640  *      Get a socket option on an socket.
3641  *
3642  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3643  *      asynchronous errors should be reported by getsockopt. We assume
3644  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3645  */
3646 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3647                            char __user *optval, int __user *optlen)
3648 {
3649         struct sock *sk = sock->sk;
3650
3651         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3652         return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3653 }
3654 EXPORT_SYMBOL(sock_common_getsockopt);
3655
3656 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3657                         int flags)
3658 {
3659         struct sock *sk = sock->sk;
3660         int addr_len = 0;
3661         int err;
3662
3663         err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3664         if (err >= 0)
3665                 msg->msg_namelen = addr_len;
3666         return err;
3667 }
3668 EXPORT_SYMBOL(sock_common_recvmsg);
3669
3670 /*
3671  *      Set socket options on an inet socket.
3672  */
3673 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3674                            sockptr_t optval, unsigned int optlen)
3675 {
3676         struct sock *sk = sock->sk;
3677
3678         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3679         return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3680 }
3681 EXPORT_SYMBOL(sock_common_setsockopt);
3682
/*
 * sk_common_release - common teardown path for a socket being released
 * @sk: socket to release
 *
 * Calls the protocol's optional destroy() hook, unhashes the socket,
 * orphans it, frees its xfrm policy and finally drops a reference with
 * sock_put(). The unhash() hook is called unconditionally, so the
 * protocol has to provide one.
 */
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes no
	 * longer have access to the socket, but the network stack still
	 * does.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but some
	 * packets may still be in flight because another CPU runs the
	 * receiver and did its hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by
	 * the socket destructor.
	 *
	 * We also still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
3718 EXPORT_SYMBOL(sk_common_release);
3719
3720 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3721 {
3722         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3723
3724         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3725         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3726         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3727         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3728         mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3729         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3730         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3731         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3732         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3733 }
3734
3735 #ifdef CONFIG_PROC_FS
/* Bitmap of the inuse_idx slots handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3737
3738 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3739 {
3740         int cpu, idx = prot->inuse_idx;
3741         int res = 0;
3742
3743         for_each_possible_cpu(cpu)
3744                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3745
3746         return res >= 0 ? res : 0;
3747 }
3748 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3749
3750 int sock_inuse_get(struct net *net)
3751 {
3752         int cpu, res = 0;
3753
3754         for_each_possible_cpu(cpu)
3755                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3756
3757         return res;
3758 }
3759
3760 EXPORT_SYMBOL_GPL(sock_inuse_get);
3761
3762 static int __net_init sock_inuse_init_net(struct net *net)
3763 {
3764         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3765         if (net->core.prot_inuse == NULL)
3766                 return -ENOMEM;
3767         return 0;
3768 }
3769
/* Free the per-CPU prot_inuse counters of a network namespace. */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
}
3774
/* Per-network-namespace setup and teardown of the prot_inuse counters. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
3779
3780 static __init int net_inuse_init(void)
3781 {
3782         if (register_pernet_subsys(&net_inuse_ops))
3783                 panic("Cannot initialize net inuse counters");
3784
3785         return 0;
3786 }
3787
3788 core_initcall(net_inuse_init);
3789
3790 static int assign_proto_idx(struct proto *prot)
3791 {
3792         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3793
3794         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3795                 pr_err("PROTO_INUSE_NR exhausted\n");
3796                 return -ENOSPC;
3797         }
3798
3799         set_bit(prot->inuse_idx, proto_inuse_idx);
3800         return 0;
3801 }
3802
3803 static void release_proto_idx(struct proto *prot)
3804 {
3805         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3806                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3807 }
3808 #else
/* Stub for builds without CONFIG_PROC_FS: no slot is assigned, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

/* Stub for builds without CONFIG_PROC_FS: there is nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3817
3818 #endif
3819
3820 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3821 {
3822         if (!twsk_prot)
3823                 return;
3824         kfree(twsk_prot->twsk_slab_name);
3825         twsk_prot->twsk_slab_name = NULL;
3826         kmem_cache_destroy(twsk_prot->twsk_slab);
3827         twsk_prot->twsk_slab = NULL;
3828 }
3829
3830 static int tw_prot_init(const struct proto *prot)
3831 {
3832         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3833
3834         if (!twsk_prot)
3835                 return 0;
3836
3837         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3838                                               prot->name);
3839         if (!twsk_prot->twsk_slab_name)
3840                 return -ENOMEM;
3841
3842         twsk_prot->twsk_slab =
3843                 kmem_cache_create(twsk_prot->twsk_slab_name,
3844                                   twsk_prot->twsk_obj_size, 0,
3845                                   SLAB_ACCOUNT | prot->slab_flags,
3846                                   NULL);
3847         if (!twsk_prot->twsk_slab) {
3848                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3849                         prot->name);
3850                 return -ENOMEM;
3851         }
3852
3853         return 0;
3854 }
3855
3856 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3857 {
3858         if (!rsk_prot)
3859                 return;
3860         kfree(rsk_prot->slab_name);
3861         rsk_prot->slab_name = NULL;
3862         kmem_cache_destroy(rsk_prot->slab);
3863         rsk_prot->slab = NULL;
3864 }
3865
3866 static int req_prot_init(const struct proto *prot)
3867 {
3868         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3869
3870         if (!rsk_prot)
3871                 return 0;
3872
3873         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3874                                         prot->name);
3875         if (!rsk_prot->slab_name)
3876                 return -ENOMEM;
3877
3878         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3879                                            rsk_prot->obj_size, 0,
3880                                            SLAB_ACCOUNT | prot->slab_flags,
3881                                            NULL);
3882
3883         if (!rsk_prot->slab) {
3884                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3885                         prot->name);
3886                 return -ENOMEM;
3887         }
3888         return 0;
3889 }
3890
/*
 * proto_register - register a protocol with the socket layer
 * @prot:       protocol to register
 * @alloc_slab: non-zero to create the sock, request sock and timewait
 *              sock slab caches for the protocol
 *
 * Return: 0 on success; -EINVAL if memory_allocated is set without
 * sysctl_mem or per_cpu_fw_alloc; -ENOBUFS if any of the slab caches
 * cannot be set up; otherwise the error from assign_proto_idx().
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	/* Memory accounting needs both the sysctl limits and the per-CPU cache. */
	if (prot->memory_allocated && !prot->sysctl_mem) {
		pr_err("%s: missing sysctl_mem\n", prot->name);
		return -EINVAL;
	}
	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
		return -EINVAL;
	}
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		/*
		 * The helpers' own error codes are not propagated: ret is
		 * still -ENOBUFS on these two paths.
		 */
		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	/* The index bitmap and proto_list are both changed under the mutex. */
	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

	/* Unwind in reverse order; only undo what alloc_slab created. */
out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
3947 EXPORT_SYMBOL(proto_register);
3948
/*
 * proto_unregister - undo proto_register()
 * @prot: protocol to unregister
 *
 * Releases the protocol's inuse index and removes it from proto_list
 * under proto_list_mutex, then destroys the sock slab cache and the
 * request sock and timewait sock caches.
 */
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	/* prot->slab is passed as is, even if no cache was ever created. */
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	/* Both helpers accept a NULL ops pointer. */
	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
3962 EXPORT_SYMBOL(proto_unregister);
3963
3964 int sock_load_diag_module(int family, int protocol)
3965 {
3966         if (!protocol) {
3967                 if (!sock_is_registered(family))
3968                         return -ENOENT;
3969
3970                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3971                                       NETLINK_SOCK_DIAG, family);
3972         }
3973
3974 #ifdef CONFIG_INET
3975         if (family == AF_INET &&
3976             protocol != IPPROTO_RAW &&
3977             protocol < MAX_INET_PROTOS &&
3978             !rcu_access_pointer(inet_protos[protocol]))
3979                 return -ENOENT;
3980 #endif
3981
3982         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3983                               NETLINK_SOCK_DIAG, family, protocol);
3984 }
3985 EXPORT_SYMBOL(sock_load_diag_module);
3986
3987 #ifdef CONFIG_PROC_FS
3988 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3989         __acquires(proto_list_mutex)
3990 {
3991         mutex_lock(&proto_list_mutex);
3992         return seq_list_start_head(&proto_list, *pos);
3993 }
3994
/* seq_file next hook: step from @v to the following entry of proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}
3999
/* seq_file stop hook: drop the mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}
4005
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the proc listing. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
4010 static long sock_prot_memory_allocated(struct proto *proto)
4011 {
4012         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4013 }
4014
4015 static const char *sock_prot_memory_pressure(struct proto *proto)
4016 {
4017         return proto->memory_pressure != NULL ?
4018         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4019 }
4020
4021 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4022 {
4023
4024         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4025                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4026                    proto->name,
4027                    proto->obj_size,
4028                    sock_prot_inuse_get(seq_file_net(seq), proto),
4029                    sock_prot_memory_allocated(proto),
4030                    sock_prot_memory_pressure(proto),
4031                    proto->max_header,
4032                    proto->slab == NULL ? "no" : "yes",
4033                    module_name(proto->owner),
4034                    proto_method_implemented(proto->close),
4035                    proto_method_implemented(proto->connect),
4036                    proto_method_implemented(proto->disconnect),
4037                    proto_method_implemented(proto->accept),
4038                    proto_method_implemented(proto->ioctl),
4039                    proto_method_implemented(proto->init),
4040                    proto_method_implemented(proto->destroy),
4041                    proto_method_implemented(proto->shutdown),
4042                    proto_method_implemented(proto->setsockopt),
4043                    proto_method_implemented(proto->getsockopt),
4044                    proto_method_implemented(proto->sendmsg),
4045                    proto_method_implemented(proto->recvmsg),
4046                    proto_method_implemented(proto->sendpage),
4047                    proto_method_implemented(proto->bind),
4048                    proto_method_implemented(proto->backlog_rcv),
4049                    proto_method_implemented(proto->hash),
4050                    proto_method_implemented(proto->unhash),
4051                    proto_method_implemented(proto->get_port),
4052                    proto_method_implemented(proto->enter_memory_pressure));
4053 }
4054
4055 static int proto_seq_show(struct seq_file *seq, void *v)
4056 {
4057         if (v == &proto_list)
4058                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4059                            "protocol",
4060                            "size",
4061                            "sockets",
4062                            "memory",
4063                            "press",
4064                            "maxhdr",
4065                            "slab",
4066                            "module",
4067                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
4068         else
4069                 proto_seq_printf(seq, list_entry(v, struct proto, node));
4070         return 0;
4071 }
4072
/* Iterator callbacks behind the per-namespace "protocols" proc file. */
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
4079
4080 static __net_init int proto_init_net(struct net *net)
4081 {
4082         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4083                         sizeof(struct seq_net_private)))
4084                 return -ENOMEM;
4085
4086         return 0;
4087 }
4088
/* Remove the "protocols" proc entry of a network namespace. */
static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}
4093
4094
/* Per-network-namespace creation and removal of the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};
4099
/*
 * Register the per-namespace hooks for the "protocols" proc entry.
 * Returns the result of register_pernet_subsys().
 */
static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}
4104
4105 subsys_initcall(proto_init);
4106
4107 #endif /* PROC_FS */
4108
4109 #ifdef CONFIG_NET_RX_BUSY_POLL
4110 bool sk_busy_loop_end(void *p, unsigned long start_time)
4111 {
4112         struct sock *sk = p;
4113
4114         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4115                 return true;
4116
4117         if (sk_is_udp(sk) &&
4118             !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4119                 return true;
4120
4121         return sk_busy_loop_timeout(sk, start_time);
4122 }
4123 EXPORT_SYMBOL(sk_busy_loop_end);
4124 #endif /* CONFIG_NET_RX_BUSY_POLL */
4125
4126 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4127 {
4128         if (!sk->sk_prot->bind_add)
4129                 return -EOPNOTSUPP;
4130         return sk->sk_prot->bind_add(sk, addr, addr_len);
4131 }
4132 EXPORT_SYMBOL(sock_bind_add);