af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
  28  *           Alexey Kuznetosv   :       Repaired (I hope) bugs introduces
  29  *                                      by above two patches.
  30  *           Andrea Arcangeli   :       If possible we block in connect(2)
  31  *                                      if the max backlog of the listen socket
  32  *                                      is been reached. This won't break
  33  *                                      old apps and it will avoid huge amount
  34  *                                      of socks hashed (this for unix_gc()
  35  *                                      performances reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/filter.h>
  93 #include <linux/termios.h>
  94 #include <linux/sockios.h>
  95 #include <linux/net.h>
  96 #include <linux/in.h>
  97 #include <linux/fs.h>
  98 #include <linux/slab.h>
  99 #include <linux/uaccess.h>
 100 #include <linux/skbuff.h>
 101 #include <linux/netdevice.h>
 102 #include <net/net_namespace.h>
 103 #include <net/sock.h>
 104 #include <net/tcp_states.h>
 105 #include <net/af_unix.h>
 106 #include <linux/proc_fs.h>
 107 #include <linux/seq_file.h>
 108 #include <net/scm.h>
 109 #include <linux/init.h>
 110 #include <linux/poll.h>
 111 #include <linux/rtnetlink.h>
 112 #include <linux/mount.h>
 113 #include <net/checksum.h>
 114 #include <linux/security.h>
 115 #include <linux/splice.h>
 116 #include <linux/freezer.h>
 117 #include <linux/file.h>
 118 #include <linux/btf_ids.h>
 119 #include <linux/bpf-cgroup.h>
 120
 121 #include "scm.h"
 122
 123 static atomic_long_t unix_nr_socks;
 124 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
 125 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
 126
 127 /* SMP locking strategy:
 128  *    hash table is protected with spinlock.
 129  *    each socket state is protected by separate spinlock.
 130  */
 131
 132 static unsigned int unix_unbound_hash(struct sock *sk)
 133 {
 134         unsigned long hash = (unsigned long)sk;
 135
 136         hash ^= hash >> 16;
 137         hash ^= hash >> 8;
 138         hash ^= sk->sk_type;
 139
 140         return hash & UNIX_HASH_MOD;
 141 }
 142
 143 static unsigned int unix_bsd_hash(struct inode *i)
 144 {
 145         return i->i_ino & UNIX_HASH_MOD;
 146 }
 147
 148 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 149                                        int addr_len, int type)
 150 {
 151         __wsum csum = csum_partial(sunaddr, addr_len, 0);
 152         unsigned int hash;
 153
 154         hash = (__force unsigned int)csum_fold(csum);
 155         hash ^= hash >> 8;
 156         hash ^= type;
 157
 158         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
 159 }
 160
 161 static void unix_table_double_lock(struct net *net,
 162                                    unsigned int hash1, unsigned int hash2)
 163 {
 164         if (hash1 == hash2) {
 165                 spin_lock(&net->unx.table.locks[hash1]);
 166                 return;
 167         }
 168
 169         if (hash1 > hash2)
 170                 swap(hash1, hash2);
 171
 172         spin_lock(&net->unx.table.locks[hash1]);
 173         spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
 174 }
 175
 176 static void unix_table_double_unlock(struct net *net,
 177                                      unsigned int hash1, unsigned int hash2)
 178 {
 179         if (hash1 == hash2) {
 180                 spin_unlock(&net->unx.table.locks[hash1]);
 181                 return;
 182         }
 183
 184         spin_unlock(&net->unx.table.locks[hash1]);
 185         spin_unlock(&net->unx.table.locks[hash2]);
 186 }
 187
 188 #ifdef CONFIG_SECURITY_NETWORK
 189 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 190 {
 191         UNIXCB(skb).secid = scm->secid;
 192 }
 193
 194 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 195 {
 196         scm->secid = UNIXCB(skb).secid;
 197 }
 198
 199 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 200 {
 201         return (scm->secid == UNIXCB(skb).secid);
 202 }
 203 #else
 204 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 205 { }
 206
 207 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 208 { }
 209
 210 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 211 {
 212         return true;
 213 }
 214 #endif /* CONFIG_SECURITY_NETWORK */
 215
 216 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 217 {
 218         return unix_peer(osk) == sk;
 219 }
 220
 221 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 222 {
 223         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 224 }
 225
 226 static inline int unix_recvq_full(const struct sock *sk)
 227 {
 228         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 229 }
 230
 231 static inline int unix_recvq_full_lockless(const struct sock *sk)
 232 {
 233         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 234                 READ_ONCE(sk->sk_max_ack_backlog);
 235 }
 236
 237 struct sock *unix_peer_get(struct sock *s)
 238 {
 239         struct sock *peer;
 240
 241         unix_state_lock(s);
 242         peer = unix_peer(s);
 243         if (peer)
 244                 sock_hold(peer);
 245         unix_state_unlock(s);
 246         return peer;
 247 }
 248 EXPORT_SYMBOL_GPL(unix_peer_get);
 249
 250 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
 251                                              int addr_len)
 252 {
 253         struct unix_address *addr;
 254
 255         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
 256         if (!addr)
 257                 return NULL;
 258
 259         refcount_set(&addr->refcnt, 1);
 260         addr->len = addr_len;
 261         memcpy(addr->name, sunaddr, addr_len);
 262
 263         return addr;
 264 }
 265
 266 static inline void unix_release_addr(struct unix_address *addr)
 267 {
 268         if (refcount_dec_and_test(&addr->refcnt))
 269                 kfree(addr);
 270 }
 271
 272 /*
 273  *      Check unix socket name:
 274  *              - should be not zero length.
 275  *              - if started by not zero, should be NULL terminated (FS object)
 276  *              - if started by zero, it is abstract name.
 277  */
 278
 279 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
 280 {
 281         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
 282             addr_len > sizeof(*sunaddr))
 283                 return -EINVAL;
 284
 285         if (sunaddr->sun_family != AF_UNIX)
 286                 return -EINVAL;
 287
 288         return 0;
 289 }
 290
 291 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
 292 {
 293         struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
 294         short offset = offsetof(struct sockaddr_storage, __data);
 295
 296         BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
 297
 298         /* This may look like an off by one error but it is a bit more
 299          * subtle.  108 is the longest valid AF_UNIX path for a binding.
 300          * sun_path[108] doesn't as such exist.  However in kernel space
 301          * we are guaranteed that it is a valid memory location in our
 302          * kernel address buffer because syscall functions always pass
 303          * a pointer of struct sockaddr_storage which has a bigger buffer
 304          * than 108.  Also, we must terminate sun_path for strlen() in
 305          * getname_kernel().
 306          */
 307         addr->__data[addr_len - offset] = 0;
 308
 309         /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
 310          * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
 311          * know the actual buffer.
 312          */
 313         return strlen(addr->__data) + offset + 1;
 314 }
 315
 316 static void __unix_remove_socket(struct sock *sk)
 317 {
 318         sk_del_node_init(sk);
 319 }
 320
 321 static void __unix_insert_socket(struct net *net, struct sock *sk)
 322 {
 323         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 324         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
 325 }
 326
 327 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
 328                                  struct unix_address *addr, unsigned int hash)
 329 {
 330         __unix_remove_socket(sk);
 331         smp_store_release(&unix_sk(sk)->addr, addr);
 332
 333         sk->sk_hash = hash;
 334         __unix_insert_socket(net, sk);
 335 }
 336
 337 static void unix_remove_socket(struct net *net, struct sock *sk)
 338 {
 339         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 340         __unix_remove_socket(sk);
 341         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 342 }
 343
 344 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
 345 {
 346         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 347         __unix_insert_socket(net, sk);
 348         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 349 }
 350
 351 static void unix_insert_bsd_socket(struct sock *sk)
 352 {
 353         spin_lock(&bsd_socket_locks[sk->sk_hash]);
 354         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
 355         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 356 }
 357
 358 static void unix_remove_bsd_socket(struct sock *sk)
 359 {
 360         if (!hlist_unhashed(&sk->sk_bind_node)) {
 361                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
 362                 __sk_del_bind_node(sk);
 363                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 364
 365                 sk_node_init(&sk->sk_bind_node);
 366         }
 367 }
 368
 369 static struct sock *__unix_find_socket_byname(struct net *net,
 370                                               struct sockaddr_un *sunname,
 371                                               int len, unsigned int hash)
 372 {
 373         struct sock *s;
 374
 375         sk_for_each(s, &net->unx.table.buckets[hash]) {
 376                 struct unix_sock *u = unix_sk(s);
 377
 378                 if (u->addr->len == len &&
 379                     !memcmp(u->addr->name, sunname, len))
 380                         return s;
 381         }
 382         return NULL;
 383 }
 384
 385 static inline struct sock *unix_find_socket_byname(struct net *net,
 386                                                    struct sockaddr_un *sunname,
 387                                                    int len, unsigned int hash)
 388 {
 389         struct sock *s;
 390
 391         spin_lock(&net->unx.table.locks[hash]);
 392         s = __unix_find_socket_byname(net, sunname, len, hash);
 393         if (s)
 394                 sock_hold(s);
 395         spin_unlock(&net->unx.table.locks[hash]);
 396         return s;
 397 }
 398
 399 static struct sock *unix_find_socket_byinode(struct inode *i)
 400 {
 401         unsigned int hash = unix_bsd_hash(i);
 402         struct sock *s;
 403
 404         spin_lock(&bsd_socket_locks[hash]);
 405         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
 406                 struct dentry *dentry = unix_sk(s)->path.dentry;
 407
 408                 if (dentry && d_backing_inode(dentry) == i) {
 409                         sock_hold(s);
 410                         spin_unlock(&bsd_socket_locks[hash]);
 411                         return s;
 412                 }
 413         }
 414         spin_unlock(&bsd_socket_locks[hash]);
 415         return NULL;
 416 }
 417
 418 /* Support code for asymmetrically connected dgram sockets
 419  *
 420  * If a datagram socket is connected to a socket not itself connected
 421  * to the first socket (eg, /dev/log), clients may only enqueue more
 422  * messages if the present receive queue of the server socket is not
 423  * "too large". This means there's a second writeability condition
 424  * poll and sendmsg need to test. The dgram recv code will do a wake
 425  * up on the peer_wait wait queue of a socket upon reception of a
 426  * datagram which needs to be propagated to sleeping would-be writers
 427  * since these might not have sent anything so far. This can't be
 428  * accomplished via poll_wait because the lifetime of the server
 429  * socket might be less than that of its clients if these break their
 430  * association with it or if the server socket is closed while clients
 431  * are still connected to it and there's no way to inform "a polling
 432  * implementation" that it should let go of a certain wait queue
 433  *
 434  * In order to propagate a wake up, a wait_queue_entry_t of the client
 435  * socket is enqueued on the peer_wait queue of the server socket
 436  * whose wake function does a wake_up on the ordinary client socket
 437  * wait queue. This connection is established whenever a write (or
 438  * poll for write) hit the flow control condition and broken when the
 439  * association to the server socket is dissolved or after a wake up
 440  * was relayed.
 441  */
 442
 443 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 444                                       void *key)
 445 {
 446         struct unix_sock *u;
 447         wait_queue_head_t *u_sleep;
 448
 449         u = container_of(q, struct unix_sock, peer_wake);
 450
 451         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 452                             q);
 453         u->peer_wake.private = NULL;
 454
 455         /* relaying can only happen while the wq still exists */
 456         u_sleep = sk_sleep(&u->sk);
 457         if (u_sleep)
 458                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 459
 460         return 0;
 461 }
 462
 463 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 464 {
 465         struct unix_sock *u, *u_other;
 466         int rc;
 467
 468         u = unix_sk(sk);
 469         u_other = unix_sk(other);
 470         rc = 0;
 471         spin_lock(&u_other->peer_wait.lock);
 472
 473         if (!u->peer_wake.private) {
 474                 u->peer_wake.private = other;
 475                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 476
 477                 rc = 1;
 478         }
 479
 480         spin_unlock(&u_other->peer_wait.lock);
 481         return rc;
 482 }
 483
 484 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 485                                             struct sock *other)
 486 {
 487         struct unix_sock *u, *u_other;
 488
 489         u = unix_sk(sk);
 490         u_other = unix_sk(other);
 491         spin_lock(&u_other->peer_wait.lock);
 492
 493         if (u->peer_wake.private == other) {
 494                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 495                 u->peer_wake.private = NULL;
 496         }
 497
 498         spin_unlock(&u_other->peer_wait.lock);
 499 }
 500
 501 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 502                                                    struct sock *other)
 503 {
 504         unix_dgram_peer_wake_disconnect(sk, other);
 505         wake_up_interruptible_poll(sk_sleep(sk),
 506                                    EPOLLOUT |
 507                                    EPOLLWRNORM |
 508                                    EPOLLWRBAND);
 509 }
 510
 511 /* preconditions:
 512  *      - unix_peer(sk) == other
 513  *      - association is stable
 514  */
 515 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 516 {
 517         int connected;
 518
 519         connected = unix_dgram_peer_wake_connect(sk, other);
 520
 521         /* If other is SOCK_DEAD, we want to make sure we signal
 522          * POLLOUT, such that a subsequent write() can get a
 523          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 524          * to other and its full, we will hang waiting for POLLOUT.
 525          */
 526         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
 527                 return 1;
 528
 529         if (connected)
 530                 unix_dgram_peer_wake_disconnect(sk, other);
 531
 532         return 0;
 533 }
 534
 535 static int unix_writable(const struct sock *sk)
 536 {
 537         return sk->sk_state != TCP_LISTEN &&
 538                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 539 }
 540
 541 static void unix_write_space(struct sock *sk)
 542 {
 543         struct socket_wq *wq;
 544
 545         rcu_read_lock();
 546         if (unix_writable(sk)) {
 547                 wq = rcu_dereference(sk->sk_wq);
 548                 if (skwq_has_sleeper(wq))
 549                         wake_up_interruptible_sync_poll(&wq->wait,
 550                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 551                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 552         }
 553         rcu_read_unlock();
 554 }
 555
 556 /* When dgram socket disconnects (or changes its peer), we clear its receive
 557  * queue of packets arrived from previous peer. First, it allows to do
 558  * flow control based only on wmem_alloc; second, sk connected to peer
 559  * may receive messages only from that peer. */
 560 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 561 {
 562         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 563                 skb_queue_purge(&sk->sk_receive_queue);
 564                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 565
 566                 /* If one link of bidirectional dgram pipe is disconnected,
 567                  * we signal error. Messages are lost. Do not make this,
 568                  * when peer was not connected to us.
 569                  */
 570                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 571                         WRITE_ONCE(other->sk_err, ECONNRESET);
 572                         sk_error_report(other);
 573                 }
 574         }
 575         other->sk_state = TCP_CLOSE;
 576 }
 577
 578 static void unix_sock_destructor(struct sock *sk)
 579 {
 580         struct unix_sock *u = unix_sk(sk);
 581
 582         skb_queue_purge(&sk->sk_receive_queue);
 583
 584         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
 585         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 586         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
 587         if (!sock_flag(sk, SOCK_DEAD)) {
 588                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 589                 return;
 590         }
 591
 592         if (u->addr)
 593                 unix_release_addr(u->addr);
 594
 595         atomic_long_dec(&unix_nr_socks);
 596         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 597 #ifdef UNIX_REFCNT_DEBUG
 598         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 599                 atomic_long_read(&unix_nr_socks));
 600 #endif
 601 }
 602
 603 static void unix_release_sock(struct sock *sk, int embrion)
 604 {
 605         struct unix_sock *u = unix_sk(sk);
 606         struct sock *skpair;
 607         struct sk_buff *skb;
 608         struct path path;
 609         int state;
 610
 611         unix_remove_socket(sock_net(sk), sk);
 612         unix_remove_bsd_socket(sk);
 613
 614         /* Clear state */
 615         unix_state_lock(sk);
 616         sock_orphan(sk);
 617         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
 618         path         = u->path;
 619         u->path.dentry = NULL;
 620         u->path.mnt = NULL;
 621         state = sk->sk_state;
 622         sk->sk_state = TCP_CLOSE;
 623
 624         skpair = unix_peer(sk);
 625         unix_peer(sk) = NULL;
 626
 627         unix_state_unlock(sk);
 628
 629 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 630         if (u->oob_skb) {
 631                 kfree_skb(u->oob_skb);
 632                 u->oob_skb = NULL;
 633         }
 634 #endif
 635
 636         wake_up_interruptible_all(&u->peer_wait);
 637
 638         if (skpair != NULL) {
 639                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 640                         unix_state_lock(skpair);
 641                         /* No more writes */
 642                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
 643                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 644                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
 645                         unix_state_unlock(skpair);
 646                         skpair->sk_state_change(skpair);
 647                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 648                 }
 649
 650                 unix_dgram_peer_wake_disconnect(sk, skpair);
 651                 sock_put(skpair); /* It may now die */
 652         }
 653
 654         /* Try to flush out this socket. Throw out buffers at least */
 655
 656         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 657                 if (state == TCP_LISTEN)
 658                         unix_release_sock(skb->sk, 1);
 659                 /* passed fds are erased in the kfree_skb hook        */
 660                 UNIXCB(skb).consumed = skb->len;
 661                 kfree_skb(skb);
 662         }
 663
 664         if (path.dentry)
 665                 path_put(&path);
 666
 667         sock_put(sk);
 668
 669         /* ---- Socket is dead now and most probably destroyed ---- */
 670
 671         /*
 672          * Fixme: BSD difference: In BSD all sockets connected to us get
 673          *        ECONNRESET and we die on the spot. In Linux we behave
 674          *        like files and pipes do and wait for the last
 675          *        dereference.
 676          *
 677          * Can't we simply set sock->err?
 678          *
 679          *        What the above comment does talk about? --ANK(980817)
 680          */
 681
 682         if (READ_ONCE(unix_tot_inflight))
 683                 unix_gc();              /* Garbage collect fds */
 684 }
 685
 686 static void init_peercred(struct sock *sk)
 687 {
 688         const struct cred *old_cred;
 689         struct pid *old_pid;
 690
 691         spin_lock(&sk->sk_peer_lock);
 692         old_pid = sk->sk_peer_pid;
 693         old_cred = sk->sk_peer_cred;
 694         sk->sk_peer_pid  = get_pid(task_tgid(current));
 695         sk->sk_peer_cred = get_current_cred();
 696         spin_unlock(&sk->sk_peer_lock);
 697
 698         put_pid(old_pid);
 699         put_cred(old_cred);
 700 }
 701
 702 static void copy_peercred(struct sock *sk, struct sock *peersk)
 703 {
 704         const struct cred *old_cred;
 705         struct pid *old_pid;
 706
 707         if (sk < peersk) {
 708                 spin_lock(&sk->sk_peer_lock);
 709                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 710         } else {
 711                 spin_lock(&peersk->sk_peer_lock);
 712                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 713         }
 714         old_pid = sk->sk_peer_pid;
 715         old_cred = sk->sk_peer_cred;
 716         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 717         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 718
 719         spin_unlock(&sk->sk_peer_lock);
 720         spin_unlock(&peersk->sk_peer_lock);
 721
 722         put_pid(old_pid);
 723         put_cred(old_cred);
 724 }
 725
 726 static int unix_listen(struct socket *sock, int backlog)
 727 {
 728         int err;
 729         struct sock *sk = sock->sk;
 730         struct unix_sock *u = unix_sk(sk);
 731
 732         err = -EOPNOTSUPP;
 733         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 734                 goto out;       /* Only stream/seqpacket sockets accept */
 735         err = -EINVAL;
 736         if (!u->addr)
 737                 goto out;       /* No listens on an unbound socket */
 738         unix_state_lock(sk);
 739         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 740                 goto out_unlock;
 741         if (backlog > sk->sk_max_ack_backlog)
 742                 wake_up_interruptible_all(&u->peer_wait);
 743         sk->sk_max_ack_backlog  = backlog;
 744         sk->sk_state            = TCP_LISTEN;
 745         /* set credentials so connect can copy them */
 746         init_peercred(sk);
 747         err = 0;
 748
 749 out_unlock:
 750         unix_state_unlock(sk);
 751 out:
 752         return err;
 753 }
 754
 755 static int unix_release(struct socket *);
 756 static int unix_bind(struct socket *, struct sockaddr *, int);
 757 static int unix_stream_connect(struct socket *, struct sockaddr *,
 758                                int addr_len, int flags);
 759 static int unix_socketpair(struct socket *, struct socket *);
 760 static int unix_accept(struct socket *, struct socket *, int, bool);
 761 static int unix_getname(struct socket *, struct sockaddr *, int);
 762 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
 763 static __poll_t unix_dgram_poll(struct file *, struct socket *,
 764                                     poll_table *);
 765 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 766 #ifdef CONFIG_COMPAT
 767 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 768 #endif
 769 static int unix_shutdown(struct socket *, int);
 770 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 771 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 772 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 773                                        struct pipe_inode_info *, size_t size,
 774                                        unsigned int flags);
 775 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 776 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 777 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 778 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 779 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 780                               int, int);
 781 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 782 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 783                                   int);
 784
 785 #ifdef CONFIG_PROC_FS
 786 static int unix_count_nr_fds(struct sock *sk)
 787 {
 788         struct sk_buff *skb;
 789         struct unix_sock *u;
 790         int nr_fds = 0;
 791
 792         spin_lock(&sk->sk_receive_queue.lock);
 793         skb = skb_peek(&sk->sk_receive_queue);
 794         while (skb) {
 795                 u = unix_sk(skb->sk);
 796                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
 797                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
 798         }
 799         spin_unlock(&sk->sk_receive_queue.lock);
 800
 801         return nr_fds;
 802 }
 803
 804 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 805 {
 806         struct sock *sk = sock->sk;
 807         unsigned char s_state;
 808         struct unix_sock *u;
 809         int nr_fds = 0;
 810
 811         if (sk) {
 812                 s_state = READ_ONCE(sk->sk_state);
 813                 u = unix_sk(sk);
 814
 815                 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
 816                  * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
 817                  * SOCK_DGRAM is ordinary. So, no lock is needed.
 818                  */
 819                 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
 820                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
 821                 else if (s_state == TCP_LISTEN)
 822                         nr_fds = unix_count_nr_fds(sk);
 823
 824                 seq_printf(m, "scm_fds: %u\n", nr_fds);
 825         }
 826 }
 827 #else
 828 #define unix_show_fdinfo NULL
 829 #endif
 830
 831 static const struct proto_ops unix_stream_ops = {
 832         .family =       PF_UNIX,
 833         .owner =        THIS_MODULE,
 834         .release =      unix_release,
 835         .bind =         unix_bind,
 836         .connect =      unix_stream_connect,
 837         .socketpair =   unix_socketpair,
 838         .accept =       unix_accept,
 839         .getname =      unix_getname,
 840         .poll =         unix_poll,
 841         .ioctl =        unix_ioctl,
 842 #ifdef CONFIG_COMPAT
 843         .compat_ioctl = unix_compat_ioctl,
 844 #endif
 845         .listen =       unix_listen,
 846         .shutdown =     unix_shutdown,
 847         .sendmsg =      unix_stream_sendmsg,
 848         .recvmsg =      unix_stream_recvmsg,
 849         .read_skb =     unix_stream_read_skb,
 850         .mmap =         sock_no_mmap,
 851         .splice_read =  unix_stream_splice_read,
 852         .set_peek_off = sk_set_peek_off,
 853         .show_fdinfo =  unix_show_fdinfo,
 854 };
 855
 856 static const struct proto_ops unix_dgram_ops = {
 857         .family =       PF_UNIX,
 858         .owner =        THIS_MODULE,
 859         .release =      unix_release,
 860         .bind =         unix_bind,
 861         .connect =      unix_dgram_connect,
 862         .socketpair =   unix_socketpair,
 863         .accept =       sock_no_accept,
 864         .getname =      unix_getname,
 865         .poll =         unix_dgram_poll,
 866         .ioctl =        unix_ioctl,
 867 #ifdef CONFIG_COMPAT
 868         .compat_ioctl = unix_compat_ioctl,
 869 #endif
 870         .listen =       sock_no_listen,
 871         .shutdown =     unix_shutdown,
 872         .sendmsg =      unix_dgram_sendmsg,
 873         .read_skb =     unix_read_skb,
 874         .recvmsg =      unix_dgram_recvmsg,
 875         .mmap =         sock_no_mmap,
 876         .set_peek_off = sk_set_peek_off,
 877         .show_fdinfo =  unix_show_fdinfo,
 878 };
 879
 880 static const struct proto_ops unix_seqpacket_ops = {
 881         .family =       PF_UNIX,
 882         .owner =        THIS_MODULE,
 883         .release =      unix_release,
 884         .bind =         unix_bind,
 885         .connect =      unix_stream_connect,
 886         .socketpair =   unix_socketpair,
 887         .accept =       unix_accept,
 888         .getname =      unix_getname,
 889         .poll =         unix_dgram_poll,
 890         .ioctl =        unix_ioctl,
 891 #ifdef CONFIG_COMPAT
 892         .compat_ioctl = unix_compat_ioctl,
 893 #endif
 894         .listen =       unix_listen,
 895         .shutdown =     unix_shutdown,
 896         .sendmsg =      unix_seqpacket_sendmsg,
 897         .recvmsg =      unix_seqpacket_recvmsg,
 898         .mmap =         sock_no_mmap,
 899         .set_peek_off = sk_set_peek_off,
 900         .show_fdinfo =  unix_show_fdinfo,
 901 };
 902
 903 static void unix_close(struct sock *sk, long timeout)
 904 {
 905         /* Nothing to do here, unix socket does not need a ->close().
 906          * This is merely for sockmap.
 907          */
 908 }
 909
 910 static void unix_unhash(struct sock *sk)
 911 {
 912         /* Nothing to do here, unix socket does not need a ->unhash().
 913          * This is merely for sockmap.
 914          */
 915 }
 916
 917 static bool unix_bpf_bypass_getsockopt(int level, int optname)
 918 {
 919         if (level == SOL_SOCKET) {
 920                 switch (optname) {
 921                 case SO_PEERPIDFD:
 922                         return true;
 923                 default:
 924                         return false;
 925                 }
 926         }
 927
 928         return false;
 929 }
 930
 931 struct proto unix_dgram_proto = {
 932         .name                   = "UNIX",
 933         .owner                  = THIS_MODULE,
 934         .obj_size               = sizeof(struct unix_sock),
 935         .close                  = unix_close,
 936         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 937 #ifdef CONFIG_BPF_SYSCALL
 938         .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
 939 #endif
 940 };
 941
 942 struct proto unix_stream_proto = {
 943         .name                   = "UNIX-STREAM",
 944         .owner                  = THIS_MODULE,
 945         .obj_size               = sizeof(struct unix_sock),
 946         .close                  = unix_close,
 947         .unhash                 = unix_unhash,
 948         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 949 #ifdef CONFIG_BPF_SYSCALL
 950         .psock_update_sk_prot   = unix_stream_bpf_update_proto,
 951 #endif
 952 };
 953
 954 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
 955 {
 956         struct unix_sock *u;
 957         struct sock *sk;
 958         int err;
 959
 960         atomic_long_inc(&unix_nr_socks);
 961         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
 962                 err = -ENFILE;
 963                 goto err;
 964         }
 965
 966         if (type == SOCK_STREAM)
 967                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
 968         else /*dgram and  seqpacket */
 969                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
 970
 971         if (!sk) {
 972                 err = -ENOMEM;
 973                 goto err;
 974         }
 975
 976         sock_init_data(sock, sk);
 977
 978         sk->sk_hash             = unix_unbound_hash(sk);
 979         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 980         sk->sk_write_space      = unix_write_space;
 981         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 982         sk->sk_destruct         = unix_sock_destructor;
 983         u = unix_sk(sk);
 984         u->inflight = 0;
 985         u->path.dentry = NULL;
 986         u->path.mnt = NULL;
 987         spin_lock_init(&u->lock);
 988         INIT_LIST_HEAD(&u->link);
 989         mutex_init(&u->iolock); /* single task reading lock */
 990         mutex_init(&u->bindlock); /* single task binding lock */
 991         init_waitqueue_head(&u->peer_wait);
 992         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 993         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
 994         unix_insert_unbound_socket(net, sk);
 995
 996         sock_prot_inuse_add(net, sk->sk_prot, 1);
 997
 998         return sk;
 999
1000 err:
1001         atomic_long_dec(&unix_nr_socks);
1002         return ERR_PTR(err);
1003 }
1004
1005 static int unix_create(struct net *net, struct socket *sock, int protocol,
1006                        int kern)
1007 {
1008         struct sock *sk;
1009
1010         if (protocol && protocol != PF_UNIX)
1011                 return -EPROTONOSUPPORT;
1012
1013         sock->state = SS_UNCONNECTED;
1014
1015         switch (sock->type) {
1016         case SOCK_STREAM:
1017                 sock->ops = &unix_stream_ops;
1018                 break;
1019                 /*
1020                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1021                  *      nothing uses it.
1022                  */
1023         case SOCK_RAW:
1024                 sock->type = SOCK_DGRAM;
1025                 fallthrough;
1026         case SOCK_DGRAM:
1027                 sock->ops = &unix_dgram_ops;
1028                 break;
1029         case SOCK_SEQPACKET:
1030                 sock->ops = &unix_seqpacket_ops;
1031                 break;
1032         default:
1033                 return -ESOCKTNOSUPPORT;
1034         }
1035
1036         sk = unix_create1(net, sock, kern, sock->type);
1037         if (IS_ERR(sk))
1038                 return PTR_ERR(sk);
1039
1040         return 0;
1041 }
1042
1043 static int unix_release(struct socket *sock)
1044 {
1045         struct sock *sk = sock->sk;
1046
1047         if (!sk)
1048                 return 0;
1049
1050         sk->sk_prot->close(sk, 0);
1051         unix_release_sock(sk, 0);
1052         sock->sk = NULL;
1053
1054         return 0;
1055 }
1056
1057 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1058                                   int type)
1059 {
1060         struct inode *inode;
1061         struct path path;
1062         struct sock *sk;
1063         int err;
1064
1065         unix_mkname_bsd(sunaddr, addr_len);
1066         err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1067         if (err)
1068                 goto fail;
1069
1070         err = path_permission(&path, MAY_WRITE);
1071         if (err)
1072                 goto path_put;
1073
1074         err = -ECONNREFUSED;
1075         inode = d_backing_inode(path.dentry);
1076         if (!S_ISSOCK(inode->i_mode))
1077                 goto path_put;
1078
1079         sk = unix_find_socket_byinode(inode);
1080         if (!sk)
1081                 goto path_put;
1082
1083         err = -EPROTOTYPE;
1084         if (sk->sk_type == type)
1085                 touch_atime(&path);
1086         else
1087                 goto sock_put;
1088
1089         path_put(&path);
1090
1091         return sk;
1092
1093 sock_put:
1094         sock_put(sk);
1095 path_put:
1096         path_put(&path);
1097 fail:
1098         return ERR_PTR(err);
1099 }
1100
1101 static struct sock *unix_find_abstract(struct net *net,
1102                                        struct sockaddr_un *sunaddr,
1103                                        int addr_len, int type)
1104 {
1105         unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1106         struct dentry *dentry;
1107         struct sock *sk;
1108
1109         sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1110         if (!sk)
1111                 return ERR_PTR(-ECONNREFUSED);
1112
1113         dentry = unix_sk(sk)->path.dentry;
1114         if (dentry)
1115                 touch_atime(&unix_sk(sk)->path);
1116
1117         return sk;
1118 }
1119
1120 static struct sock *unix_find_other(struct net *net,
1121                                     struct sockaddr_un *sunaddr,
1122                                     int addr_len, int type)
1123 {
1124         struct sock *sk;
1125
1126         if (sunaddr->sun_path[0])
1127                 sk = unix_find_bsd(sunaddr, addr_len, type);
1128         else
1129                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1130
1131         return sk;
1132 }
1133
1134 static int unix_autobind(struct sock *sk)
1135 {
1136         unsigned int new_hash, old_hash = sk->sk_hash;
1137         struct unix_sock *u = unix_sk(sk);
1138         struct net *net = sock_net(sk);
1139         struct unix_address *addr;
1140         u32 lastnum, ordernum;
1141         int err;
1142
1143         err = mutex_lock_interruptible(&u->bindlock);
1144         if (err)
1145                 return err;
1146
1147         if (u->addr)
1148                 goto out;
1149
1150         err = -ENOMEM;
1151         addr = kzalloc(sizeof(*addr) +
1152                        offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1153         if (!addr)
1154                 goto out;
1155
1156         addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1157         addr->name->sun_family = AF_UNIX;
1158         refcount_set(&addr->refcnt, 1);
1159
1160         ordernum = get_random_u32();
1161         lastnum = ordernum & 0xFFFFF;
1162 retry:
1163         ordernum = (ordernum + 1) & 0xFFFFF;
1164         sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1165
1166         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1167         unix_table_double_lock(net, old_hash, new_hash);
1168
1169         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1170                 unix_table_double_unlock(net, old_hash, new_hash);
1171
1172                 /* __unix_find_socket_byname() may take long time if many names
1173                  * are already in use.
1174                  */
1175                 cond_resched();
1176
1177                 if (ordernum == lastnum) {
1178                         /* Give up if all names seems to be in use. */
1179                         err = -ENOSPC;
1180                         unix_release_addr(addr);
1181                         goto out;
1182                 }
1183
1184                 goto retry;
1185         }
1186
1187         __unix_set_addr_hash(net, sk, addr, new_hash);
1188         unix_table_double_unlock(net, old_hash, new_hash);
1189         err = 0;
1190
1191 out:    mutex_unlock(&u->bindlock);
1192         return err;
1193 }
1194
1195 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1196                          int addr_len)
1197 {
1198         umode_t mode = S_IFSOCK |
1199                (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1200         unsigned int new_hash, old_hash = sk->sk_hash;
1201         struct unix_sock *u = unix_sk(sk);
1202         struct net *net = sock_net(sk);
1203         struct mnt_idmap *idmap;
1204         struct unix_address *addr;
1205         struct dentry *dentry;
1206         struct path parent;
1207         int err;
1208
1209         addr_len = unix_mkname_bsd(sunaddr, addr_len);
1210         addr = unix_create_addr(sunaddr, addr_len);
1211         if (!addr)
1212                 return -ENOMEM;
1213
1214         /*
1215          * Get the parent directory, calculate the hash for last
1216          * component.
1217          */
1218         dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1219         if (IS_ERR(dentry)) {
1220                 err = PTR_ERR(dentry);
1221                 goto out;
1222         }
1223
1224         /*
1225          * All right, let's create it.
1226          */
1227         idmap = mnt_idmap(parent.mnt);
1228         err = security_path_mknod(&parent, dentry, mode, 0);
1229         if (!err)
1230                 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1231         if (err)
1232                 goto out_path;
1233         err = mutex_lock_interruptible(&u->bindlock);
1234         if (err)
1235                 goto out_unlink;
1236         if (u->addr)
1237                 goto out_unlock;
1238
1239         new_hash = unix_bsd_hash(d_backing_inode(dentry));
1240         unix_table_double_lock(net, old_hash, new_hash);
1241         u->path.mnt = mntget(parent.mnt);
1242         u->path.dentry = dget(dentry);
1243         __unix_set_addr_hash(net, sk, addr, new_hash);
1244         unix_table_double_unlock(net, old_hash, new_hash);
1245         unix_insert_bsd_socket(sk);
1246         mutex_unlock(&u->bindlock);
1247         done_path_create(&parent, dentry);
1248         return 0;
1249
1250 out_unlock:
1251         mutex_unlock(&u->bindlock);
1252         err = -EINVAL;
1253 out_unlink:
1254         /* failed after successful mknod?  unlink what we'd created... */
1255         vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1256 out_path:
1257         done_path_create(&parent, dentry);
1258 out:
1259         unix_release_addr(addr);
1260         return err == -EEXIST ? -EADDRINUSE : err;
1261 }
1262
1263 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1264                               int addr_len)
1265 {
1266         unsigned int new_hash, old_hash = sk->sk_hash;
1267         struct unix_sock *u = unix_sk(sk);
1268         struct net *net = sock_net(sk);
1269         struct unix_address *addr;
1270         int err;
1271
1272         addr = unix_create_addr(sunaddr, addr_len);
1273         if (!addr)
1274                 return -ENOMEM;
1275
1276         err = mutex_lock_interruptible(&u->bindlock);
1277         if (err)
1278                 goto out;
1279
1280         if (u->addr) {
1281                 err = -EINVAL;
1282                 goto out_mutex;
1283         }
1284
1285         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1286         unix_table_double_lock(net, old_hash, new_hash);
1287
1288         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1289                 goto out_spin;
1290
1291         __unix_set_addr_hash(net, sk, addr, new_hash);
1292         unix_table_double_unlock(net, old_hash, new_hash);
1293         mutex_unlock(&u->bindlock);
1294         return 0;
1295
1296 out_spin:
1297         unix_table_double_unlock(net, old_hash, new_hash);
1298         err = -EADDRINUSE;
1299 out_mutex:
1300         mutex_unlock(&u->bindlock);
1301 out:
1302         unix_release_addr(addr);
1303         return err;
1304 }
1305
1306 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1307 {
1308         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1309         struct sock *sk = sock->sk;
1310         int err;
1311
1312         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1313             sunaddr->sun_family == AF_UNIX)
1314                 return unix_autobind(sk);
1315
1316         err = unix_validate_addr(sunaddr, addr_len);
1317         if (err)
1318                 return err;
1319
1320         if (sunaddr->sun_path[0])
1321                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1322         else
1323                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1324
1325         return err;
1326 }
1327
1328 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1329 {
1330         if (unlikely(sk1 == sk2) || !sk2) {
1331                 unix_state_lock(sk1);
1332                 return;
1333         }
1334         if (sk1 > sk2)
1335                 swap(sk1, sk2);
1336
1337         unix_state_lock(sk1);
1338         unix_state_lock_nested(sk2, U_LOCK_SECOND);
1339 }
1340
1341 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1342 {
1343         if (unlikely(sk1 == sk2) || !sk2) {
1344                 unix_state_unlock(sk1);
1345                 return;
1346         }
1347         unix_state_unlock(sk1);
1348         unix_state_unlock(sk2);
1349 }
1350
1351 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1352                               int alen, int flags)
1353 {
1354         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1355         struct sock *sk = sock->sk;
1356         struct sock *other;
1357         int err;
1358
1359         err = -EINVAL;
1360         if (alen < offsetofend(struct sockaddr, sa_family))
1361                 goto out;
1362
1363         if (addr->sa_family != AF_UNSPEC) {
1364                 err = unix_validate_addr(sunaddr, alen);
1365                 if (err)
1366                         goto out;
1367
1368                 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1369                 if (err)
1370                         goto out;
1371
1372                 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1373                      test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1374                     !unix_sk(sk)->addr) {
1375                         err = unix_autobind(sk);
1376                         if (err)
1377                                 goto out;
1378                 }
1379
1380 restart:
1381                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1382                 if (IS_ERR(other)) {
1383                         err = PTR_ERR(other);
1384                         goto out;
1385                 }
1386
1387                 unix_state_double_lock(sk, other);
1388
1389                 /* Apparently VFS overslept socket death. Retry. */
1390                 if (sock_flag(other, SOCK_DEAD)) {
1391                         unix_state_double_unlock(sk, other);
1392                         sock_put(other);
1393                         goto restart;
1394                 }
1395
1396                 err = -EPERM;
1397                 if (!unix_may_send(sk, other))
1398                         goto out_unlock;
1399
1400                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1401                 if (err)
1402                         goto out_unlock;
1403
1404                 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1405         } else {
1406                 /*
1407                  *      1003.1g breaking connected state with AF_UNSPEC
1408                  */
1409                 other = NULL;
1410                 unix_state_double_lock(sk, other);
1411         }
1412
1413         /*
1414          * If it was connected, reconnect.
1415          */
1416         if (unix_peer(sk)) {
1417                 struct sock *old_peer = unix_peer(sk);
1418
1419                 unix_peer(sk) = other;
1420                 if (!other)
1421                         sk->sk_state = TCP_CLOSE;
1422                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1423
1424                 unix_state_double_unlock(sk, other);
1425
1426                 if (other != old_peer)
1427                         unix_dgram_disconnected(sk, old_peer);
1428                 sock_put(old_peer);
1429         } else {
1430                 unix_peer(sk) = other;
1431                 unix_state_double_unlock(sk, other);
1432         }
1433
1434         return 0;
1435
1436 out_unlock:
1437         unix_state_double_unlock(sk, other);
1438         sock_put(other);
1439 out:
1440         return err;
1441 }
1442
1443 static long unix_wait_for_peer(struct sock *other, long timeo)
1444         __releases(&unix_sk(other)->lock)
1445 {
1446         struct unix_sock *u = unix_sk(other);
1447         int sched;
1448         DEFINE_WAIT(wait);
1449
1450         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1451
1452         sched = !sock_flag(other, SOCK_DEAD) &&
1453                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1454                 unix_recvq_full_lockless(other);
1455
1456         unix_state_unlock(other);
1457
1458         if (sched)
1459                 timeo = schedule_timeout(timeo);
1460
1461         finish_wait(&u->peer_wait, &wait);
1462         return timeo;
1463 }
1464
1465 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1466                                int addr_len, int flags)
1467 {
1468         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1469         struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1470         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1471         struct net *net = sock_net(sk);
1472         struct sk_buff *skb = NULL;
1473         long timeo;
1474         int err;
1475         int st;
1476
1477         err = unix_validate_addr(sunaddr, addr_len);
1478         if (err)
1479                 goto out;
1480
1481         err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1482         if (err)
1483                 goto out;
1484
1485         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1486              test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1487                 err = unix_autobind(sk);
1488                 if (err)
1489                         goto out;
1490         }
1491
1492         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1493
1494         /* First of all allocate resources.
1495            If we will make it after state is locked,
1496            we will have to recheck all again in any case.
1497          */
1498
1499         /* create new sock for complete connection */
1500         newsk = unix_create1(net, NULL, 0, sock->type);
1501         if (IS_ERR(newsk)) {
1502                 err = PTR_ERR(newsk);
1503                 newsk = NULL;
1504                 goto out;
1505         }
1506
1507         err = -ENOMEM;
1508
1509         /* Allocate skb for sending to listening sock */
1510         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1511         if (skb == NULL)
1512                 goto out;
1513
1514 restart:
1515         /*  Find listening sock. */
1516         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1517         if (IS_ERR(other)) {
1518                 err = PTR_ERR(other);
1519                 other = NULL;
1520                 goto out;
1521         }
1522
1523         /* Latch state of peer */
1524         unix_state_lock(other);
1525
1526         /* Apparently VFS overslept socket death. Retry. */
1527         if (sock_flag(other, SOCK_DEAD)) {
1528                 unix_state_unlock(other);
1529                 sock_put(other);
1530                 goto restart;
1531         }
1532
1533         err = -ECONNREFUSED;
1534         if (other->sk_state != TCP_LISTEN)
1535                 goto out_unlock;
1536         if (other->sk_shutdown & RCV_SHUTDOWN)
1537                 goto out_unlock;
1538
1539         if (unix_recvq_full(other)) {
1540                 err = -EAGAIN;
1541                 if (!timeo)
1542                         goto out_unlock;
1543
1544                 timeo = unix_wait_for_peer(other, timeo);
1545
1546                 err = sock_intr_errno(timeo);
1547                 if (signal_pending(current))
1548                         goto out;
1549                 sock_put(other);
1550                 goto restart;
1551         }
1552
1553         /* Latch our state.
1554
1555            It is tricky place. We need to grab our state lock and cannot
1556            drop lock on peer. It is dangerous because deadlock is
1557            possible. Connect to self case and simultaneous
1558            attempt to connect are eliminated by checking socket
1559            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1560            check this before attempt to grab lock.
1561
1562            Well, and we have to recheck the state after socket locked.
1563          */
1564         st = sk->sk_state;
1565
1566         switch (st) {
1567         case TCP_CLOSE:
1568                 /* This is ok... continue with connect */
1569                 break;
1570         case TCP_ESTABLISHED:
1571                 /* Socket is already connected */
1572                 err = -EISCONN;
1573                 goto out_unlock;
1574         default:
1575                 err = -EINVAL;
1576                 goto out_unlock;
1577         }
1578
1579         unix_state_lock_nested(sk, U_LOCK_SECOND);
1580
1581         if (sk->sk_state != st) {
1582                 unix_state_unlock(sk);
1583                 unix_state_unlock(other);
1584                 sock_put(other);
1585                 goto restart;
1586         }
1587
1588         err = security_unix_stream_connect(sk, other, newsk);
1589         if (err) {
1590                 unix_state_unlock(sk);
1591                 goto out_unlock;
1592         }
1593
1594         /* The way is open! Fastly set all the necessary fields... */
1595
1596         sock_hold(sk);
1597         unix_peer(newsk)        = sk;
1598         newsk->sk_state         = TCP_ESTABLISHED;
1599         newsk->sk_type          = sk->sk_type;
1600         init_peercred(newsk);
1601         newu = unix_sk(newsk);
1602         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1603         otheru = unix_sk(other);
1604
1605         /* copy address information from listening to new sock
1606          *
1607          * The contents of *(otheru->addr) and otheru->path
1608          * are seen fully set up here, since we have found
1609          * otheru in hash under its lock.  Insertion into the
1610          * hash chain we'd found it in had been done in an
1611          * earlier critical area protected by the chain's lock,
1612          * the same one where we'd set *(otheru->addr) contents,
1613          * as well as otheru->path and otheru->addr itself.
1614          *
1615          * Using smp_store_release() here to set newu->addr
1616          * is enough to make those stores, as well as stores
1617          * to newu->path visible to anyone who gets newu->addr
1618          * by smp_load_acquire().  IOW, the same warranties
1619          * as for unix_sock instances bound in unix_bind() or
1620          * in unix_autobind().
1621          */
1622         if (otheru->path.dentry) {
1623                 path_get(&otheru->path);
1624                 newu->path = otheru->path;
1625         }
1626         refcount_inc(&otheru->addr->refcnt);
1627         smp_store_release(&newu->addr, otheru->addr);
1628
1629         /* Set credentials */
1630         copy_peercred(sk, other);
1631
1632         sock->state     = SS_CONNECTED;
1633         sk->sk_state    = TCP_ESTABLISHED;
1634         sock_hold(newsk);
1635
1636         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1637         unix_peer(sk)   = newsk;
1638
1639         unix_state_unlock(sk);
1640
1641         /* take ten and send info to listening sock */
1642         spin_lock(&other->sk_receive_queue.lock);
1643         __skb_queue_tail(&other->sk_receive_queue, skb);
1644         spin_unlock(&other->sk_receive_queue.lock);
1645         unix_state_unlock(other);
1646         other->sk_data_ready(other);
1647         sock_put(other);
1648         return 0;
1649
1650 out_unlock:
1651         if (other)
1652                 unix_state_unlock(other);
1653
1654 out:
1655         kfree_skb(skb);
1656         if (newsk)
1657                 unix_release_sock(newsk, 0);
1658         if (other)
1659                 sock_put(other);
1660         return err;
1661 }
1662
1663 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1664 {
1665         struct sock *ska = socka->sk, *skb = sockb->sk;
1666
1667         /* Join our sockets back to back */
1668         sock_hold(ska);
1669         sock_hold(skb);
1670         unix_peer(ska) = skb;
1671         unix_peer(skb) = ska;
1672         init_peercred(ska);
1673         init_peercred(skb);
1674
1675         ska->sk_state = TCP_ESTABLISHED;
1676         skb->sk_state = TCP_ESTABLISHED;
1677         socka->state  = SS_CONNECTED;
1678         sockb->state  = SS_CONNECTED;
1679         return 0;
1680 }
1681
1682 static void unix_sock_inherit_flags(const struct socket *old,
1683                                     struct socket *new)
1684 {
1685         if (test_bit(SOCK_PASSCRED, &old->flags))
1686                 set_bit(SOCK_PASSCRED, &new->flags);
1687         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1688                 set_bit(SOCK_PASSPIDFD, &new->flags);
1689         if (test_bit(SOCK_PASSSEC, &old->flags))
1690                 set_bit(SOCK_PASSSEC, &new->flags);
1691 }
1692
1693 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1694                        bool kern)
1695 {
1696         struct sock *sk = sock->sk;
1697         struct sock *tsk;
1698         struct sk_buff *skb;
1699         int err;
1700
1701         err = -EOPNOTSUPP;
1702         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1703                 goto out;
1704
1705         err = -EINVAL;
1706         if (sk->sk_state != TCP_LISTEN)
1707                 goto out;
1708
1709         /* If socket state is TCP_LISTEN it cannot change (for now...),
1710          * so that no locks are necessary.
1711          */
1712
1713         skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1714                                 &err);
1715         if (!skb) {
1716                 /* This means receive shutdown. */
1717                 if (err == 0)
1718                         err = -EINVAL;
1719                 goto out;
1720         }
1721
1722         tsk = skb->sk;
1723         skb_free_datagram(sk, skb);
1724         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1725
1726         /* attach accepted sock to socket */
1727         unix_state_lock(tsk);
1728         newsock->state = SS_CONNECTED;
1729         unix_sock_inherit_flags(sock, newsock);
1730         sock_graft(tsk, newsock);
1731         unix_state_unlock(tsk);
1732         return 0;
1733
1734 out:
1735         return err;
1736 }
1737
1738
1739 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1740 {
1741         struct sock *sk = sock->sk;
1742         struct unix_address *addr;
1743         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1744         int err = 0;
1745
1746         if (peer) {
1747                 sk = unix_peer_get(sk);
1748
1749                 err = -ENOTCONN;
1750                 if (!sk)
1751                         goto out;
1752                 err = 0;
1753         } else {
1754                 sock_hold(sk);
1755         }
1756
1757         addr = smp_load_acquire(&unix_sk(sk)->addr);
1758         if (!addr) {
1759                 sunaddr->sun_family = AF_UNIX;
1760                 sunaddr->sun_path[0] = 0;
1761                 err = offsetof(struct sockaddr_un, sun_path);
1762         } else {
1763                 err = addr->len;
1764                 memcpy(sunaddr, addr->name, addr->len);
1765
1766                 if (peer)
1767                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1768                                                CGROUP_UNIX_GETPEERNAME);
1769                 else
1770                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1771                                                CGROUP_UNIX_GETSOCKNAME);
1772         }
1773         sock_put(sk);
1774 out:
1775         return err;
1776 }
1777
1778 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1779 {
1780         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1781
1782         /*
1783          * Garbage collection of unix sockets starts by selecting a set of
1784          * candidate sockets which have reference only from being in flight
1785          * (total_refs == inflight_refs).  This condition is checked once during
1786          * the candidate collection phase, and candidates are marked as such, so
1787          * that non-candidates can later be ignored.  While inflight_refs is
1788          * protected by unix_gc_lock, total_refs (file count) is not, hence this
1789          * is an instantaneous decision.
1790          *
1791          * Once a candidate, however, the socket must not be reinstalled into a
1792          * file descriptor while the garbage collection is in progress.
1793          *
1794          * If the above conditions are met, then the directed graph of
1795          * candidates (*) does not change while unix_gc_lock is held.
1796          *
1797          * Any operations that changes the file count through file descriptors
1798          * (dup, close, sendmsg) does not change the graph since candidates are
1799          * not installed in fds.
1800          *
1801          * Dequeing a candidate via recvmsg would install it into an fd, but
1802          * that takes unix_gc_lock to decrement the inflight count, so it's
1803          * serialized with garbage collection.
1804          *
1805          * MSG_PEEK is special in that it does not change the inflight count,
1806          * yet does install the socket into an fd.  The following lock/unlock
1807          * pair is to ensure serialization with garbage collection.  It must be
1808          * done between incrementing the file count and installing the file into
1809          * an fd.
1810          *
1811          * If garbage collection starts after the barrier provided by the
1812          * lock/unlock, then it will see the elevated refcount and not mark this
1813          * as a candidate.  If a garbage collection is already in progress
1814          * before the file count was incremented, then the lock/unlock pair will
1815          * ensure that garbage collection is finished before progressing to
1816          * installing the fd.
1817          *
1818          * (*) A -> B where B is on the queue of A or B is on the queue of C
1819          * which is on the queue of listening socket A.
1820          */
1821         spin_lock(&unix_gc_lock);
1822         spin_unlock(&unix_gc_lock);
1823 }
1824
1825 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1826 {
1827         int err = 0;
1828
1829         UNIXCB(skb).pid  = get_pid(scm->pid);
1830         UNIXCB(skb).uid = scm->creds.uid;
1831         UNIXCB(skb).gid = scm->creds.gid;
1832         UNIXCB(skb).fp = NULL;
1833         unix_get_secdata(scm, skb);
1834         if (scm->fp && send_fds)
1835                 err = unix_attach_fds(scm, skb);
1836
1837         skb->destructor = unix_destruct_scm;
1838         return err;
1839 }
1840
1841 static bool unix_passcred_enabled(const struct socket *sock,
1842                                   const struct sock *other)
1843 {
1844         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1845                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1846                !other->sk_socket ||
1847                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1848                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1849 }
1850
1851 /*
1852  * Some apps rely on write() giving SCM_CREDENTIALS
1853  * We include credentials if source or destination socket
1854  * asserted SOCK_PASSCRED.
1855  */
1856 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1857                             const struct sock *other)
1858 {
1859         if (UNIXCB(skb).pid)
1860                 return;
1861         if (unix_passcred_enabled(sock, other)) {
1862                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1863                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1864         }
1865 }
1866
1867 static bool unix_skb_scm_eq(struct sk_buff *skb,
1868                             struct scm_cookie *scm)
1869 {
1870         return UNIXCB(skb).pid == scm->pid &&
1871                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1872                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1873                unix_secdata_eq(scm, skb);
1874 }
1875
1876 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1877 {
1878         struct scm_fp_list *fp = UNIXCB(skb).fp;
1879         struct unix_sock *u = unix_sk(sk);
1880
1881         if (unlikely(fp && fp->count))
1882                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1883 }
1884
1885 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1886 {
1887         struct scm_fp_list *fp = UNIXCB(skb).fp;
1888         struct unix_sock *u = unix_sk(sk);
1889
1890         if (unlikely(fp && fp->count))
1891                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1892 }
1893
1894 /*
1895  *      Send AF_UNIX data.
1896  */
1897
1898 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1899                               size_t len)
1900 {
1901         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1902         struct sock *sk = sock->sk, *other = NULL;
1903         struct unix_sock *u = unix_sk(sk);
1904         struct scm_cookie scm;
1905         struct sk_buff *skb;
1906         int data_len = 0;
1907         int sk_locked;
1908         long timeo;
1909         int err;
1910
1911         wait_for_unix_gc();
1912         err = scm_send(sock, msg, &scm, false);
1913         if (err < 0)
1914                 return err;
1915
1916         err = -EOPNOTSUPP;
1917         if (msg->msg_flags&MSG_OOB)
1918                 goto out;
1919
1920         if (msg->msg_namelen) {
1921                 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1922                 if (err)
1923                         goto out;
1924
1925                 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1926                                                             msg->msg_name,
1927                                                             &msg->msg_namelen,
1928                                                             NULL);
1929                 if (err)
1930                         goto out;
1931         } else {
1932                 sunaddr = NULL;
1933                 err = -ENOTCONN;
1934                 other = unix_peer_get(sk);
1935                 if (!other)
1936                         goto out;
1937         }
1938
1939         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1940              test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1941                 err = unix_autobind(sk);
1942                 if (err)
1943                         goto out;
1944         }
1945
1946         err = -EMSGSIZE;
1947         if (len > sk->sk_sndbuf - 32)
1948                 goto out;
1949
1950         if (len > SKB_MAX_ALLOC) {
1951                 data_len = min_t(size_t,
1952                                  len - SKB_MAX_ALLOC,
1953                                  MAX_SKB_FRAGS * PAGE_SIZE);
1954                 data_len = PAGE_ALIGN(data_len);
1955
1956                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1957         }
1958
1959         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1960                                    msg->msg_flags & MSG_DONTWAIT, &err,
1961                                    PAGE_ALLOC_COSTLY_ORDER);
1962         if (skb == NULL)
1963                 goto out;
1964
1965         err = unix_scm_to_skb(&scm, skb, true);
1966         if (err < 0)
1967                 goto out_free;
1968
1969         skb_put(skb, len - data_len);
1970         skb->data_len = data_len;
1971         skb->len = len;
1972         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1973         if (err)
1974                 goto out_free;
1975
1976         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1977
1978 restart:
1979         if (!other) {
1980                 err = -ECONNRESET;
1981                 if (sunaddr == NULL)
1982                         goto out_free;
1983
1984                 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1985                                         sk->sk_type);
1986                 if (IS_ERR(other)) {
1987                         err = PTR_ERR(other);
1988                         other = NULL;
1989                         goto out_free;
1990                 }
1991         }
1992
1993         if (sk_filter(other, skb) < 0) {
1994                 /* Toss the packet but do not return any error to the sender */
1995                 err = len;
1996                 goto out_free;
1997         }
1998
1999         sk_locked = 0;
2000         unix_state_lock(other);
2001 restart_locked:
2002         err = -EPERM;
2003         if (!unix_may_send(sk, other))
2004                 goto out_unlock;
2005
2006         if (unlikely(sock_flag(other, SOCK_DEAD))) {
2007                 /*
2008                  *      Check with 1003.1g - what should
2009                  *      datagram error
2010                  */
2011                 unix_state_unlock(other);
2012                 sock_put(other);
2013
2014                 if (!sk_locked)
2015                         unix_state_lock(sk);
2016
2017                 err = 0;
2018                 if (sk->sk_type == SOCK_SEQPACKET) {
2019                         /* We are here only when racing with unix_release_sock()
2020                          * is clearing @other. Never change state to TCP_CLOSE
2021                          * unlike SOCK_DGRAM wants.
2022                          */
2023                         unix_state_unlock(sk);
2024                         err = -EPIPE;
2025                 } else if (unix_peer(sk) == other) {
2026                         unix_peer(sk) = NULL;
2027                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2028
2029                         sk->sk_state = TCP_CLOSE;
2030                         unix_state_unlock(sk);
2031
2032                         unix_dgram_disconnected(sk, other);
2033                         sock_put(other);
2034                         err = -ECONNREFUSED;
2035                 } else {
2036                         unix_state_unlock(sk);
2037                 }
2038
2039                 other = NULL;
2040                 if (err)
2041                         goto out_free;
2042                 goto restart;
2043         }
2044
2045         err = -EPIPE;
2046         if (other->sk_shutdown & RCV_SHUTDOWN)
2047                 goto out_unlock;
2048
2049         if (sk->sk_type != SOCK_SEQPACKET) {
2050                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2051                 if (err)
2052                         goto out_unlock;
2053         }
2054
2055         /* other == sk && unix_peer(other) != sk if
2056          * - unix_peer(sk) == NULL, destination address bound to sk
2057          * - unix_peer(sk) == sk by time of get but disconnected before lock
2058          */
2059         if (other != sk &&
2060             unlikely(unix_peer(other) != sk &&
2061             unix_recvq_full_lockless(other))) {
2062                 if (timeo) {
2063                         timeo = unix_wait_for_peer(other, timeo);
2064
2065                         err = sock_intr_errno(timeo);
2066                         if (signal_pending(current))
2067                                 goto out_free;
2068
2069                         goto restart;
2070                 }
2071
2072                 if (!sk_locked) {
2073                         unix_state_unlock(other);
2074                         unix_state_double_lock(sk, other);
2075                 }
2076
2077                 if (unix_peer(sk) != other ||
2078                     unix_dgram_peer_wake_me(sk, other)) {
2079                         err = -EAGAIN;
2080                         sk_locked = 1;
2081                         goto out_unlock;
2082                 }
2083
2084                 if (!sk_locked) {
2085                         sk_locked = 1;
2086                         goto restart_locked;
2087                 }
2088         }
2089
2090         if (unlikely(sk_locked))
2091                 unix_state_unlock(sk);
2092
2093         if (sock_flag(other, SOCK_RCVTSTAMP))
2094                 __net_timestamp(skb);
2095         maybe_add_creds(skb, sock, other);
2096         scm_stat_add(other, skb);
2097         skb_queue_tail(&other->sk_receive_queue, skb);
2098         unix_state_unlock(other);
2099         other->sk_data_ready(other);
2100         sock_put(other);
2101         scm_destroy(&scm);
2102         return len;
2103
2104 out_unlock:
2105         if (sk_locked)
2106                 unix_state_unlock(sk);
2107         unix_state_unlock(other);
2108 out_free:
2109         kfree_skb(skb);
2110 out:
2111         if (other)
2112                 sock_put(other);
2113         scm_destroy(&scm);
2114         return err;
2115 }
2116
2117 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2118  * bytes, and a minimum of a full page.
2119  */
2120 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2121
2122 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2123 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2124                      struct scm_cookie *scm, bool fds_sent)
2125 {
2126         struct unix_sock *ousk = unix_sk(other);
2127         struct sk_buff *skb;
2128         int err = 0;
2129
2130         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2131
2132         if (!skb)
2133                 return err;
2134
2135         err = unix_scm_to_skb(scm, skb, !fds_sent);
2136         if (err < 0) {
2137                 kfree_skb(skb);
2138                 return err;
2139         }
2140         skb_put(skb, 1);
2141         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2142
2143         if (err) {
2144                 kfree_skb(skb);
2145                 return err;
2146         }
2147
2148         unix_state_lock(other);
2149
2150         if (sock_flag(other, SOCK_DEAD) ||
2151             (other->sk_shutdown & RCV_SHUTDOWN)) {
2152                 unix_state_unlock(other);
2153                 kfree_skb(skb);
2154                 return -EPIPE;
2155         }
2156
2157         maybe_add_creds(skb, sock, other);
2158         skb_get(skb);
2159
2160         if (ousk->oob_skb)
2161                 consume_skb(ousk->oob_skb);
2162
2163         WRITE_ONCE(ousk->oob_skb, skb);
2164
2165         scm_stat_add(other, skb);
2166         skb_queue_tail(&other->sk_receive_queue, skb);
2167         sk_send_sigurg(other);
2168         unix_state_unlock(other);
2169         other->sk_data_ready(other);
2170
2171         return err;
2172 }
2173 #endif
2174
2175 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2176                                size_t len)
2177 {
2178         struct sock *sk = sock->sk;
2179         struct sock *other = NULL;
2180         int err, size;
2181         struct sk_buff *skb;
2182         int sent = 0;
2183         struct scm_cookie scm;
2184         bool fds_sent = false;
2185         int data_len;
2186
2187         wait_for_unix_gc();
2188         err = scm_send(sock, msg, &scm, false);
2189         if (err < 0)
2190                 return err;
2191
2192         err = -EOPNOTSUPP;
2193         if (msg->msg_flags & MSG_OOB) {
2194 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2195                 if (len)
2196                         len--;
2197                 else
2198 #endif
2199                         goto out_err;
2200         }
2201
2202         if (msg->msg_namelen) {
2203                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2204                 goto out_err;
2205         } else {
2206                 err = -ENOTCONN;
2207                 other = unix_peer(sk);
2208                 if (!other)
2209                         goto out_err;
2210         }
2211
2212         if (sk->sk_shutdown & SEND_SHUTDOWN)
2213                 goto pipe_err;
2214
2215         while (sent < len) {
2216                 size = len - sent;
2217
2218                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2219                         skb = sock_alloc_send_pskb(sk, 0, 0,
2220                                                    msg->msg_flags & MSG_DONTWAIT,
2221                                                    &err, 0);
2222                 } else {
2223                         /* Keep two messages in the pipe so it schedules better */
2224                         size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2225
2226                         /* allow fallback to order-0 allocations */
2227                         size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2228
2229                         data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2230
2231                         data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2232
2233                         skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2234                                                    msg->msg_flags & MSG_DONTWAIT, &err,
2235                                                    get_order(UNIX_SKB_FRAGS_SZ));
2236                 }
2237                 if (!skb)
2238                         goto out_err;
2239
2240                 /* Only send the fds in the first buffer */
2241                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2242                 if (err < 0) {
2243                         kfree_skb(skb);
2244                         goto out_err;
2245                 }
2246                 fds_sent = true;
2247
2248                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2249                         err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2250                                                    sk->sk_allocation);
2251                         if (err < 0) {
2252                                 kfree_skb(skb);
2253                                 goto out_err;
2254                         }
2255                         size = err;
2256                         refcount_add(size, &sk->sk_wmem_alloc);
2257                 } else {
2258                         skb_put(skb, size - data_len);
2259                         skb->data_len = data_len;
2260                         skb->len = size;
2261                         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2262                         if (err) {
2263                                 kfree_skb(skb);
2264                                 goto out_err;
2265                         }
2266                 }
2267
2268                 unix_state_lock(other);
2269
2270                 if (sock_flag(other, SOCK_DEAD) ||
2271                     (other->sk_shutdown & RCV_SHUTDOWN))
2272                         goto pipe_err_free;
2273
2274                 maybe_add_creds(skb, sock, other);
2275                 scm_stat_add(other, skb);
2276                 skb_queue_tail(&other->sk_receive_queue, skb);
2277                 unix_state_unlock(other);
2278                 other->sk_data_ready(other);
2279                 sent += size;
2280         }
2281
2282 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2283         if (msg->msg_flags & MSG_OOB) {
2284                 err = queue_oob(sock, msg, other, &scm, fds_sent);
2285                 if (err)
2286                         goto out_err;
2287                 sent++;
2288         }
2289 #endif
2290
2291         scm_destroy(&scm);
2292
2293         return sent;
2294
2295 pipe_err_free:
2296         unix_state_unlock(other);
2297         kfree_skb(skb);
2298 pipe_err:
2299         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2300                 send_sig(SIGPIPE, current, 0);
2301         err = -EPIPE;
2302 out_err:
2303         scm_destroy(&scm);
2304         return sent ? : err;
2305 }
2306
2307 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2308                                   size_t len)
2309 {
2310         int err;
2311         struct sock *sk = sock->sk;
2312
2313         err = sock_error(sk);
2314         if (err)
2315                 return err;
2316
2317         if (sk->sk_state != TCP_ESTABLISHED)
2318                 return -ENOTCONN;
2319
2320         if (msg->msg_namelen)
2321                 msg->msg_namelen = 0;
2322
2323         return unix_dgram_sendmsg(sock, msg, len);
2324 }
2325
2326 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2327                                   size_t size, int flags)
2328 {
2329         struct sock *sk = sock->sk;
2330
2331         if (sk->sk_state != TCP_ESTABLISHED)
2332                 return -ENOTCONN;
2333
2334         return unix_dgram_recvmsg(sock, msg, size, flags);
2335 }
2336
2337 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2338 {
2339         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2340
2341         if (addr) {
2342                 msg->msg_namelen = addr->len;
2343                 memcpy(msg->msg_name, addr->name, addr->len);
2344         }
2345 }
2346
2347 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2348                          int flags)
2349 {
2350         struct scm_cookie scm;
2351         struct socket *sock = sk->sk_socket;
2352         struct unix_sock *u = unix_sk(sk);
2353         struct sk_buff *skb, *last;
2354         long timeo;
2355         int skip;
2356         int err;
2357
2358         err = -EOPNOTSUPP;
2359         if (flags&MSG_OOB)
2360                 goto out;
2361
2362         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2363
2364         do {
2365                 mutex_lock(&u->iolock);
2366
2367                 skip = sk_peek_offset(sk, flags);
2368                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2369                                               &skip, &err, &last);
2370                 if (skb) {
2371                         if (!(flags & MSG_PEEK))
2372                                 scm_stat_del(sk, skb);
2373                         break;
2374                 }
2375
2376                 mutex_unlock(&u->iolock);
2377
2378                 if (err != -EAGAIN)
2379                         break;
2380         } while (timeo &&
2381                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2382                                               &err, &timeo, last));
2383
2384         if (!skb) { /* implies iolock unlocked */
2385                 unix_state_lock(sk);
2386                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2387                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2388                     (sk->sk_shutdown & RCV_SHUTDOWN))
2389                         err = 0;
2390                 unix_state_unlock(sk);
2391                 goto out;
2392         }
2393
2394         if (wq_has_sleeper(&u->peer_wait))
2395                 wake_up_interruptible_sync_poll(&u->peer_wait,
2396                                                 EPOLLOUT | EPOLLWRNORM |
2397                                                 EPOLLWRBAND);
2398
2399         if (msg->msg_name) {
2400                 unix_copy_addr(msg, skb->sk);
2401
2402                 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2403                                                       msg->msg_name,
2404                                                       &msg->msg_namelen);
2405         }
2406
2407         if (size > skb->len - skip)
2408                 size = skb->len - skip;
2409         else if (size < skb->len - skip)
2410                 msg->msg_flags |= MSG_TRUNC;
2411
2412         err = skb_copy_datagram_msg(skb, skip, msg, size);
2413         if (err)
2414                 goto out_free;
2415
2416         if (sock_flag(sk, SOCK_RCVTSTAMP))
2417                 __sock_recv_timestamp(msg, sk, skb);
2418
2419         memset(&scm, 0, sizeof(scm));
2420
2421         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2422         unix_set_secdata(&scm, skb);
2423
2424         if (!(flags & MSG_PEEK)) {
2425                 if (UNIXCB(skb).fp)
2426                         unix_detach_fds(&scm, skb);
2427
2428                 sk_peek_offset_bwd(sk, skb->len);
2429         } else {
2430                 /* It is questionable: on PEEK we could:
2431                    - do not return fds - good, but too simple 8)
2432                    - return fds, and do not return them on read (old strategy,
2433                      apparently wrong)
2434                    - clone fds (I chose it for now, it is the most universal
2435                      solution)
2436
2437                    POSIX 1003.1g does not actually define this clearly
2438                    at all. POSIX 1003.1g doesn't define a lot of things
2439                    clearly however!
2440
2441                 */
2442
2443                 sk_peek_offset_fwd(sk, size);
2444
2445                 if (UNIXCB(skb).fp)
2446                         unix_peek_fds(&scm, skb);
2447         }
2448         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2449
2450         scm_recv_unix(sock, msg, &scm, flags);
2451
2452 out_free:
2453         skb_free_datagram(sk, skb);
2454         mutex_unlock(&u->iolock);
2455 out:
2456         return err;
2457 }
2458
2459 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2460                               int flags)
2461 {
2462         struct sock *sk = sock->sk;
2463
2464 #ifdef CONFIG_BPF_SYSCALL
2465         const struct proto *prot = READ_ONCE(sk->sk_prot);
2466
2467         if (prot != &unix_dgram_proto)
2468                 return prot->recvmsg(sk, msg, size, flags, NULL);
2469 #endif
2470         return __unix_dgram_recvmsg(sk, msg, size, flags);
2471 }
2472
2473 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2474 {
2475         struct unix_sock *u = unix_sk(sk);
2476         struct sk_buff *skb;
2477         int err;
2478
2479         mutex_lock(&u->iolock);
2480         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2481         mutex_unlock(&u->iolock);
2482         if (!skb)
2483                 return err;
2484
2485         return recv_actor(sk, skb);
2486 }
2487
2488 /*
2489  *      Sleep until more data has arrived. But check for races..
2490  */
2491 static long unix_stream_data_wait(struct sock *sk, long timeo,
2492                                   struct sk_buff *last, unsigned int last_len,
2493                                   bool freezable)
2494 {
2495         unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2496         struct sk_buff *tail;
2497         DEFINE_WAIT(wait);
2498
2499         unix_state_lock(sk);
2500
2501         for (;;) {
2502                 prepare_to_wait(sk_sleep(sk), &wait, state);
2503
2504                 tail = skb_peek_tail(&sk->sk_receive_queue);
2505                 if (tail != last ||
2506                     (tail && tail->len != last_len) ||
2507                     sk->sk_err ||
2508                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2509                     signal_pending(current) ||
2510                     !timeo)
2511                         break;
2512
2513                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2514                 unix_state_unlock(sk);
2515                 timeo = schedule_timeout(timeo);
2516                 unix_state_lock(sk);
2517
2518                 if (sock_flag(sk, SOCK_DEAD))
2519                         break;
2520
2521                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2522         }
2523
2524         finish_wait(sk_sleep(sk), &wait);
2525         unix_state_unlock(sk);
2526         return timeo;
2527 }
2528
2529 static unsigned int unix_skb_len(const struct sk_buff *skb)
2530 {
2531         return skb->len - UNIXCB(skb).consumed;
2532 }
2533
2534 struct unix_stream_read_state {
2535         int (*recv_actor)(struct sk_buff *, int, int,
2536                           struct unix_stream_read_state *);
2537         struct socket *socket;
2538         struct msghdr *msg;
2539         struct pipe_inode_info *pipe;
2540         size_t size;
2541         int flags;
2542         unsigned int splice_flags;
2543 };
2544
2545 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2546 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2547 {
2548         struct socket *sock = state->socket;
2549         struct sock *sk = sock->sk;
2550         struct unix_sock *u = unix_sk(sk);
2551         int chunk = 1;
2552         struct sk_buff *oob_skb;
2553
2554         mutex_lock(&u->iolock);
2555         unix_state_lock(sk);
2556
2557         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2558                 unix_state_unlock(sk);
2559                 mutex_unlock(&u->iolock);
2560                 return -EINVAL;
2561         }
2562
2563         oob_skb = u->oob_skb;
2564
2565         if (!(state->flags & MSG_PEEK))
2566                 WRITE_ONCE(u->oob_skb, NULL);
2567         else
2568                 skb_get(oob_skb);
2569         unix_state_unlock(sk);
2570
2571         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2572
2573         if (!(state->flags & MSG_PEEK))
2574                 UNIXCB(oob_skb).consumed += 1;
2575
2576         consume_skb(oob_skb);
2577
2578         mutex_unlock(&u->iolock);
2579
2580         if (chunk < 0)
2581                 return -EFAULT;
2582
2583         state->msg->msg_flags |= MSG_OOB;
2584         return 1;
2585 }
2586
2587 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2588                                   int flags, int copied)
2589 {
2590         struct unix_sock *u = unix_sk(sk);
2591
2592         if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2593                 skb_unlink(skb, &sk->sk_receive_queue);
2594                 consume_skb(skb);
2595                 skb = NULL;
2596         } else {
2597                 if (skb == u->oob_skb) {
2598                         if (copied) {
2599                                 skb = NULL;
2600                         } else if (sock_flag(sk, SOCK_URGINLINE)) {
2601                                 if (!(flags & MSG_PEEK)) {
2602                                         WRITE_ONCE(u->oob_skb, NULL);
2603                                         consume_skb(skb);
2604                                 }
2605                         } else if (flags & MSG_PEEK) {
2606                                 skb = NULL;
2607                         } else {
2608                                 skb_unlink(skb, &sk->sk_receive_queue);
2609                                 WRITE_ONCE(u->oob_skb, NULL);
2610                                 if (!WARN_ON_ONCE(skb_unref(skb)))
2611                                         kfree_skb(skb);
2612                                 skb = skb_peek(&sk->sk_receive_queue);
2613                         }
2614                 }
2615         }
2616         return skb;
2617 }
2618 #endif
2619
2620 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2621 {
2622         if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2623                 return -ENOTCONN;
2624
2625         return unix_read_skb(sk, recv_actor);
2626 }
2627
2628 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2629                                     bool freezable)
2630 {
2631         struct scm_cookie scm;
2632         struct socket *sock = state->socket;
2633         struct sock *sk = sock->sk;
2634         struct unix_sock *u = unix_sk(sk);
2635         int copied = 0;
2636         int flags = state->flags;
2637         int noblock = flags & MSG_DONTWAIT;
2638         bool check_creds = false;
2639         int target;
2640         int err = 0;
2641         long timeo;
2642         int skip;
2643         size_t size = state->size;
2644         unsigned int last_len;
2645
2646         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2647                 err = -EINVAL;
2648                 goto out;
2649         }
2650
2651         if (unlikely(flags & MSG_OOB)) {
2652                 err = -EOPNOTSUPP;
2653 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2654                 err = unix_stream_recv_urg(state);
2655 #endif
2656                 goto out;
2657         }
2658
2659         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2660         timeo = sock_rcvtimeo(sk, noblock);
2661
2662         memset(&scm, 0, sizeof(scm));
2663
2664         /* Lock the socket to prevent queue disordering
2665          * while sleeps in memcpy_tomsg
2666          */
2667         mutex_lock(&u->iolock);
2668
2669         skip = max(sk_peek_offset(sk, flags), 0);
2670
2671         do {
2672                 int chunk;
2673                 bool drop_skb;
2674                 struct sk_buff *skb, *last;
2675
2676 redo:
2677                 unix_state_lock(sk);
2678                 if (sock_flag(sk, SOCK_DEAD)) {
2679                         err = -ECONNRESET;
2680                         goto unlock;
2681                 }
2682                 last = skb = skb_peek(&sk->sk_receive_queue);
2683                 last_len = last ? last->len : 0;
2684
2685 again:
2686 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2687                 if (skb) {
2688                         skb = manage_oob(skb, sk, flags, copied);
2689                         if (!skb && copied) {
2690                                 unix_state_unlock(sk);
2691                                 break;
2692                         }
2693                 }
2694 #endif
2695                 if (skb == NULL) {
2696                         if (copied >= target)
2697                                 goto unlock;
2698
2699                         /*
2700                          *      POSIX 1003.1g mandates this order.
2701                          */
2702
2703                         err = sock_error(sk);
2704                         if (err)
2705                                 goto unlock;
2706                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2707                                 goto unlock;
2708
2709                         unix_state_unlock(sk);
2710                         if (!timeo) {
2711                                 err = -EAGAIN;
2712                                 break;
2713                         }
2714
2715                         mutex_unlock(&u->iolock);
2716
2717                         timeo = unix_stream_data_wait(sk, timeo, last,
2718                                                       last_len, freezable);
2719
2720                         if (signal_pending(current)) {
2721                                 err = sock_intr_errno(timeo);
2722                                 scm_destroy(&scm);
2723                                 goto out;
2724                         }
2725
2726                         mutex_lock(&u->iolock);
2727                         goto redo;
2728 unlock:
2729                         unix_state_unlock(sk);
2730                         break;
2731                 }
2732
2733                 while (skip >= unix_skb_len(skb)) {
2734                         skip -= unix_skb_len(skb);
2735                         last = skb;
2736                         last_len = skb->len;
2737                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2738                         if (!skb)
2739                                 goto again;
2740                 }
2741
2742                 unix_state_unlock(sk);
2743
2744                 if (check_creds) {
2745                         /* Never glue messages from different writers */
2746                         if (!unix_skb_scm_eq(skb, &scm))
2747                                 break;
2748                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2749                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2750                         /* Copy credentials */
2751                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2752                         unix_set_secdata(&scm, skb);
2753                         check_creds = true;
2754                 }
2755
2756                 /* Copy address just once */
2757                 if (state->msg && state->msg->msg_name) {
2758                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2759                                          state->msg->msg_name);
2760                         unix_copy_addr(state->msg, skb->sk);
2761
2762                         BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2763                                                               state->msg->msg_name,
2764                                                               &state->msg->msg_namelen);
2765
2766                         sunaddr = NULL;
2767                 }
2768
2769                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2770                 skb_get(skb);
2771                 chunk = state->recv_actor(skb, skip, chunk, state);
2772                 drop_skb = !unix_skb_len(skb);
2773                 /* skb is only safe to use if !drop_skb */
2774                 consume_skb(skb);
2775                 if (chunk < 0) {
2776                         if (copied == 0)
2777                                 copied = -EFAULT;
2778                         break;
2779                 }
2780                 copied += chunk;
2781                 size -= chunk;
2782
2783                 if (drop_skb) {
2784                         /* the skb was touched by a concurrent reader;
2785                          * we should not expect anything from this skb
2786                          * anymore and assume it invalid - we can be
2787                          * sure it was dropped from the socket queue
2788                          *
2789                          * let's report a short read
2790                          */
2791                         err = 0;
2792                         break;
2793                 }
2794
2795                 /* Mark read part of skb as used */
2796                 if (!(flags & MSG_PEEK)) {
2797                         UNIXCB(skb).consumed += chunk;
2798
2799                         sk_peek_offset_bwd(sk, chunk);
2800
2801                         if (UNIXCB(skb).fp) {
2802                                 scm_stat_del(sk, skb);
2803                                 unix_detach_fds(&scm, skb);
2804                         }
2805
2806                         if (unix_skb_len(skb))
2807                                 break;
2808
2809                         skb_unlink(skb, &sk->sk_receive_queue);
2810                         consume_skb(skb);
2811
2812                         if (scm.fp)
2813                                 break;
2814                 } else {
2815                         /* It is questionable, see note in unix_dgram_recvmsg.
2816                          */
2817                         if (UNIXCB(skb).fp)
2818                                 unix_peek_fds(&scm, skb);
2819
2820                         sk_peek_offset_fwd(sk, chunk);
2821
2822                         if (UNIXCB(skb).fp)
2823                                 break;
2824
2825                         skip = 0;
2826                         last = skb;
2827                         last_len = skb->len;
2828                         unix_state_lock(sk);
2829                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2830                         if (skb)
2831                                 goto again;
2832                         unix_state_unlock(sk);
2833                         break;
2834                 }
2835         } while (size);
2836
2837         mutex_unlock(&u->iolock);
2838         if (state->msg)
2839                 scm_recv_unix(sock, state->msg, &scm, flags);
2840         else
2841                 scm_destroy(&scm);
2842 out:
2843         return copied ? : err;
2844 }
2845
2846 static int unix_stream_read_actor(struct sk_buff *skb,
2847                                   int skip, int chunk,
2848                                   struct unix_stream_read_state *state)
2849 {
2850         int ret;
2851
2852         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2853                                     state->msg, chunk);
2854         return ret ?: chunk;
2855 }
2856
2857 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2858                           size_t size, int flags)
2859 {
2860         struct unix_stream_read_state state = {
2861                 .recv_actor = unix_stream_read_actor,
2862                 .socket = sk->sk_socket,
2863                 .msg = msg,
2864                 .size = size,
2865                 .flags = flags
2866         };
2867
2868         return unix_stream_read_generic(&state, true);
2869 }
2870
2871 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2872                                size_t size, int flags)
2873 {
2874         struct unix_stream_read_state state = {
2875                 .recv_actor = unix_stream_read_actor,
2876                 .socket = sock,
2877                 .msg = msg,
2878                 .size = size,
2879                 .flags = flags
2880         };
2881
2882 #ifdef CONFIG_BPF_SYSCALL
2883         struct sock *sk = sock->sk;
2884         const struct proto *prot = READ_ONCE(sk->sk_prot);
2885
2886         if (prot != &unix_stream_proto)
2887                 return prot->recvmsg(sk, msg, size, flags, NULL);
2888 #endif
2889         return unix_stream_read_generic(&state, true);
2890 }
2891
2892 static int unix_stream_splice_actor(struct sk_buff *skb,
2893                                     int skip, int chunk,
2894                                     struct unix_stream_read_state *state)
2895 {
2896         return skb_splice_bits(skb, state->socket->sk,
2897                                UNIXCB(skb).consumed + skip,
2898                                state->pipe, chunk, state->splice_flags);
2899 }
2900
2901 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2902                                        struct pipe_inode_info *pipe,
2903                                        size_t size, unsigned int flags)
2904 {
2905         struct unix_stream_read_state state = {
2906                 .recv_actor = unix_stream_splice_actor,
2907                 .socket = sock,
2908                 .pipe = pipe,
2909                 .size = size,
2910                 .splice_flags = flags,
2911         };
2912
2913         if (unlikely(*ppos))
2914                 return -ESPIPE;
2915
2916         if (sock->file->f_flags & O_NONBLOCK ||
2917             flags & SPLICE_F_NONBLOCK)
2918                 state.flags = MSG_DONTWAIT;
2919
2920         return unix_stream_read_generic(&state, false);
2921 }
2922
2923 static int unix_shutdown(struct socket *sock, int mode)
2924 {
2925         struct sock *sk = sock->sk;
2926         struct sock *other;
2927
2928         if (mode < SHUT_RD || mode > SHUT_RDWR)
2929                 return -EINVAL;
2930         /* This maps:
2931          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2932          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2933          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2934          */
2935         ++mode;
2936
2937         unix_state_lock(sk);
2938         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2939         other = unix_peer(sk);
2940         if (other)
2941                 sock_hold(other);
2942         unix_state_unlock(sk);
2943         sk->sk_state_change(sk);
2944
2945         if (other &&
2946                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2947
2948                 int peer_mode = 0;
2949                 const struct proto *prot = READ_ONCE(other->sk_prot);
2950
2951                 if (prot->unhash)
2952                         prot->unhash(other);
2953                 if (mode&RCV_SHUTDOWN)
2954                         peer_mode |= SEND_SHUTDOWN;
2955                 if (mode&SEND_SHUTDOWN)
2956                         peer_mode |= RCV_SHUTDOWN;
2957                 unix_state_lock(other);
2958                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2959                 unix_state_unlock(other);
2960                 other->sk_state_change(other);
2961                 if (peer_mode == SHUTDOWN_MASK)
2962                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2963                 else if (peer_mode & RCV_SHUTDOWN)
2964                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2965         }
2966         if (other)
2967                 sock_put(other);
2968
2969         return 0;
2970 }
2971
2972 long unix_inq_len(struct sock *sk)
2973 {
2974         struct sk_buff *skb;
2975         long amount = 0;
2976
2977         if (sk->sk_state == TCP_LISTEN)
2978                 return -EINVAL;
2979
2980         spin_lock(&sk->sk_receive_queue.lock);
2981         if (sk->sk_type == SOCK_STREAM ||
2982             sk->sk_type == SOCK_SEQPACKET) {
2983                 skb_queue_walk(&sk->sk_receive_queue, skb)
2984                         amount += unix_skb_len(skb);
2985         } else {
2986                 skb = skb_peek(&sk->sk_receive_queue);
2987                 if (skb)
2988                         amount = skb->len;
2989         }
2990         spin_unlock(&sk->sk_receive_queue.lock);
2991
2992         return amount;
2993 }
2994 EXPORT_SYMBOL_GPL(unix_inq_len);
2995
2996 long unix_outq_len(struct sock *sk)
2997 {
2998         return sk_wmem_alloc_get(sk);
2999 }
3000 EXPORT_SYMBOL_GPL(unix_outq_len);
3001
3002 static int unix_open_file(struct sock *sk)
3003 {
3004         struct path path;
3005         struct file *f;
3006         int fd;
3007
3008         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3009                 return -EPERM;
3010
3011         if (!smp_load_acquire(&unix_sk(sk)->addr))
3012                 return -ENOENT;
3013
3014         path = unix_sk(sk)->path;
3015         if (!path.dentry)
3016                 return -ENOENT;
3017
3018         path_get(&path);
3019
3020         fd = get_unused_fd_flags(O_CLOEXEC);
3021         if (fd < 0)
3022                 goto out;
3023
3024         f = dentry_open(&path, O_PATH, current_cred());
3025         if (IS_ERR(f)) {
3026                 put_unused_fd(fd);
3027                 fd = PTR_ERR(f);
3028                 goto out;
3029         }
3030
3031         fd_install(fd, f);
3032 out:
3033         path_put(&path);
3034
3035         return fd;
3036 }
3037
3038 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3039 {
3040         struct sock *sk = sock->sk;
3041         long amount = 0;
3042         int err;
3043
3044         switch (cmd) {
3045         case SIOCOUTQ:
3046                 amount = unix_outq_len(sk);
3047                 err = put_user(amount, (int __user *)arg);
3048                 break;
3049         case SIOCINQ:
3050                 amount = unix_inq_len(sk);
3051                 if (amount < 0)
3052                         err = amount;
3053                 else
3054                         err = put_user(amount, (int __user *)arg);
3055                 break;
3056         case SIOCUNIXFILE:
3057                 err = unix_open_file(sk);
3058                 break;
3059 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3060         case SIOCATMARK:
3061                 {
3062                         struct sk_buff *skb;
3063                         int answ = 0;
3064
3065                         skb = skb_peek(&sk->sk_receive_queue);
3066                         if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3067                                 answ = 1;
3068                         err = put_user(answ, (int __user *)arg);
3069                 }
3070                 break;
3071 #endif
3072         default:
3073                 err = -ENOIOCTLCMD;
3074                 break;
3075         }
3076         return err;
3077 }
3078
3079 #ifdef CONFIG_COMPAT
3080 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3081 {
3082         return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3083 }
3084 #endif
3085
3086 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3087 {
3088         struct sock *sk = sock->sk;
3089         __poll_t mask;
3090         u8 shutdown;
3091
3092         sock_poll_wait(file, sock, wait);
3093         mask = 0;
3094         shutdown = READ_ONCE(sk->sk_shutdown);
3095
3096         /* exceptional events? */
3097         if (READ_ONCE(sk->sk_err))
3098                 mask |= EPOLLERR;
3099         if (shutdown == SHUTDOWN_MASK)
3100                 mask |= EPOLLHUP;
3101         if (shutdown & RCV_SHUTDOWN)
3102                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3103
3104         /* readable? */
3105         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3106                 mask |= EPOLLIN | EPOLLRDNORM;
3107         if (sk_is_readable(sk))
3108                 mask |= EPOLLIN | EPOLLRDNORM;
3109 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3110         if (READ_ONCE(unix_sk(sk)->oob_skb))
3111                 mask |= EPOLLPRI;
3112 #endif
3113
3114         /* Connection-based need to check for termination and startup */
3115         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3116             sk->sk_state == TCP_CLOSE)
3117                 mask |= EPOLLHUP;
3118
3119         /*
3120          * we set writable also when the other side has shut down the
3121          * connection. This prevents stuck sockets.
3122          */
3123         if (unix_writable(sk))
3124                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3125
3126         return mask;
3127 }
3128
3129 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3130                                     poll_table *wait)
3131 {
3132         struct sock *sk = sock->sk, *other;
3133         unsigned int writable;
3134         __poll_t mask;
3135         u8 shutdown;
3136
3137         sock_poll_wait(file, sock, wait);
3138         mask = 0;
3139         shutdown = READ_ONCE(sk->sk_shutdown);
3140
3141         /* exceptional events? */
3142         if (READ_ONCE(sk->sk_err) ||
3143             !skb_queue_empty_lockless(&sk->sk_error_queue))
3144                 mask |= EPOLLERR |
3145                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3146
3147         if (shutdown & RCV_SHUTDOWN)
3148                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3149         if (shutdown == SHUTDOWN_MASK)
3150                 mask |= EPOLLHUP;
3151
3152         /* readable? */
3153         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3154                 mask |= EPOLLIN | EPOLLRDNORM;
3155         if (sk_is_readable(sk))
3156                 mask |= EPOLLIN | EPOLLRDNORM;
3157
3158         /* Connection-based need to check for termination and startup */
3159         if (sk->sk_type == SOCK_SEQPACKET) {
3160                 if (sk->sk_state == TCP_CLOSE)
3161                         mask |= EPOLLHUP;
3162                 /* connection hasn't started yet? */
3163                 if (sk->sk_state == TCP_SYN_SENT)
3164                         return mask;
3165         }
3166
3167         /* No write status requested, avoid expensive OUT tests. */
3168         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3169                 return mask;
3170
3171         writable = unix_writable(sk);
3172         if (writable) {
3173                 unix_state_lock(sk);
3174
3175                 other = unix_peer(sk);
3176                 if (other && unix_peer(other) != sk &&
3177                     unix_recvq_full_lockless(other) &&
3178                     unix_dgram_peer_wake_me(sk, other))
3179                         writable = 0;
3180
3181                 unix_state_unlock(sk);
3182         }
3183
3184         if (writable)
3185                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3186         else
3187                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3188
3189         return mask;
3190 }
3191
3192 #ifdef CONFIG_PROC_FS
3193
3194 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3195
3196 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3197 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3198 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3199
3200 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3201 {
3202         unsigned long offset = get_offset(*pos);
3203         unsigned long bucket = get_bucket(*pos);
3204         unsigned long count = 0;
3205         struct sock *sk;
3206
3207         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3208              sk; sk = sk_next(sk)) {
3209                 if (++count == offset)
3210                         break;
3211         }
3212
3213         return sk;
3214 }
3215
3216 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3217 {
3218         unsigned long bucket = get_bucket(*pos);
3219         struct net *net = seq_file_net(seq);
3220         struct sock *sk;
3221
3222         while (bucket < UNIX_HASH_SIZE) {
3223                 spin_lock(&net->unx.table.locks[bucket]);
3224
3225                 sk = unix_from_bucket(seq, pos);
3226                 if (sk)
3227                         return sk;
3228
3229                 spin_unlock(&net->unx.table.locks[bucket]);
3230
3231                 *pos = set_bucket_offset(++bucket, 1);
3232         }
3233
3234         return NULL;
3235 }
3236
3237 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3238                                   loff_t *pos)
3239 {
3240         unsigned long bucket = get_bucket(*pos);
3241
3242         sk = sk_next(sk);
3243         if (sk)
3244                 return sk;
3245
3246
3247         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3248
3249         *pos = set_bucket_offset(++bucket, 1);
3250
3251         return unix_get_first(seq, pos);
3252 }
3253
3254 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3255 {
3256         if (!*pos)
3257                 return SEQ_START_TOKEN;
3258
3259         return unix_get_first(seq, pos);
3260 }
3261
3262 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3263 {
3264         ++*pos;
3265
3266         if (v == SEQ_START_TOKEN)
3267                 return unix_get_first(seq, pos);
3268
3269         return unix_get_next(seq, v, pos);
3270 }
3271
3272 static void unix_seq_stop(struct seq_file *seq, void *v)
3273 {
3274         struct sock *sk = v;
3275
3276         if (sk)
3277                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3278 }
3279
3280 static int unix_seq_show(struct seq_file *seq, void *v)
3281 {
3282
3283         if (v == SEQ_START_TOKEN)
3284                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3285                          "Inode Path\n");
3286         else {
3287                 struct sock *s = v;
3288                 struct unix_sock *u = unix_sk(s);
3289                 unix_state_lock(s);
3290
3291                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3292                         s,
3293                         refcount_read(&s->sk_refcnt),
3294                         0,
3295                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3296                         s->sk_type,
3297                         s->sk_socket ?
3298                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3299                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3300                         sock_i_ino(s));
3301
3302                 if (u->addr) {  // under a hash table lock here
3303                         int i, len;
3304                         seq_putc(seq, ' ');
3305
3306                         i = 0;
3307                         len = u->addr->len -
3308                                 offsetof(struct sockaddr_un, sun_path);
3309                         if (u->addr->name->sun_path[0]) {
3310                                 len--;
3311                         } else {
3312                                 seq_putc(seq, '@');
3313                                 i++;
3314                         }
3315                         for ( ; i < len; i++)
3316                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3317                                          '@');
3318                 }
3319                 unix_state_unlock(s);
3320                 seq_putc(seq, '\n');
3321         }
3322
3323         return 0;
3324 }
3325
3326 static const struct seq_operations unix_seq_ops = {
3327         .start  = unix_seq_start,
3328         .next   = unix_seq_next,
3329         .stop   = unix_seq_stop,
3330         .show   = unix_seq_show,
3331 };
3332
3333 #ifdef CONFIG_BPF_SYSCALL
3334 struct bpf_unix_iter_state {
3335         struct seq_net_private p;
3336         unsigned int cur_sk;
3337         unsigned int end_sk;
3338         unsigned int max_sk;
3339         struct sock **batch;
3340         bool st_bucket_done;
3341 };
3342
3343 struct bpf_iter__unix {
3344         __bpf_md_ptr(struct bpf_iter_meta *, meta);
3345         __bpf_md_ptr(struct unix_sock *, unix_sk);
3346         uid_t uid __aligned(8);
3347 };
3348
3349 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3350                               struct unix_sock *unix_sk, uid_t uid)
3351 {
3352         struct bpf_iter__unix ctx;
3353
3354         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3355         ctx.meta = meta;
3356         ctx.unix_sk = unix_sk;
3357         ctx.uid = uid;
3358         return bpf_iter_run_prog(prog, &ctx);
3359 }
3360
3361 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3362
3363 {
3364         struct bpf_unix_iter_state *iter = seq->private;
3365         unsigned int expected = 1;
3366         struct sock *sk;
3367
3368         sock_hold(start_sk);
3369         iter->batch[iter->end_sk++] = start_sk;
3370
3371         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3372                 if (iter->end_sk < iter->max_sk) {
3373                         sock_hold(sk);
3374                         iter->batch[iter->end_sk++] = sk;
3375                 }
3376
3377                 expected++;
3378         }
3379
3380         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3381
3382         return expected;
3383 }
3384
3385 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3386 {
3387         while (iter->cur_sk < iter->end_sk)
3388                 sock_put(iter->batch[iter->cur_sk++]);
3389 }
3390
3391 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3392                                        unsigned int new_batch_sz)
3393 {
3394         struct sock **new_batch;
3395
3396         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3397                              GFP_USER | __GFP_NOWARN);
3398         if (!new_batch)
3399                 return -ENOMEM;
3400
3401         bpf_iter_unix_put_batch(iter);
3402         kvfree(iter->batch);
3403         iter->batch = new_batch;
3404         iter->max_sk = new_batch_sz;
3405
3406         return 0;
3407 }
3408
3409 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3410                                         loff_t *pos)
3411 {
3412         struct bpf_unix_iter_state *iter = seq->private;
3413         unsigned int expected;
3414         bool resized = false;
3415         struct sock *sk;
3416
3417         if (iter->st_bucket_done)
3418                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3419
3420 again:
3421         /* Get a new batch */
3422         iter->cur_sk = 0;
3423         iter->end_sk = 0;
3424
3425         sk = unix_get_first(seq, pos);
3426         if (!sk)
3427                 return NULL; /* Done */
3428
3429         expected = bpf_iter_unix_hold_batch(seq, sk);
3430
3431         if (iter->end_sk == expected) {
3432                 iter->st_bucket_done = true;
3433                 return sk;
3434         }
3435
3436         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3437                 resized = true;
3438                 goto again;
3439         }
3440
3441         return sk;
3442 }
3443
3444 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3445 {
3446         if (!*pos)
3447                 return SEQ_START_TOKEN;
3448
3449         /* bpf iter does not support lseek, so it always
3450          * continue from where it was stop()-ped.
3451          */
3452         return bpf_iter_unix_batch(seq, pos);
3453 }
3454
3455 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3456 {
3457         struct bpf_unix_iter_state *iter = seq->private;
3458         struct sock *sk;
3459
3460         /* Whenever seq_next() is called, the iter->cur_sk is
3461          * done with seq_show(), so advance to the next sk in
3462          * the batch.
3463          */
3464         if (iter->cur_sk < iter->end_sk)
3465                 sock_put(iter->batch[iter->cur_sk++]);
3466
3467         ++*pos;
3468
3469         if (iter->cur_sk < iter->end_sk)
3470                 sk = iter->batch[iter->cur_sk];
3471         else
3472                 sk = bpf_iter_unix_batch(seq, pos);
3473
3474         return sk;
3475 }
3476
3477 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3478 {
3479         struct bpf_iter_meta meta;
3480         struct bpf_prog *prog;
3481         struct sock *sk = v;
3482         uid_t uid;
3483         bool slow;
3484         int ret;
3485
3486         if (v == SEQ_START_TOKEN)
3487                 return 0;
3488
3489         slow = lock_sock_fast(sk);
3490
3491         if (unlikely(sk_unhashed(sk))) {
3492                 ret = SEQ_SKIP;
3493                 goto unlock;
3494         }
3495
3496         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3497         meta.seq = seq;
3498         prog = bpf_iter_get_info(&meta, false);
3499         ret = unix_prog_seq_show(prog, &meta, v, uid);
3500 unlock:
3501         unlock_sock_fast(sk, slow);
3502         return ret;
3503 }
3504
3505 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3506 {
3507         struct bpf_unix_iter_state *iter = seq->private;
3508         struct bpf_iter_meta meta;
3509         struct bpf_prog *prog;
3510
3511         if (!v) {
3512                 meta.seq = seq;
3513                 prog = bpf_iter_get_info(&meta, true);
3514                 if (prog)
3515                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3516         }
3517
3518         if (iter->cur_sk < iter->end_sk)
3519                 bpf_iter_unix_put_batch(iter);
3520 }
3521
3522 static const struct seq_operations bpf_iter_unix_seq_ops = {
3523         .start  = bpf_iter_unix_seq_start,
3524         .next   = bpf_iter_unix_seq_next,
3525         .stop   = bpf_iter_unix_seq_stop,
3526         .show   = bpf_iter_unix_seq_show,
3527 };
3528 #endif
3529 #endif
3530
3531 static const struct net_proto_family unix_family_ops = {
3532         .family = PF_UNIX,
3533         .create = unix_create,
3534         .owner  = THIS_MODULE,
3535 };
3536
3537
3538 static int __net_init unix_net_init(struct net *net)
3539 {
3540         int i;
3541
3542         net->unx.sysctl_max_dgram_qlen = 10;
3543         if (unix_sysctl_register(net))
3544                 goto out;
3545
3546 #ifdef CONFIG_PROC_FS
3547         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3548                              sizeof(struct seq_net_private)))
3549                 goto err_sysctl;
3550 #endif
3551
3552         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3553                                               sizeof(spinlock_t), GFP_KERNEL);
3554         if (!net->unx.table.locks)
3555                 goto err_proc;
3556
3557         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3558                                                 sizeof(struct hlist_head),
3559                                                 GFP_KERNEL);
3560         if (!net->unx.table.buckets)
3561                 goto free_locks;
3562
3563         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3564                 spin_lock_init(&net->unx.table.locks[i]);
3565                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3566         }
3567
3568         return 0;
3569
3570 free_locks:
3571         kvfree(net->unx.table.locks);
3572 err_proc:
3573 #ifdef CONFIG_PROC_FS
3574         remove_proc_entry("unix", net->proc_net);
3575 err_sysctl:
3576 #endif
3577         unix_sysctl_unregister(net);
3578 out:
3579         return -ENOMEM;
3580 }
3581
3582 static void __net_exit unix_net_exit(struct net *net)
3583 {
3584         kvfree(net->unx.table.buckets);
3585         kvfree(net->unx.table.locks);
3586         unix_sysctl_unregister(net);
3587         remove_proc_entry("unix", net->proc_net);
3588 }
3589
3590 static struct pernet_operations unix_net_ops = {
3591         .init = unix_net_init,
3592         .exit = unix_net_exit,
3593 };
3594
3595 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3596 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3597                      struct unix_sock *unix_sk, uid_t uid)
3598
3599 #define INIT_BATCH_SZ 16
3600
3601 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3602 {
3603         struct bpf_unix_iter_state *iter = priv_data;
3604         int err;
3605
3606         err = bpf_iter_init_seq_net(priv_data, aux);
3607         if (err)
3608                 return err;
3609
3610         err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3611         if (err) {
3612                 bpf_iter_fini_seq_net(priv_data);
3613                 return err;
3614         }
3615
3616         return 0;
3617 }
3618
3619 static void bpf_iter_fini_unix(void *priv_data)
3620 {
3621         struct bpf_unix_iter_state *iter = priv_data;
3622
3623         bpf_iter_fini_seq_net(priv_data);
3624         kvfree(iter->batch);
3625 }
3626
3627 static const struct bpf_iter_seq_info unix_seq_info = {
3628         .seq_ops                = &bpf_iter_unix_seq_ops,
3629         .init_seq_private       = bpf_iter_init_unix,
3630         .fini_seq_private       = bpf_iter_fini_unix,
3631         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3632 };
3633
3634 static const struct bpf_func_proto *
3635 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3636                              const struct bpf_prog *prog)
3637 {
3638         switch (func_id) {
3639         case BPF_FUNC_setsockopt:
3640                 return &bpf_sk_setsockopt_proto;
3641         case BPF_FUNC_getsockopt:
3642                 return &bpf_sk_getsockopt_proto;
3643         default:
3644                 return NULL;
3645         }
3646 }
3647
3648 static struct bpf_iter_reg unix_reg_info = {
3649         .target                 = "unix",
3650         .ctx_arg_info_size      = 1,
3651         .ctx_arg_info           = {
3652                 { offsetof(struct bpf_iter__unix, unix_sk),
3653                   PTR_TO_BTF_ID_OR_NULL },
3654         },
3655         .get_func_proto         = bpf_iter_unix_get_func_proto,
3656         .seq_info               = &unix_seq_info,
3657 };
3658
3659 static void __init bpf_iter_register(void)
3660 {
3661         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3662         if (bpf_iter_reg_target(&unix_reg_info))
3663                 pr_warn("Warning: could not register bpf iterator unix\n");
3664 }
3665 #endif
3666
3667 static int __init af_unix_init(void)
3668 {
3669         int i, rc = -1;
3670
3671         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3672
3673         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3674                 spin_lock_init(&bsd_socket_locks[i]);
3675                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3676         }
3677
3678         rc = proto_register(&unix_dgram_proto, 1);
3679         if (rc != 0) {
3680                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3681                 goto out;
3682         }
3683
3684         rc = proto_register(&unix_stream_proto, 1);
3685         if (rc != 0) {
3686                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3687                 proto_unregister(&unix_dgram_proto);
3688                 goto out;
3689         }
3690
3691         sock_register(&unix_family_ops);
3692         register_pernet_subsys(&unix_net_ops);
3693         unix_bpf_build_proto();
3694
3695 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3696         bpf_iter_register();
3697 #endif
3698
3699 out:
3700         return rc;
3701 }
3702
3703 /* Later than subsys_initcall() because we depend on stuff initialised there */
3704 fs_initcall(af_unix_init);