net/unix/af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
   28  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
   29  *                                      by above two patches.
   30  *           Andrea Arcangeli   :       If possible we block in connect(2)
   31  *                                      if the max backlog of the listen socket
   32  *                                      has been reached. This won't break
   33  *                                      old apps and it will avoid huge amount
   34  *                                      of socks hashed (this is for unix_gc()
   35  *                                      performance reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/termios.h>
  93 #include <linux/sockios.h>
  94 #include <linux/net.h>
  95 #include <linux/in.h>
  96 #include <linux/fs.h>
  97 #include <linux/slab.h>
  98 #include <linux/uaccess.h>
  99 #include <linux/skbuff.h>
 100 #include <linux/netdevice.h>
 101 #include <net/net_namespace.h>
 102 #include <net/sock.h>
 103 #include <net/tcp_states.h>
 104 #include <net/af_unix.h>
 105 #include <linux/proc_fs.h>
 106 #include <linux/seq_file.h>
 107 #include <net/scm.h>
 108 #include <linux/init.h>
 109 #include <linux/poll.h>
 110 #include <linux/rtnetlink.h>
 111 #include <linux/mount.h>
 112 #include <net/checksum.h>
 113 #include <linux/security.h>
 114 #include <linux/freezer.h>
 115 #include <linux/file.h>
 116
 117 #include "scm.h"
 118
 119 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 120 EXPORT_SYMBOL_GPL(unix_socket_table);
 121 DEFINE_SPINLOCK(unix_table_lock);
 122 EXPORT_SYMBOL_GPL(unix_table_lock);
 123 static atomic_long_t unix_nr_socks;
 124
 125
 126 static struct hlist_head *unix_sockets_unbound(void *addr)
 127 {
 128         unsigned long hash = (unsigned long)addr;
 129
 130         hash ^= hash >> 16;
 131         hash ^= hash >> 8;
 132         hash %= UNIX_HASH_SIZE;
 133         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 134 }
 135
 136 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 137
 138 #ifdef CONFIG_SECURITY_NETWORK
 139 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 140 {
 141         UNIXCB(skb).secid = scm->secid;
 142 }
 143
 144 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 145 {
 146         scm->secid = UNIXCB(skb).secid;
 147 }
 148
 149 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 150 {
 151         return (scm->secid == UNIXCB(skb).secid);
 152 }
 153 #else
 154 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 155 { }
 156
 157 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 158 { }
 159
 160 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 161 {
 162         return true;
 163 }
 164 #endif /* CONFIG_SECURITY_NETWORK */
 165
 166 /*
 167  *  SMP locking strategy:
 168  *    hash table is protected with spinlock unix_table_lock
 169  *    each socket state is protected by separate spin lock.
 170  */
 171
 172 static inline unsigned int unix_hash_fold(__wsum n)
 173 {
 174         unsigned int hash = (__force unsigned int)csum_fold(n);
 175
 176         hash ^= hash>>8;
 177         return hash&(UNIX_HASH_SIZE-1);
 178 }
 179
 180 #define unix_peer(sk) (unix_sk(sk)->peer)
 181
 182 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 183 {
 184         return unix_peer(osk) == sk;
 185 }
 186
 187 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 188 {
 189         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 190 }
 191
 192 static inline int unix_recvq_full(const struct sock *sk)
 193 {
 194         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 195 }
 196
 197 static inline int unix_recvq_full_lockless(const struct sock *sk)
 198 {
 199         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 200                 READ_ONCE(sk->sk_max_ack_backlog);
 201 }
 202
 203 struct sock *unix_peer_get(struct sock *s)
 204 {
 205         struct sock *peer;
 206
 207         unix_state_lock(s);
 208         peer = unix_peer(s);
 209         if (peer)
 210                 sock_hold(peer);
 211         unix_state_unlock(s);
 212         return peer;
 213 }
 214 EXPORT_SYMBOL_GPL(unix_peer_get);
 215
 216 static inline void unix_release_addr(struct unix_address *addr)
 217 {
 218         if (refcount_dec_and_test(&addr->refcnt))
 219                 kfree(addr);
 220 }
 221
 222 /*
 223  *      Check unix socket name:
 224  *              - should be not zero length.
 225  *              - if started by not zero, should be NULL terminated (FS object)
 226  *              - if started by zero, it is abstract name.
 227  */
 228
 229 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 230 {
 231         *hashp = 0;
 232
 233         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 234                 return -EINVAL;
 235         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 236                 return -EINVAL;
 237         if (sunaddr->sun_path[0]) {
 238                 /*
 239                  * This may look like an off by one error but it is a bit more
 240                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 241                  * sun_path[108] doesn't as such exist.  However in kernel space
 242                  * we are guaranteed that it is a valid memory location in our
 243                  * kernel address buffer.
 244                  */
 245                 ((char *)sunaddr)[len] = 0;
 246                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 247                 return len;
 248         }
 249
 250         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 251         return len;
 252 }
 253
 254 static void __unix_remove_socket(struct sock *sk)
 255 {
 256         sk_del_node_init(sk);
 257 }
 258
 259 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 260 {
 261         WARN_ON(!sk_unhashed(sk));
 262         sk_add_node(sk, list);
 263 }
 264
 265 static inline void unix_remove_socket(struct sock *sk)
 266 {
 267         spin_lock(&unix_table_lock);
 268         __unix_remove_socket(sk);
 269         spin_unlock(&unix_table_lock);
 270 }
 271
 272 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 273 {
 274         spin_lock(&unix_table_lock);
 275         __unix_insert_socket(list, sk);
 276         spin_unlock(&unix_table_lock);
 277 }
 278
 279 static struct sock *__unix_find_socket_byname(struct net *net,
 280                                               struct sockaddr_un *sunname,
 281                                               int len, int type, unsigned int hash)
 282 {
 283         struct sock *s;
 284
 285         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 286                 struct unix_sock *u = unix_sk(s);
 287
 288                 if (!net_eq(sock_net(s), net))
 289                         continue;
 290
 291                 if (u->addr->len == len &&
 292                     !memcmp(u->addr->name, sunname, len))
 293                         return s;
 294         }
 295         return NULL;
 296 }
 297
 298 static inline struct sock *unix_find_socket_byname(struct net *net,
 299                                                    struct sockaddr_un *sunname,
 300                                                    int len, int type,
 301                                                    unsigned int hash)
 302 {
 303         struct sock *s;
 304
 305         spin_lock(&unix_table_lock);
 306         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 307         if (s)
 308                 sock_hold(s);
 309         spin_unlock(&unix_table_lock);
 310         return s;
 311 }
 312
 313 static struct sock *unix_find_socket_byinode(struct inode *i)
 314 {
 315         struct sock *s;
 316
 317         spin_lock(&unix_table_lock);
 318         sk_for_each(s,
 319                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 320                 struct dentry *dentry = unix_sk(s)->path.dentry;
 321
 322                 if (dentry && d_backing_inode(dentry) == i) {
 323                         sock_hold(s);
 324                         goto found;
 325                 }
 326         }
 327         s = NULL;
 328 found:
 329         spin_unlock(&unix_table_lock);
 330         return s;
 331 }
 332
 333 /* Support code for asymmetrically connected dgram sockets
 334  *
 335  * If a datagram socket is connected to a socket not itself connected
 336  * to the first socket (eg, /dev/log), clients may only enqueue more
 337  * messages if the present receive queue of the server socket is not
 338  * "too large". This means there's a second writeability condition
 339  * poll and sendmsg need to test. The dgram recv code will do a wake
 340  * up on the peer_wait wait queue of a socket upon reception of a
 341  * datagram which needs to be propagated to sleeping would-be writers
 342  * since these might not have sent anything so far. This can't be
 343  * accomplished via poll_wait because the lifetime of the server
 344  * socket might be less than that of its clients if these break their
 345  * association with it or if the server socket is closed while clients
 346  * are still connected to it and there's no way to inform "a polling
 347  * implementation" that it should let go of a certain wait queue
 348  *
 349  * In order to propagate a wake up, a wait_queue_entry_t of the client
 350  * socket is enqueued on the peer_wait queue of the server socket
 351  * whose wake function does a wake_up on the ordinary client socket
 352  * wait queue. This connection is established whenever a write (or
 353  * poll for write) hit the flow control condition and broken when the
 354  * association to the server socket is dissolved or after a wake up
 355  * was relayed.
 356  */
 357
 358 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 359                                       void *key)
 360 {
 361         struct unix_sock *u;
 362         wait_queue_head_t *u_sleep;
 363
 364         u = container_of(q, struct unix_sock, peer_wake);
 365
 366         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 367                             q);
 368         u->peer_wake.private = NULL;
 369
 370         /* relaying can only happen while the wq still exists */
 371         u_sleep = sk_sleep(&u->sk);
 372         if (u_sleep)
 373                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 374
 375         return 0;
 376 }
 377
 378 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 379 {
 380         struct unix_sock *u, *u_other;
 381         int rc;
 382
 383         u = unix_sk(sk);
 384         u_other = unix_sk(other);
 385         rc = 0;
 386         spin_lock(&u_other->peer_wait.lock);
 387
 388         if (!u->peer_wake.private) {
 389                 u->peer_wake.private = other;
 390                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 391
 392                 rc = 1;
 393         }
 394
 395         spin_unlock(&u_other->peer_wait.lock);
 396         return rc;
 397 }
 398
 399 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 400                                             struct sock *other)
 401 {
 402         struct unix_sock *u, *u_other;
 403
 404         u = unix_sk(sk);
 405         u_other = unix_sk(other);
 406         spin_lock(&u_other->peer_wait.lock);
 407
 408         if (u->peer_wake.private == other) {
 409                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 410                 u->peer_wake.private = NULL;
 411         }
 412
 413         spin_unlock(&u_other->peer_wait.lock);
 414 }
 415
 416 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 417                                                    struct sock *other)
 418 {
 419         unix_dgram_peer_wake_disconnect(sk, other);
 420         wake_up_interruptible_poll(sk_sleep(sk),
 421                                    EPOLLOUT |
 422                                    EPOLLWRNORM |
 423                                    EPOLLWRBAND);
 424 }
 425
 426 /* preconditions:
 427  *      - unix_peer(sk) == other
 428  *      - association is stable
 429  */
 430 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 431 {
 432         int connected;
 433
 434         connected = unix_dgram_peer_wake_connect(sk, other);
 435
 436         /* If other is SOCK_DEAD, we want to make sure we signal
 437          * POLLOUT, such that a subsequent write() can get a
 438          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 439          * to other and its full, we will hang waiting for POLLOUT.
 440          */
 441         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
 442                 return 1;
 443
 444         if (connected)
 445                 unix_dgram_peer_wake_disconnect(sk, other);
 446
 447         return 0;
 448 }
 449
 450 static int unix_writable(const struct sock *sk)
 451 {
 452         return sk->sk_state != TCP_LISTEN &&
 453                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 454 }
 455
 456 static void unix_write_space(struct sock *sk)
 457 {
 458         struct socket_wq *wq;
 459
 460         rcu_read_lock();
 461         if (unix_writable(sk)) {
 462                 wq = rcu_dereference(sk->sk_wq);
 463                 if (skwq_has_sleeper(wq))
 464                         wake_up_interruptible_sync_poll(&wq->wait,
 465                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 466                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 467         }
 468         rcu_read_unlock();
 469 }
 470
 471 /* When dgram socket disconnects (or changes its peer), we clear its receive
 472  * queue of packets arrived from previous peer. First, it allows to do
 473  * flow control based only on wmem_alloc; second, sk connected to peer
 474  * may receive messages only from that peer. */
 475 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 476 {
 477         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 478                 skb_queue_purge(&sk->sk_receive_queue);
 479                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 480
 481                 /* If one link of bidirectional dgram pipe is disconnected,
 482                  * we signal error. Messages are lost. Do not make this,
 483                  * when peer was not connected to us.
 484                  */
 485                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 486                         other->sk_err = ECONNRESET;
 487                         other->sk_error_report(other);
 488                 }
 489         }
 490 }
 491
 492 static void unix_sock_destructor(struct sock *sk)
 493 {
 494         struct unix_sock *u = unix_sk(sk);
 495
 496         skb_queue_purge(&sk->sk_receive_queue);
 497
 498         WARN_ON(refcount_read(&sk->sk_wmem_alloc));
 499         WARN_ON(!sk_unhashed(sk));
 500         WARN_ON(sk->sk_socket);
 501         if (!sock_flag(sk, SOCK_DEAD)) {
 502                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 503                 return;
 504         }
 505
 506         if (u->addr)
 507                 unix_release_addr(u->addr);
 508
 509         atomic_long_dec(&unix_nr_socks);
 510         local_bh_disable();
 511         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 512         local_bh_enable();
 513 #ifdef UNIX_REFCNT_DEBUG
 514         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 515                 atomic_long_read(&unix_nr_socks));
 516 #endif
 517 }
 518
 519 static void unix_release_sock(struct sock *sk, int embrion)
 520 {
 521         struct unix_sock *u = unix_sk(sk);
 522         struct path path;
 523         struct sock *skpair;
 524         struct sk_buff *skb;
 525         int state;
 526
 527         unix_remove_socket(sk);
 528
 529         /* Clear state */
 530         unix_state_lock(sk);
 531         sock_orphan(sk);
 532         sk->sk_shutdown = SHUTDOWN_MASK;
 533         path         = u->path;
 534         u->path.dentry = NULL;
 535         u->path.mnt = NULL;
 536         state = sk->sk_state;
 537         sk->sk_state = TCP_CLOSE;
 538
 539         skpair = unix_peer(sk);
 540         unix_peer(sk) = NULL;
 541
 542         unix_state_unlock(sk);
 543
 544         wake_up_interruptible_all(&u->peer_wait);
 545
 546         if (skpair != NULL) {
 547                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 548                         unix_state_lock(skpair);
 549                         /* No more writes */
 550                         skpair->sk_shutdown = SHUTDOWN_MASK;
 551                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 552                                 skpair->sk_err = ECONNRESET;
 553                         unix_state_unlock(skpair);
 554                         skpair->sk_state_change(skpair);
 555                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 556                 }
 557
 558                 unix_dgram_peer_wake_disconnect(sk, skpair);
 559                 sock_put(skpair); /* It may now die */
 560         }
 561
 562         /* Try to flush out this socket. Throw out buffers at least */
 563
 564         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 565                 if (state == TCP_LISTEN)
 566                         unix_release_sock(skb->sk, 1);
 567                 /* passed fds are erased in the kfree_skb hook        */
 568                 UNIXCB(skb).consumed = skb->len;
 569                 kfree_skb(skb);
 570         }
 571
 572         if (path.dentry)
 573                 path_put(&path);
 574
 575         sock_put(sk);
 576
 577         /* ---- Socket is dead now and most probably destroyed ---- */
 578
 579         /*
 580          * Fixme: BSD difference: In BSD all sockets connected to us get
 581          *        ECONNRESET and we die on the spot. In Linux we behave
 582          *        like files and pipes do and wait for the last
 583          *        dereference.
 584          *
 585          * Can't we simply set sock->err?
 586          *
 587          *        What the above comment does talk about? --ANK(980817)
 588          */
 589
 590         if (unix_tot_inflight)
 591                 unix_gc();              /* Garbage collect fds */
 592 }
 593
 594 static void init_peercred(struct sock *sk)
 595 {
 596         const struct cred *old_cred;
 597         struct pid *old_pid;
 598
 599         spin_lock(&sk->sk_peer_lock);
 600         old_pid = sk->sk_peer_pid;
 601         old_cred = sk->sk_peer_cred;
 602         sk->sk_peer_pid  = get_pid(task_tgid(current));
 603         sk->sk_peer_cred = get_current_cred();
 604         spin_unlock(&sk->sk_peer_lock);
 605
 606         put_pid(old_pid);
 607         put_cred(old_cred);
 608 }
 609
 610 static void copy_peercred(struct sock *sk, struct sock *peersk)
 611 {
 612         const struct cred *old_cred;
 613         struct pid *old_pid;
 614
 615         if (sk < peersk) {
 616                 spin_lock(&sk->sk_peer_lock);
 617                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 618         } else {
 619                 spin_lock(&peersk->sk_peer_lock);
 620                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 621         }
 622         old_pid = sk->sk_peer_pid;
 623         old_cred = sk->sk_peer_cred;
 624         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 625         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 626
 627         spin_unlock(&sk->sk_peer_lock);
 628         spin_unlock(&peersk->sk_peer_lock);
 629
 630         put_pid(old_pid);
 631         put_cred(old_cred);
 632 }
 633
 634 static int unix_listen(struct socket *sock, int backlog)
 635 {
 636         int err;
 637         struct sock *sk = sock->sk;
 638         struct unix_sock *u = unix_sk(sk);
 639
 640         err = -EOPNOTSUPP;
 641         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 642                 goto out;       /* Only stream/seqpacket sockets accept */
 643         err = -EINVAL;
 644         if (!u->addr)
 645                 goto out;       /* No listens on an unbound socket */
 646         unix_state_lock(sk);
 647         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 648                 goto out_unlock;
 649         if (backlog > sk->sk_max_ack_backlog)
 650                 wake_up_interruptible_all(&u->peer_wait);
 651         sk->sk_max_ack_backlog  = backlog;
 652         sk->sk_state            = TCP_LISTEN;
 653         /* set credentials so connect can copy them */
 654         init_peercred(sk);
 655         err = 0;
 656
 657 out_unlock:
 658         unix_state_unlock(sk);
 659 out:
 660         return err;
 661 }
 662
 663 static int unix_release(struct socket *);
 664 static int unix_bind(struct socket *, struct sockaddr *, int);
 665 static int unix_stream_connect(struct socket *, struct sockaddr *,
 666                                int addr_len, int flags);
 667 static int unix_socketpair(struct socket *, struct socket *);
 668 static int unix_accept(struct socket *, struct socket *, int, bool);
 669 static int unix_getname(struct socket *, struct sockaddr *, int);
 670 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
 671 static __poll_t unix_dgram_poll(struct file *, struct socket *,
 672                                     poll_table *);
 673 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 674 #ifdef CONFIG_COMPAT
 675 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 676 #endif
 677 static int unix_shutdown(struct socket *, int);
 678 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 679 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 680 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
 681                                     size_t size, int flags);
 682 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 683                                        struct pipe_inode_info *, size_t size,
 684                                        unsigned int flags);
 685 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 686 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 687 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 688                               int, int);
 689 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 690 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 691                                   int);
 692
 693 static int unix_set_peek_off(struct sock *sk, int val)
 694 {
 695         struct unix_sock *u = unix_sk(sk);
 696
 697         if (mutex_lock_interruptible(&u->iolock))
 698                 return -EINTR;
 699
 700         sk->sk_peek_off = val;
 701         mutex_unlock(&u->iolock);
 702
 703         return 0;
 704 }
 705
 706 #ifdef CONFIG_PROC_FS
 707 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 708 {
 709         struct sock *sk = sock->sk;
 710         struct unix_sock *u;
 711
 712         if (sk) {
 713                 u = unix_sk(sock->sk);
 714                 seq_printf(m, "scm_fds: %u\n",
 715                            atomic_read(&u->scm_stat.nr_fds));
 716         }
 717 }
 718 #else
 719 #define unix_show_fdinfo NULL
 720 #endif
 721
 722 static const struct proto_ops unix_stream_ops = {
 723         .family =       PF_UNIX,
 724         .owner =        THIS_MODULE,
 725         .release =      unix_release,
 726         .bind =         unix_bind,
 727         .connect =      unix_stream_connect,
 728         .socketpair =   unix_socketpair,
 729         .accept =       unix_accept,
 730         .getname =      unix_getname,
 731         .poll =         unix_poll,
 732         .ioctl =        unix_ioctl,
 733 #ifdef CONFIG_COMPAT
 734         .compat_ioctl = unix_compat_ioctl,
 735 #endif
 736         .listen =       unix_listen,
 737         .shutdown =     unix_shutdown,
 738         .sendmsg =      unix_stream_sendmsg,
 739         .recvmsg =      unix_stream_recvmsg,
 740         .mmap =         sock_no_mmap,
 741         .sendpage =     unix_stream_sendpage,
 742         .splice_read =  unix_stream_splice_read,
 743         .set_peek_off = unix_set_peek_off,
 744         .show_fdinfo =  unix_show_fdinfo,
 745 };
 746
 747 static const struct proto_ops unix_dgram_ops = {
 748         .family =       PF_UNIX,
 749         .owner =        THIS_MODULE,
 750         .release =      unix_release,
 751         .bind =         unix_bind,
 752         .connect =      unix_dgram_connect,
 753         .socketpair =   unix_socketpair,
 754         .accept =       sock_no_accept,
 755         .getname =      unix_getname,
 756         .poll =         unix_dgram_poll,
 757         .ioctl =        unix_ioctl,
 758 #ifdef CONFIG_COMPAT
 759         .compat_ioctl = unix_compat_ioctl,
 760 #endif
 761         .listen =       sock_no_listen,
 762         .shutdown =     unix_shutdown,
 763         .sendmsg =      unix_dgram_sendmsg,
 764         .recvmsg =      unix_dgram_recvmsg,
 765         .mmap =         sock_no_mmap,
 766         .sendpage =     sock_no_sendpage,
 767         .set_peek_off = unix_set_peek_off,
 768         .show_fdinfo =  unix_show_fdinfo,
 769 };
 770
 771 static const struct proto_ops unix_seqpacket_ops = {
 772         .family =       PF_UNIX,
 773         .owner =        THIS_MODULE,
 774         .release =      unix_release,
 775         .bind =         unix_bind,
 776         .connect =      unix_stream_connect,
 777         .socketpair =   unix_socketpair,
 778         .accept =       unix_accept,
 779         .getname =      unix_getname,
 780         .poll =         unix_dgram_poll,
 781         .ioctl =        unix_ioctl,
 782 #ifdef CONFIG_COMPAT
 783         .compat_ioctl = unix_compat_ioctl,
 784 #endif
 785         .listen =       unix_listen,
 786         .shutdown =     unix_shutdown,
 787         .sendmsg =      unix_seqpacket_sendmsg,
 788         .recvmsg =      unix_seqpacket_recvmsg,
 789         .mmap =         sock_no_mmap,
 790         .sendpage =     sock_no_sendpage,
 791         .set_peek_off = unix_set_peek_off,
 792         .show_fdinfo =  unix_show_fdinfo,
 793 };
 794
 795 static struct proto unix_proto = {
 796         .name                   = "UNIX",
 797         .owner                  = THIS_MODULE,
 798         .obj_size               = sizeof(struct unix_sock),
 799 };
 800
 /*
  * unix_create1 - allocate and initialise a new AF_UNIX sock.
  * @net:  network namespace passed to sk_alloc(); also the source of the
  *        sysctl_max_dgram_qlen value copied into sk_max_ack_backlog
  * @sock: socket passed to sock_init_data(); unix_stream_connect() passes
  *        NULL here
  * @kern: passed through to sk_alloc()
  *
  * The global unix_nr_socks counter is incremented first, and creation is
  * refused if it then exceeds 2 * get_max_files().  On success the sock is
  * inserted into the list returned by unix_sockets_unbound() and accounted
  * with sock_prot_inuse_add().  On either failure the counter increment is
  * undone.
  *
  * Returns the new sock, or NULL on failure.
  */
 801 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 802 {
 803         struct sock *sk = NULL;
 804         struct unix_sock *u;
 805
             /* Count first, check afterwards; the exit path undoes the bump. */
 806         atomic_long_inc(&unix_nr_socks);
 807         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 808                 goto out;
 809
 810         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
 811         if (!sk)
 812                 goto out;
 813
 814         sock_init_data(sock, sk);
 815
 816         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 817         sk->sk_write_space      = unix_write_space;
 818         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 819         sk->sk_destruct         = unix_sock_destructor;
 820         u         = unix_sk(sk);
 821         u->path.dentry = NULL;
 822         u->path.mnt = NULL;
 823         spin_lock_init(&u->lock);
 824         atomic_long_set(&u->inflight, 0);
 825         INIT_LIST_HEAD(&u->link);
 826         mutex_init(&u->iolock); /* single task reading lock */
 827         mutex_init(&u->bindlock); /* single task binding lock */
 828         init_waitqueue_head(&u->peer_wait);
 829         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 830         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
 831         unix_insert_socket(unix_sockets_unbound(sk), sk);
             /* Shared exit: sk is still NULL on both failure paths above. */
 832 out:
 833         if (sk == NULL)
 834                 atomic_long_dec(&unix_nr_socks);
 835         else {
                     /* The in-use counter is updated with bottom halves disabled. */
 836                 local_bh_disable();
 837                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 838                 local_bh_enable();
 839         }
 840         return sk;
 841 }
 842
 /*
  * unix_create - set up a freshly created AF_UNIX struct socket.
  * @net:      network namespace, passed to unix_create1()
  * @sock:     socket to initialise; sock->type selects the operations table
  * @protocol: must be 0 or PF_UNIX
  * @kern:     passed to unix_create1()
  *
  * Marks the socket SS_UNCONNECTED and picks sock->ops from its type.
  * SOCK_RAW is rewritten to SOCK_DGRAM and handled as such.
  *
  * Returns 0 on success, -EPROTONOSUPPORT for any other protocol,
  * -ESOCKTNOSUPPORT for an unhandled socket type, and -ENOMEM whenever
  * unix_create1() returns NULL (that includes its socket-count limit, which
  * is therefore also reported as -ENOMEM).
  */
 843 static int unix_create(struct net *net, struct socket *sock, int protocol,
 844                        int kern)
 845 {
 846         if (protocol && protocol != PF_UNIX)
 847                 return -EPROTONOSUPPORT;
 848
 849         sock->state = SS_UNCONNECTED;
 850
 851         switch (sock->type) {
 852         case SOCK_STREAM:
 853                 sock->ops = &unix_stream_ops;
 854                 break;
 855                 /*
 856                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 857                  *      nothing uses it.
 858                  */
 859         case SOCK_RAW:
 860                 sock->type = SOCK_DGRAM;
 861                 fallthrough;
 862         case SOCK_DGRAM:
 863                 sock->ops = &unix_dgram_ops;
 864                 break;
 865         case SOCK_SEQPACKET:
 866                 sock->ops = &unix_seqpacket_ops;
 867                 break;
 868         default:
 869                 return -ESOCKTNOSUPPORT;
 870         }
 871
 872         return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
 873 }
 874
 /*
  * unix_release - release the sock attached to @sock.
  *
  * Does nothing when no sock is attached.  Otherwise the sock is handed to
  * unix_release_sock() and sock->sk is cleared.  Always returns 0.
  */
 875 static int unix_release(struct socket *sock)
 876 {
 877         struct sock *sk = sock->sk;
 878
 879         if (!sk)
 880                 return 0;
 881
 882         unix_release_sock(sk, 0);
 883         sock->sk = NULL;
 884
 885         return 0;
 886 }
 887
 /*
  * unix_autobind - bind @sock to an automatically generated name.
  *
  * The generated name keeps sun_path[0] == 0 (the buffer comes from
  * kzalloc()) and is followed by the static counter 'ordernum' formatted
  * with "%05x".  Candidates are tried until one is not found by
  * __unix_find_socket_byname(); the 'retries' counter bounds the number of
  * collisions tolerated before giving up.
  *
  * u->bindlock is held for the whole operation, and the lookup plus the
  * final rehash run under unix_table_lock.
  *
  * Returns 0 on success or when the socket already has an address, the
  * error from mutex_lock_interruptible(), -ENOMEM if the address cannot be
  * allocated, or -ENOSPC when every attempt collided.
  */
 888 static int unix_autobind(struct socket *sock)
 889 {
 890         struct sock *sk = sock->sk;
 891         struct net *net = sock_net(sk);
 892         struct unix_sock *u = unix_sk(sk);
 893         static u32 ordernum = 1;
 894         struct unix_address *addr;
 895         int err;
 896         unsigned int retries = 0;
 897
 898         err = mutex_lock_interruptible(&u->bindlock);
 899         if (err)
 900                 return err;
 901
             /* Already bound: err is still 0 here, so this reports success. */
 902         if (u->addr)
 903                 goto out;
 904
 905         err = -ENOMEM;
             /* Header plus sizeof(short) plus 16 bytes for the generated name. */
 906         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 907         if (!addr)
 908                 goto out;
 909
 910         addr->name->sun_family = AF_UNIX;
 911         refcount_set(&addr->refcnt, 1);
 912
 913 retry:
             /* len = sprintf() result + 1 for the leading NUL byte + sizeof(short). */
 914         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 915         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 916
 917         spin_lock(&unix_table_lock);
             /* Advance the counter, kept within 20 bits, for the next attempt. */
 918         ordernum = (ordernum+1)&0xFFFFF;
 919
 920         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 921                                       addr->hash)) {
 922                 spin_unlock(&unix_table_lock);
 923                 /*
 924                  * __unix_find_socket_byname() may take long time if many names
 925                  * are already in use.
 926                  */
 927                 cond_resched();
 928                 /* Give up if all names seems to be in use. */
 929                 if (retries++ == 0xFFFFF) {
 930                         err = -ENOSPC;
 931                         kfree(addr);
 932                         goto out;
 933                 }
 934                 goto retry;
 935         }
             /* Mix the socket type into the hash, as unix_bind() also does. */
 936         addr->hash ^= sk->sk_type;
 937
             /*
              * Still under unix_table_lock: take sk off its current list,
              * publish the address with a release store, and insert sk into
              * the bucket selected by the new hash.
              */
 938         __unix_remove_socket(sk);
 939         smp_store_release(&u->addr, addr);
 940         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 941         spin_unlock(&unix_table_lock);
 942         err = 0;
 943
 944 out:    mutex_unlock(&u->bindlock);
 945         return err;
 946 }
 947
 /*
  * unix_find_other - find the sock that a sockaddr_un refers to.
  * @net:     namespace for the by-name lookup
  * @sunname: address to resolve
  * @len:     address length, used by the by-name lookup
  * @type:    sk_type the result must have
  * @hash:    hash value, used by the by-name lookup
  * @error:   receives the negative errno when NULL is returned
  *
  * When sun_path[0] is non-zero the name is resolved as a filesystem path:
  * the inode must pass inode_permission(MAY_WRITE), must be a socket
  * inode, and unix_find_socket_byinode() must find a sock for it
  * (-ECONNREFUSED otherwise).  A sock of the wrong type yields -EPROTOTYPE.
  * Otherwise the lookup goes through unix_find_socket_byname(), which
  * yields -ECONNREFUSED when nothing is found.
  *
  * Returns the sock, which callers in this file release with sock_put(),
  * or NULL with *error set.
  */
 948 static struct sock *unix_find_other(struct net *net,
 949                                     struct sockaddr_un *sunname, int len,
 950                                     int type, unsigned int hash, int *error)
 951 {
 952         struct sock *u;
 953         struct path path;
 954         int err = 0;
 955
 956         if (sunname->sun_path[0]) {
 957                 struct inode *inode;
 958                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 959                 if (err)
 960                         goto fail;
 961                 inode = d_backing_inode(path.dentry);
 962                 err = inode_permission(inode, MAY_WRITE);
 963                 if (err)
 964                         goto put_fail;
 965
 966                 err = -ECONNREFUSED;
 967                 if (!S_ISSOCK(inode->i_mode))
 968                         goto put_fail;
 969                 u = unix_find_socket_byinode(inode);
 970                 if (!u)
 971                         goto put_fail;
 972
                     /* The access time is only touched when the type matches. */
 973                 if (u->sk_type == type)
 974                         touch_atime(&path);
 975
                     /* The path is released here, so the mismatch case below uses 'fail'. */
 976                 path_put(&path);
 977
 978                 err = -EPROTOTYPE;
 979                 if (u->sk_type != type) {
 980                         sock_put(u);
 981                         goto fail;
 982                 }
 983         } else {
 984                 err = -ECONNREFUSED;
 985                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 986                 if (u) {
 987                         struct dentry *dentry;
                             /* A sock found by name may still carry a path; touch it if so. */
 988                         dentry = unix_sk(u)->path.dentry;
 989                         if (dentry)
 990                                 touch_atime(&unix_sk(u)->path);
 991                 } else
 992                         goto fail;
 993         }
 994         return u;
 995
 996 put_fail:
 997         path_put(&path);
 998 fail:
 999         *error = err;
1000         return NULL;
1001 }
1002
/*
 * unix_mknod - create the filesystem node for a path-bound socket.
 * @sun_path: path of the node to create, resolved relative to AT_FDCWD
 * @mode:     mode passed to security_path_mknod() and vfs_mknod()
 * @res:      on success receives references to the mount (mntget()) and
 *            to the new dentry (dget()); untouched on failure
 *
 * Returns 0 on success, or the negative errno from kern_path_create(),
 * security_path_mknod() or vfs_mknod().
 */
1003 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
1004 {
1005         struct dentry *dentry;
1006         struct path path;
1007         int err = 0;
1008         /*
1009          * Get the parent directory, calculate the hash for last
1010          * component.
1011          */
1012         dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
1013         err = PTR_ERR(dentry);
1014         if (IS_ERR(dentry))
1015                 return err;
1016
1017         /*
1018          * All right, let's create it.
1019          */
1020         err = security_path_mknod(&path, dentry, mode, 0);
1021         if (!err) {
1022                 err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
1023                 if (!err) {
1024                         res->mnt = mntget(path.mnt);
1025                         res->dentry = dget(dentry);
1026                 }
1027         }
             /* Called whether or not the node was created. */
1028         done_path_create(&path, dentry);
1029         return err;
1030 }
1031
/*
 * unix_bind - bind @sock to the address in @uaddr.
 *
 *  - @addr_len too short to hold sun_family, or a family other than
 *    AF_UNIX: -EINVAL.
 *  - @addr_len == sizeof(short) (family only): handled by unix_autobind().
 *  - sun_path[0] != 0: the filesystem node is created first with
 *    unix_mknod() (-EEXIST is reported as -EADDRINUSE) and the socket is
 *    then hashed by the inode number of the new node.
 *  - otherwise: the name must not already be found by
 *    __unix_find_socket_byname(), else -EADDRINUSE.
 *
 * A socket that already has an address gets -EINVAL.  u->bindlock
 * serialises binds on the same socket; the hash-table update runs under
 * unix_table_lock.
 *
 * Returns 0 on success or a negative errno.
 *
 * NOTE(review): when a step after a successful unix_mknod() fails
 * (interrupted bindlock, u->addr already set, kmalloc() failure), the only
 * cleanup visible here is path_put(); nothing in this function removes the
 * node that was just created.  Worth confirming whether that is intended.
 */
1032 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1033 {
1034         struct sock *sk = sock->sk;
1035         struct net *net = sock_net(sk);
1036         struct unix_sock *u = unix_sk(sk);
1037         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1038         char *sun_path = sunaddr->sun_path;
1039         int err;
1040         unsigned int hash;
1041         struct unix_address *addr;
1042         struct hlist_head *list;
1043         struct path path = { };
1044
1045         err = -EINVAL;
1046         if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
1047             sunaddr->sun_family != AF_UNIX)
1048                 goto out;
1049
1050         if (addr_len == sizeof(short)) {
1051                 err = unix_autobind(sock);
1052                 goto out;
1053         }
1054
             /* On success unix_mkname() returns the length to use from here on. */
1055         err = unix_mkname(sunaddr, addr_len, &hash);
1056         if (err < 0)
1057                 goto out;
1058         addr_len = err;
1059
1060         if (sun_path[0]) {
                     /* S_IFSOCK plus the socket inode's mode with the umask applied. */
1061                 umode_t mode = S_IFSOCK |
1062                        (SOCK_INODE(sock)->i_mode & ~current_umask());
1063                 err = unix_mknod(sun_path, mode, &path);
1064                 if (err) {
1065                         if (err == -EEXIST)
1066                                 err = -EADDRINUSE;
1067                         goto out;
1068                 }
1069         }
1070
1071         err = mutex_lock_interruptible(&u->bindlock);
1072         if (err)
1073                 goto out_put;
1074
1075         err = -EINVAL;
1076         if (u->addr)
1077                 goto out_up;
1078
1079         err = -ENOMEM;
1080         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1081         if (!addr)
1082                 goto out_up;
1083
1084         memcpy(addr->name, sunaddr, addr_len);
1085         addr->len = addr_len;
1086         addr->hash = hash ^ sk->sk_type;
1087         refcount_set(&addr->refcnt, 1);
1088
1089         if (sun_path[0]) {
                     /*
                      * Path-bound: addr->hash is overwritten with UNIX_HASH_SIZE
                      * and the bucket is chosen from the inode number instead.
                      */
1090                 addr->hash = UNIX_HASH_SIZE;
1091                 hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1092                 spin_lock(&unix_table_lock);
1093                 u->path = path;
1094                 list = &unix_socket_table[hash];
1095         } else {
1096                 spin_lock(&unix_table_lock);
1097                 err = -EADDRINUSE;
1098                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
1099                                               sk->sk_type, hash)) {
1100                         unix_release_addr(addr);
1101                         goto out_unlock;
1102                 }
1103
1104                 list = &unix_socket_table[addr->hash];
1105         }
1106
1107         err = 0;
             /* Rehash sk under unix_table_lock; u->addr is published with a release store. */
1108         __unix_remove_socket(sk);
1109         smp_store_release(&u->addr, addr);
1110         __unix_insert_socket(list, sk);
1111
1112 out_unlock:
1113         spin_unlock(&unix_table_lock);
1114 out_up:
1115         mutex_unlock(&u->bindlock);
1116 out_put:
             /* On success the references in 'path' now belong to u->path. */
1117         if (err)
1118                 path_put(&path);
1119 out:
1120         return err;
1121 }
1122
/*
 * Take the state locks of @sk1 and @sk2.  If both are the same sock, or
 * @sk2 is NULL, only @sk1 is locked.  Otherwise the sock with the lower
 * address is locked first and the other one is taken with
 * unix_state_lock_nested().
 */
1123 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1124 {
1125         if (unlikely(sk1 == sk2) || !sk2) {
1126                 unix_state_lock(sk1);
1127                 return;
1128         }
1129         if (sk1 < sk2) {
1130                 unix_state_lock(sk1);
1131                 unix_state_lock_nested(sk2);
1132         } else {
1133                 unix_state_lock(sk2);
1134                 unix_state_lock_nested(sk1);
1135         }
1136 }
1137
/*
 * Release the state locks of @sk1 and @sk2.  Only @sk1 is unlocked when
 * both are the same sock or @sk2 is NULL; otherwise both are unlocked,
 * @sk1 first.
 */
1138 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1139 {
1140         if (unlikely(sk1 == sk2) || !sk2) {
1141                 unix_state_unlock(sk1);
1142                 return;
1143         }
1144         unix_state_unlock(sk1);
1145         unix_state_unlock(sk2);
1146 }
1147
/*
 * unix_dgram_connect - set, change or clear the peer of @sock.
 * @sock:  socket being connected
 * @addr:  destination address; sa_family == AF_UNSPEC clears the peer
 * @alen:  length of @addr; must at least cover sa_family (-EINVAL)
 * @flags: not used by this function
 *
 * For a real address, the socket is autobound first when SOCK_PASSCRED is
 * set and it has no address yet.  The destination is then looked up with
 * unix_find_other() and must pass unix_may_send() (-EPERM) and
 * security_unix_may_send().  The peer pointer is switched while both state
 * locks are held.
 *
 * Returns 0 on success or a negative errno.
 */
1148 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1149                               int alen, int flags)
1150 {
1151         struct sock *sk = sock->sk;
1152         struct net *net = sock_net(sk);
1153         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1154         struct sock *other;
1155         unsigned int hash;
1156         int err;
1157
1158         err = -EINVAL;
1159         if (alen < offsetofend(struct sockaddr, sa_family))
1160                 goto out;
1161
1162         if (addr->sa_family != AF_UNSPEC) {
1163                 err = unix_mkname(sunaddr, alen, &hash);
1164                 if (err < 0)
1165                         goto out;
1166                 alen = err;
1167
1168                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1169                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1170                         goto out;
1171
1172 restart:
1173                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1174                 if (!other)
1175                         goto out;
1176
1177                 unix_state_double_lock(sk, other);
1178
1179                 /* Apparently VFS overslept socket death. Retry. */
1180                 if (sock_flag(other, SOCK_DEAD)) {
1181                         unix_state_double_unlock(sk, other);
1182                         sock_put(other);
1183                         goto restart;
1184                 }
1185
1186                 err = -EPERM;
1187                 if (!unix_may_send(sk, other))
1188                         goto out_unlock;
1189
1190                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1191                 if (err)
1192                         goto out_unlock;
1193
1194         } else {
1195                 /*
1196                  *      1003.1g breaking connected state with AF_UNSPEC
1197                  */
1198                 other = NULL;
                     /* With a NULL second argument only sk's state lock is taken. */
1199                 unix_state_double_lock(sk, other);
1200         }
1201
1202         /*
1203          * If it was connected, reconnect.
1204          */
1205         if (unix_peer(sk)) {
1206                 struct sock *old_peer = unix_peer(sk);
1207                 unix_peer(sk) = other;
1208                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1209
1210                 unix_state_double_unlock(sk, other);
1211
                     /*
                      * The locks are dropped before the old peer is notified
                      * and the reference on it is released.
                      */
1212                 if (other != old_peer)
1213                         unix_dgram_disconnected(sk, old_peer);
1214                 sock_put(old_peer);
1215         } else {
1216                 unix_peer(sk) = other;
1217                 unix_state_double_unlock(sk, other);
1218         }
             /*
              * Success: 'other' is not put here; the sock returned by
              * unix_find_other() is now stored in unix_peer(sk).
              */
1219         return 0;
1220
1221 out_unlock:
1222         unix_state_double_unlock(sk, other);
1223         sock_put(other);
1224 out:
1225         return err;
1226 }
1227
/*
 * unix_wait_for_peer - wait on @other's peer_wait queue.
 * @other: sock whose state lock is held on entry; it is released here
 * @timeo: timeout passed to schedule_timeout()
 *
 * The task is queued (exclusive, TASK_INTERRUPTIBLE) before the condition
 * is sampled and the lock is dropped.  It only sleeps if @other is not
 * dead, has not shut down receiving, and its receive queue is full.
 *
 * Returns the value from schedule_timeout(), or @timeo unchanged when no
 * sleep was needed.
 */
1228 static long unix_wait_for_peer(struct sock *other, long timeo)
1229         __releases(&unix_sk(other)->lock)
1230 {
1231         struct unix_sock *u = unix_sk(other);
1232         int sched;
1233         DEFINE_WAIT(wait);
1234
1235         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1236
1237         sched = !sock_flag(other, SOCK_DEAD) &&
1238                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1239                 unix_recvq_full(other);
1240
1241         unix_state_unlock(other);
1242
1243         if (sched)
1244                 timeo = schedule_timeout(timeo);
1245
1246         finish_wait(&u->peer_wait, &wait);
1247         return timeo;
1248 }
1249
/*
 * unix_stream_connect - connect @sock to a listening AF_UNIX socket (this
 * is the .connect handler in unix_seqpacket_ops).
 *
 * Outline:
 *  - build the address with unix_mkname(), and autobind first when
 *    SOCK_PASSCRED is set and the socket has no address;
 *  - before any state lock is taken, allocate the sock for the other end
 *    of the connection (newsk) and a 1-byte skb via sock_wmalloc(newsk, ...);
 *  - find the listener, which must be in TCP_LISTEN without RCV_SHUTDOWN,
 *    otherwise -ECONNREFUSED;
 *  - if its receive queue is full, fail with -EAGAIN when the timeout is
 *    zero, otherwise wait and start over;
 *  - our own sock must be in TCP_CLOSE (-EISCONN if TCP_ESTABLISHED,
 *    -EINVAL for any other state);
 *  - link sk and newsk as peers, copy the listener's address and path to
 *    newsk, and queue the skb on the listener's receive queue.
 *
 * Returns 0 on success or a negative errno.  On failure the pre-allocated
 * skb and newsk are released.
 */
1250 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1251                                int addr_len, int flags)
1252 {
1253         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1254         struct sock *sk = sock->sk;
1255         struct net *net = sock_net(sk);
1256         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1257         struct sock *newsk = NULL;
1258         struct sock *other = NULL;
1259         struct sk_buff *skb = NULL;
1260         unsigned int hash;
1261         int st;
1262         int err;
1263         long timeo;
1264
1265         err = unix_mkname(sunaddr, addr_len, &hash);
1266         if (err < 0)
1267                 goto out;
1268         addr_len = err;
1269
1270         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1271             (err = unix_autobind(sock)) != 0)
1272                 goto out;
1273
1274         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1275
1276         /* First of all allocate resources.
1277            If we will make it after state is locked,
1278            we will have to recheck all again in any case.
1279          */
1280
1281         err = -ENOMEM;
1282
1283         /* create new sock for complete connection */
1284         newsk = unix_create1(sock_net(sk), NULL, 0);
1285         if (newsk == NULL)
1286                 goto out;
1287
1288         /* Allocate skb for sending to listening sock */
1289         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1290         if (skb == NULL)
1291                 goto out;
1292
1293 restart:
1294         /*  Find listening sock. */
1295         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1296         if (!other)
1297                 goto out;
1298
1299         /* Latch state of peer */
1300         unix_state_lock(other);
1301
1302         /* Apparently VFS overslept socket death. Retry. */
1303         if (sock_flag(other, SOCK_DEAD)) {
1304                 unix_state_unlock(other);
1305                 sock_put(other);
1306                 goto restart;
1307         }
1308
1309         err = -ECONNREFUSED;
1310         if (other->sk_state != TCP_LISTEN)
1311                 goto out_unlock;
1312         if (other->sk_shutdown & RCV_SHUTDOWN)
1313                 goto out_unlock;
1314
1315         if (unix_recvq_full(other)) {
1316                 err = -EAGAIN;
1317                 if (!timeo)
1318                         goto out_unlock;
1319
1320                 timeo = unix_wait_for_peer(other, timeo);
1321
1322                 err = sock_intr_errno(timeo);
                     /*
                      * unix_wait_for_peer() already released other's state
                      * lock, hence 'out' rather than 'out_unlock'.
                      */
1323                 if (signal_pending(current))
1324                         goto out;
1325                 sock_put(other);
1326                 goto restart;
1327         }
1328
1329         /* Latch our state.
1330
1331            It is tricky place. We need to grab our state lock and cannot
1332            drop lock on peer. It is dangerous because deadlock is
1333            possible. Connect to self case and simultaneous
1334            attempt to connect are eliminated by checking socket
1335            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1336            check this before attempt to grab lock.
1337
1338            Well, and we have to recheck the state after socket locked.
1339          */
1340         st = sk->sk_state;
1341
1342         switch (st) {
1343         case TCP_CLOSE:
1344                 /* This is ok... continue with connect */
1345                 break;
1346         case TCP_ESTABLISHED:
1347                 /* Socket is already connected */
1348                 err = -EISCONN;
1349                 goto out_unlock;
1350         default:
1351                 err = -EINVAL;
1352                 goto out_unlock;
1353         }
1354
1355         unix_state_lock_nested(sk);
1356
1357         if (sk->sk_state != st) {
1358                 unix_state_unlock(sk);
1359                 unix_state_unlock(other);
1360                 sock_put(other);
1361                 goto restart;
1362         }
1363
1364         err = security_unix_stream_connect(sk, other, newsk);
1365         if (err) {
                     /* Drop our own lock here; out_unlock releases other's. */
1366                 unix_state_unlock(sk);
1367                 goto out_unlock;
1368         }
1369
1370         /* The way is open! Fastly set all the necessary fields... */
1371
1372         sock_hold(sk);
1373         unix_peer(newsk)        = sk;
1374         newsk->sk_state         = TCP_ESTABLISHED;
1375         newsk->sk_type          = sk->sk_type;
1376         init_peercred(newsk);
1377         newu = unix_sk(newsk);
1378         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1379         otheru = unix_sk(other);
1380
1381         /* copy address information from listening to new sock
1382          *
1383          * The contents of *(otheru->addr) and otheru->path
1384          * are seen fully set up here, since we have found
1385          * otheru in hash under unix_table_lock.  Insertion
1386          * into the hash chain we'd found it in had been done
1387          * in an earlier critical area protected by unix_table_lock,
1388          * the same one where we'd set *(otheru->addr) contents,
1389          * as well as otheru->path and otheru->addr itself.
1390          *
1391          * Using smp_store_release() here to set newu->addr
1392          * is enough to make those stores, as well as stores
1393          * to newu->path visible to anyone who gets newu->addr
1394          * by smp_load_acquire().  IOW, the same warranties
1395          * as for unix_sock instances bound in unix_bind() or
1396          * in unix_autobind().
1397          */
1398         if (otheru->path.dentry) {
1399                 path_get(&otheru->path);
1400                 newu->path = otheru->path;
1401         }
1402         refcount_inc(&otheru->addr->refcnt);
1403         smp_store_release(&newu->addr, otheru->addr);
1404
1405         /* Set credentials */
1406         copy_peercred(sk, other);
1407
1408         sock->state     = SS_CONNECTED;
1409         sk->sk_state    = TCP_ESTABLISHED;
1410         sock_hold(newsk);
1411
1412         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1413         unix_peer(sk)   = newsk;
1414
1415         unix_state_unlock(sk);
1416
1417         /* take ten and send info to listening sock */
1418         spin_lock(&other->sk_receive_queue.lock);
1419         __skb_queue_tail(&other->sk_receive_queue, skb);
1420         spin_unlock(&other->sk_receive_queue.lock);
1421         unix_state_unlock(other);
1422         other->sk_data_ready(other);
1423         sock_put(other);
1424         return 0;
1425
1426 out_unlock:
1427         if (other)
1428                 unix_state_unlock(other);
1429
1430 out:
1431         kfree_skb(skb);
1432         if (newsk)
1433                 unix_release_sock(newsk, 0);
1434         if (other)
1435                 sock_put(other);
1436         return err;
1437 }
1438
/*
 * unix_socketpair - make the socks of @socka and @sockb each other's peer.
 *
 * A reference is taken on each sock before the peer pointers are set, and
 * both get their peer credentials from init_peercred().  For any type
 * other than SOCK_DGRAM both ends are also marked TCP_ESTABLISHED and
 * SS_CONNECTED.  Always returns 0.
 */
1439 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1440 {
1441         struct sock *ska = socka->sk, *skb = sockb->sk;
1442
1443         /* Join our sockets back to back */
1444         sock_hold(ska);
1445         sock_hold(skb);
1446         unix_peer(ska) = skb;
1447         unix_peer(skb) = ska;
1448         init_peercred(ska);
1449         init_peercred(skb);
1450
1451         if (ska->sk_type != SOCK_DGRAM) {
1452                 ska->sk_state = TCP_ESTABLISHED;
1453                 skb->sk_state = TCP_ESTABLISHED;
1454                 socka->state  = SS_CONNECTED;
1455                 sockb->state  = SS_CONNECTED;
1456         }
1457         return 0;
1458 }
1459
/*
 * Set SOCK_PASSCRED and SOCK_PASSSEC on @new when they are set on @old.
 * Bits already set on @new are never cleared.
 */
1460 static void unix_sock_inherit_flags(const struct socket *old,
1461                                     struct socket *new)
1462 {
1463         if (test_bit(SOCK_PASSCRED, &old->flags))
1464                 set_bit(SOCK_PASSCRED, &new->flags);
1465         if (test_bit(SOCK_PASSSEC, &old->flags))
1466                 set_bit(SOCK_PASSSEC, &new->flags);
1467 }
1468
/*
 * unix_accept - accept a pending connection on listening socket @sock.
 * @sock:    listening socket
 * @newsock: socket that receives the accepted connection
 * @flags:   only O_NONBLOCK is examined (non-blocking dequeue)
 * @kern:    not used by this function
 *
 * Only SOCK_STREAM and SOCK_SEQPACKET are accepted (-EOPNOTSUPP otherwise)
 * and the sock must be in TCP_LISTEN (-EINVAL).  One skb is taken from the
 * listener's receive queue and the sock it points to (skb->sk) becomes the
 * accepted sock.  Waiters on the listener's peer_wait are woken, and the
 * accepted sock is grafted onto @newsock, which also inherits the
 * SOCK_PASSCRED and SOCK_PASSSEC flags of @sock.
 *
 * Returns 0 on success or a negative errno.
 */
1469 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1470                        bool kern)
1471 {
1472         struct sock *sk = sock->sk;
1473         struct sock *tsk;
1474         struct sk_buff *skb;
1475         int err;
1476
1477         err = -EOPNOTSUPP;
1478         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1479                 goto out;
1480
1481         err = -EINVAL;
1482         if (sk->sk_state != TCP_LISTEN)
1483                 goto out;
1484
1485         /* If socket state is TCP_LISTEN it cannot change (for now...),
1486          * so that no locks are necessary.
1487          */
1488
1489         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1490         if (!skb) {
1491                 /* This means receive shutdown. */
1492                 if (err == 0)
1493                         err = -EINVAL;
1494                 goto out;
1495         }
1496
             /* Remember the sock before the skb is freed. */
1497         tsk = skb->sk;
1498         skb_free_datagram(sk, skb);
1499         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1500
1501         /* attach accepted sock to socket */
1502         unix_state_lock(tsk);
1503         newsock->state = SS_CONNECTED;
1504         unix_sock_inherit_flags(sock, newsock);
1505         sock_graft(tsk, newsock);
1506         unix_state_unlock(tsk);
1507         return 0;
1508
1509 out:
1510         return err;
1511 }
1512
1513
/*
 * unix_getname - copy a socket's own address or its peer's into @uaddr.
 * @sock:  socket being queried
 * @uaddr: destination buffer, accessed as struct sockaddr_un
 * @peer:  non-zero to report the peer's address instead of the local one
 *
 * Returns the length of the address written.  When the sock has no
 * address, that is sizeof(short), with sun_family set to AF_UNIX and
 * sun_path[0] cleared.  Returns -ENOTCONN when the peer is requested and
 * there is none.
 */
1514 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1515 {
1516         struct sock *sk = sock->sk;
1517         struct unix_address *addr;
1518         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1519         int err = 0;
1520
1521         if (peer) {
1522                 sk = unix_peer_get(sk);
1523
1524                 err = -ENOTCONN;
1525                 if (!sk)
1526                         goto out;
1527                 err = 0;
1528         } else {
1529                 sock_hold(sk);
1530         }
1531
             /* Acquire load; u->addr is published with smp_store_release() on bind. */
1532         addr = smp_load_acquire(&unix_sk(sk)->addr);
1533         if (!addr) {
1534                 sunaddr->sun_family = AF_UNIX;
1535                 sunaddr->sun_path[0] = 0;
1536                 err = sizeof(short);
1537         } else {
1538                 err = addr->len;
1539                 memcpy(sunaddr, addr->name, addr->len);
1540         }
             /*
              * Balances sock_hold() above, or the reference that
              * unix_peer_get() presumably returned (confirm against its
              * definition).
              */
1541         sock_put(sk);
1542 out:
1543         return err;
1544 }
1545
/*
 * unix_peek_fds - duplicate the fd list attached to @skb into @scm->fp,
 * then take and immediately release unix_gc_lock.  The comment inside the
 * function explains why the empty lock/unlock pair is needed.
 */
1546 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1547 {
1548         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1549
1550         /*
1551          * Garbage collection of unix sockets starts by selecting a set of
1552          * candidate sockets which have reference only from being in flight
1553          * (total_refs == inflight_refs).  This condition is checked once during
1554          * the candidate collection phase, and candidates are marked as such, so
1555          * that non-candidates can later be ignored.  While inflight_refs is
1556          * protected by unix_gc_lock, total_refs (file count) is not, hence this
1557          * is an instantaneous decision.
1558          *
1559          * Once a candidate, however, the socket must not be reinstalled into a
1560          * file descriptor while the garbage collection is in progress.
1561          *
1562          * If the above conditions are met, then the directed graph of
1563          * candidates (*) does not change while unix_gc_lock is held.
1564          *
1565          * Any operation that changes the file count through file descriptors
1566          * (dup, close, sendmsg) does not change the graph since candidates are
1567          * not installed in fds.
1568          *
1569          * Dequeuing a candidate via recvmsg would install it into an fd, but
1570          * that takes unix_gc_lock to decrement the inflight count, so it's
1571          * serialized with garbage collection.
1572          *
1573          * MSG_PEEK is special in that it does not change the inflight count,
1574          * yet does install the socket into an fd.  The following lock/unlock
1575          * pair is to ensure serialization with garbage collection.  It must be
1576          * done between incrementing the file count and installing the file into
1577          * an fd.
1578          *
1579          * If garbage collection starts after the barrier provided by the
1580          * lock/unlock, then it will see the elevated refcount and not mark this
1581          * as a candidate.  If a garbage collection is already in progress
1582          * before the file count was incremented, then the lock/unlock pair will
1583          * ensure that garbage collection is finished before progressing to
1584          * installing the fd.
1585          *
1586          * (*) A -> B where B is on the queue of A or B is on the queue of C
1587          * which is on the queue of listening socket A.
1588          */
1589         spin_lock(&unix_gc_lock);
1590         spin_unlock(&unix_gc_lock);
1591 }
1592
/*
 * unix_scm_to_skb - copy control-message state from @scm into @skb.
 * @scm:      source of pid, uid/gid and, optionally, the fd list
 * @skb:      buffer whose UNIXCB() area is filled in
 * @send_fds: attach scm->fp to the skb when it is non-NULL
 *
 * Stores a reference to scm->pid (get_pid()) together with the uid and
 * gid, clears UNIXCB(skb).fp, and calls unix_get_secdata().  The skb
 * destructor is set to unix_destruct_scm even when attaching fds failed.
 *
 * Returns 0, or the error from unix_attach_fds().
 */
1593 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1594 {
1595         int err = 0;
1596
1597         UNIXCB(skb).pid  = get_pid(scm->pid);
1598         UNIXCB(skb).uid = scm->creds.uid;
1599         UNIXCB(skb).gid = scm->creds.gid;
1600         UNIXCB(skb).fp = NULL;
1601         unix_get_secdata(scm, skb);
1602         if (scm->fp && send_fds)
1603                 err = unix_attach_fds(scm, skb);
1604
1605         skb->destructor = unix_destruct_scm;
1606         return err;
1607 }
1608
/*
 * True if SOCK_PASSCRED is set on @sock, or @other has no sk_socket, or
 * SOCK_PASSCRED is set on @other's socket.  The NULL test on
 * other->sk_socket comes before the dereference.
 */
1609 static bool unix_passcred_enabled(const struct socket *sock,
1610                                   const struct sock *other)
1611 {
1612         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1613                !other->sk_socket ||
1614                test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1615 }
1616
1617 /*
1618  * Some apps rely on write() giving SCM_CREDENTIALS
1619  * We include credentials if source or destination socket
1620  * asserted SOCK_PASSCRED.
1621  */
/*
 * maybe_add_creds - fill in sender credentials on @skb if they are missing.
 *
 * Does nothing when UNIXCB(skb).pid is already set.  Otherwise, if
 * unix_passcred_enabled(@sock, @other) holds, the current task's tgid
 * (with a reference from get_pid()) and its uid/gid are stored in the skb.
 */
1622 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1623                             const struct sock *other)
1624 {
1625         if (UNIXCB(skb).pid)
1626                 return;
1627         if (unix_passcred_enabled(sock, other)) {
1628                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1629                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1630         }
1631 }
1632
/*
 * maybe_init_creds - initialise @scm for a send that has no control data.
 *
 * Calls scm_send() with a message whose msg_controllen is 0.  If that
 * succeeds and unix_passcred_enabled(@socket, @other) holds, the current
 * task's tgid (with a reference from get_pid()) and its uid/gid are
 * stored in @scm.
 *
 * Returns 0, or the error from scm_send().
 */
1633 static int maybe_init_creds(struct scm_cookie *scm,
1634                             struct socket *socket,
1635                             const struct sock *other)
1636 {
1637         int err;
1638         struct msghdr msg = { .msg_controllen = 0 };
1639
1640         err = scm_send(socket, &msg, scm, false);
1641         if (err)
1642                 return err;
1643
1644         if (unix_passcred_enabled(socket, other)) {
1645                 scm->pid = get_pid(task_tgid(current));
1646                 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1647         }
1648         return err;
1649 }
1650
/*
 * True if @skb carries the same pid, uid, gid and security data as @scm.
 * The pid is compared with ==, uid and gid with uid_eq()/gid_eq(), and the
 * security data with unix_secdata_eq().
 */
1651 static bool unix_skb_scm_eq(struct sk_buff *skb,
1652                             struct scm_cookie *scm)
1653 {
1654         const struct unix_skb_parms *u = &UNIXCB(skb);
1655
1656         return u->pid == scm->pid &&
1657                uid_eq(u->uid, scm->creds.uid) &&
1658                gid_eq(u->gid, scm->creds.gid) &&
1659                unix_secdata_eq(scm, skb);
1660 }
1661
/*
 * Add the number of fds attached to @skb to @sk's scm_stat.nr_fds.
 * Nothing is done when the skb has no fd list or the list is empty.
 */
1662 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1663 {
1664         struct scm_fp_list *fp = UNIXCB(skb).fp;
1665         struct unix_sock *u = unix_sk(sk);
1666
1667         if (unlikely(fp && fp->count))
1668                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1669 }
1670
/*
 * Subtract the number of fds attached to @skb from @sk's scm_stat.nr_fds.
 * Nothing is done when the skb has no fd list or the list is empty.
 */
1671 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1672 {
1673         struct scm_fp_list *fp = UNIXCB(skb).fp;
1674         struct unix_sock *u = unix_sk(sk);
1675
1676         if (unlikely(fp && fp->count))
1677                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1678 }
1679
1680 /*
1681  *      Send AF_UNIX data.
1682  */
1683
1684 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1685                               size_t len)
1686 {
1687         struct sock *sk = sock->sk;
1688         struct net *net = sock_net(sk);
1689         struct unix_sock *u = unix_sk(sk);
1690         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1691         struct sock *other = NULL;
1692         int namelen = 0; /* fake GCC */
1693         int err;
1694         unsigned int hash;
1695         struct sk_buff *skb;
1696         long timeo;
1697         struct scm_cookie scm;
1698         int data_len = 0;
1699         int sk_locked;
1700
1701         wait_for_unix_gc();
1702         err = scm_send(sock, msg, &scm, false);
1703         if (err < 0)
1704                 return err;
1705
1706         err = -EOPNOTSUPP;
1707         if (msg->msg_flags&MSG_OOB)
1708                 goto out;
1709
1710         if (msg->msg_namelen) {
1711                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1712                 if (err < 0)
1713                         goto out;
1714                 namelen = err;
1715         } else {
1716                 sunaddr = NULL;
1717                 err = -ENOTCONN;
1718                 other = unix_peer_get(sk);
1719                 if (!other)
1720                         goto out;
1721         }
1722
1723         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1724             && (err = unix_autobind(sock)) != 0)
1725                 goto out;
1726
1727         err = -EMSGSIZE;
1728         if (len > sk->sk_sndbuf - 32)
1729                 goto out;
1730
1731         if (len > SKB_MAX_ALLOC) {
1732                 data_len = min_t(size_t,
1733                                  len - SKB_MAX_ALLOC,
1734                                  MAX_SKB_FRAGS * PAGE_SIZE);
1735                 data_len = PAGE_ALIGN(data_len);
1736
1737                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1738         }
1739
1740         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1741                                    msg->msg_flags & MSG_DONTWAIT, &err,
1742                                    PAGE_ALLOC_COSTLY_ORDER);
1743         if (skb == NULL)
1744                 goto out;
1745
1746         err = unix_scm_to_skb(&scm, skb, true);
1747         if (err < 0)
1748                 goto out_free;
1749
1750         skb_put(skb, len - data_len);
1751         skb->data_len = data_len;
1752         skb->len = len;
1753         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1754         if (err)
1755                 goto out_free;
1756
1757         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1758
1759 restart:
1760         if (!other) {
1761                 err = -ECONNRESET;
1762                 if (sunaddr == NULL)
1763                         goto out_free;
1764
1765                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1766                                         hash, &err);
1767                 if (other == NULL)
1768                         goto out_free;
1769         }
1770
1771         if (sk_filter(other, skb) < 0) {
1772                 /* Toss the packet but do not return any error to the sender */
1773                 err = len;
1774                 goto out_free;
1775         }
1776
1777         sk_locked = 0;
1778         unix_state_lock(other);
1779 restart_locked:
1780         err = -EPERM;
1781         if (!unix_may_send(sk, other))
1782                 goto out_unlock;
1783
1784         if (unlikely(sock_flag(other, SOCK_DEAD))) {
1785                 /*
1786                  *      Check with 1003.1g - what should
1787                  *      datagram error
1788                  */
1789                 unix_state_unlock(other);
1790                 sock_put(other);
1791
1792                 if (!sk_locked)
1793                         unix_state_lock(sk);
1794
1795                 err = 0;
1796                 if (unix_peer(sk) == other) {
1797                         unix_peer(sk) = NULL;
1798                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1799
1800                         unix_state_unlock(sk);
1801
1802                         unix_dgram_disconnected(sk, other);
1803                         sock_put(other);
1804                         err = -ECONNREFUSED;
1805                 } else {
1806                         unix_state_unlock(sk);
1807                 }
1808
1809                 other = NULL;
1810                 if (err)
1811                         goto out_free;
1812                 goto restart;
1813         }
1814
1815         err = -EPIPE;
1816         if (other->sk_shutdown & RCV_SHUTDOWN)
1817                 goto out_unlock;
1818
1819         if (sk->sk_type != SOCK_SEQPACKET) {
1820                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1821                 if (err)
1822                         goto out_unlock;
1823         }
1824
1825         /* other == sk && unix_peer(other) != sk if
1826          * - unix_peer(sk) == NULL, destination address bound to sk
1827          * - unix_peer(sk) == sk by time of get but disconnected before lock
1828          */
1829         if (other != sk &&
1830             unlikely(unix_peer(other) != sk &&
1831             unix_recvq_full_lockless(other))) {
1832                 if (timeo) {
1833                         timeo = unix_wait_for_peer(other, timeo);
1834
1835                         err = sock_intr_errno(timeo);
1836                         if (signal_pending(current))
1837                                 goto out_free;
1838
1839                         goto restart;
1840                 }
1841
1842                 if (!sk_locked) {
1843                         unix_state_unlock(other);
1844                         unix_state_double_lock(sk, other);
1845                 }
1846
1847                 if (unix_peer(sk) != other ||
1848                     unix_dgram_peer_wake_me(sk, other)) {
1849                         err = -EAGAIN;
1850                         sk_locked = 1;
1851                         goto out_unlock;
1852                 }
1853
1854                 if (!sk_locked) {
1855                         sk_locked = 1;
1856                         goto restart_locked;
1857                 }
1858         }
1859
1860         if (unlikely(sk_locked))
1861                 unix_state_unlock(sk);
1862
1863         if (sock_flag(other, SOCK_RCVTSTAMP))
1864                 __net_timestamp(skb);
1865         maybe_add_creds(skb, sock, other);
1866         scm_stat_add(other, skb);
1867         skb_queue_tail(&other->sk_receive_queue, skb);
1868         unix_state_unlock(other);
1869         other->sk_data_ready(other);
1870         sock_put(other);
1871         scm_destroy(&scm);
1872         return len;
1873
1874 out_unlock:
1875         if (sk_locked)
1876                 unix_state_unlock(sk);
1877         unix_state_unlock(other);
1878 out_free:
1879         kfree_skb(skb);
1880 out:
1881         if (other)
1882                 sock_put(other);
1883         scm_destroy(&scm);
1884         return err;
1885 }
1886
1887 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1888  * bytes, and a minimum of a full page.
1889  */
1890 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1891
1892 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1893                                size_t len)
1894 {
1895         struct sock *sk = sock->sk;
1896         struct sock *other = NULL;
1897         int err, size;
1898         struct sk_buff *skb;
1899         int sent = 0;
1900         struct scm_cookie scm;
1901         bool fds_sent = false;
1902         int data_len;
1903
1904         wait_for_unix_gc();
1905         err = scm_send(sock, msg, &scm, false);
1906         if (err < 0)
1907                 return err;
1908
1909         err = -EOPNOTSUPP;
1910         if (msg->msg_flags&MSG_OOB)
1911                 goto out_err;
1912
1913         if (msg->msg_namelen) {
1914                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1915                 goto out_err;
1916         } else {
1917                 err = -ENOTCONN;
1918                 other = unix_peer(sk);
1919                 if (!other)
1920                         goto out_err;
1921         }
1922
1923         if (sk->sk_shutdown & SEND_SHUTDOWN)
1924                 goto pipe_err;
1925
1926         while (sent < len) {
1927                 size = len - sent;
1928
1929                 /* Keep two messages in the pipe so it schedules better */
1930                 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1931
1932                 /* allow fallback to order-0 allocations */
1933                 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1934
1935                 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1936
1937                 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1938
1939                 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1940                                            msg->msg_flags & MSG_DONTWAIT, &err,
1941                                            get_order(UNIX_SKB_FRAGS_SZ));
1942                 if (!skb)
1943                         goto out_err;
1944
1945                 /* Only send the fds in the first buffer */
1946                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
1947                 if (err < 0) {
1948                         kfree_skb(skb);
1949                         goto out_err;
1950                 }
1951                 fds_sent = true;
1952
1953                 skb_put(skb, size - data_len);
1954                 skb->data_len = data_len;
1955                 skb->len = size;
1956                 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1957                 if (err) {
1958                         kfree_skb(skb);
1959                         goto out_err;
1960                 }
1961
1962                 unix_state_lock(other);
1963
1964                 if (sock_flag(other, SOCK_DEAD) ||
1965                     (other->sk_shutdown & RCV_SHUTDOWN))
1966                         goto pipe_err_free;
1967
1968                 maybe_add_creds(skb, sock, other);
1969                 scm_stat_add(other, skb);
1970                 skb_queue_tail(&other->sk_receive_queue, skb);
1971                 unix_state_unlock(other);
1972                 other->sk_data_ready(other);
1973                 sent += size;
1974         }
1975
1976         scm_destroy(&scm);
1977
1978         return sent;
1979
1980 pipe_err_free:
1981         unix_state_unlock(other);
1982         kfree_skb(skb);
1983 pipe_err:
1984         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1985                 send_sig(SIGPIPE, current, 0);
1986         err = -EPIPE;
1987 out_err:
1988         scm_destroy(&scm);
1989         return sent ? : err;
1990 }
1991
1992 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1993                                     int offset, size_t size, int flags)
1994 {
1995         int err;
1996         bool send_sigpipe = false;
1997         bool init_scm = true;
1998         struct scm_cookie scm;
1999         struct sock *other, *sk = socket->sk;
2000         struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2001
2002         if (flags & MSG_OOB)
2003                 return -EOPNOTSUPP;
2004
2005         other = unix_peer(sk);
2006         if (!other || sk->sk_state != TCP_ESTABLISHED)
2007                 return -ENOTCONN;
2008
2009         if (false) {
2010 alloc_skb:
2011                 unix_state_unlock(other);
2012                 mutex_unlock(&unix_sk(other)->iolock);
2013                 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2014                                               &err, 0);
2015                 if (!newskb)
2016                         goto err;
2017         }
2018
2019         /* we must acquire iolock as we modify already present
2020          * skbs in the sk_receive_queue and mess with skb->len
2021          */
2022         err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2023         if (err) {
2024                 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2025                 goto err;
2026         }
2027
2028         if (sk->sk_shutdown & SEND_SHUTDOWN) {
2029                 err = -EPIPE;
2030                 send_sigpipe = true;
2031                 goto err_unlock;
2032         }
2033
2034         unix_state_lock(other);
2035
2036         if (sock_flag(other, SOCK_DEAD) ||
2037             other->sk_shutdown & RCV_SHUTDOWN) {
2038                 err = -EPIPE;
2039                 send_sigpipe = true;
2040                 goto err_state_unlock;
2041         }
2042
2043         if (init_scm) {
2044                 err = maybe_init_creds(&scm, socket, other);
2045                 if (err)
2046                         goto err_state_unlock;
2047                 init_scm = false;
2048         }
2049
2050         skb = skb_peek_tail(&other->sk_receive_queue);
2051         if (tail && tail == skb) {
2052                 skb = newskb;
2053         } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2054                 if (newskb) {
2055                         skb = newskb;
2056                 } else {
2057                         tail = skb;
2058                         goto alloc_skb;
2059                 }
2060         } else if (newskb) {
2061                 /* this is fast path, we don't necessarily need to
2062                  * call to kfree_skb even though with newskb == NULL
2063                  * this - does no harm
2064                  */
2065                 consume_skb(newskb);
2066                 newskb = NULL;
2067         }
2068
2069         if (skb_append_pagefrags(skb, page, offset, size)) {
2070                 tail = skb;
2071                 goto alloc_skb;
2072         }
2073
2074         skb->len += size;
2075         skb->data_len += size;
2076         skb->truesize += size;
2077         refcount_add(size, &sk->sk_wmem_alloc);
2078
2079         if (newskb) {
2080                 err = unix_scm_to_skb(&scm, skb, false);
2081                 if (err)
2082                         goto err_state_unlock;
2083                 spin_lock(&other->sk_receive_queue.lock);
2084                 __skb_queue_tail(&other->sk_receive_queue, newskb);
2085                 spin_unlock(&other->sk_receive_queue.lock);
2086         }
2087
2088         unix_state_unlock(other);
2089         mutex_unlock(&unix_sk(other)->iolock);
2090
2091         other->sk_data_ready(other);
2092         scm_destroy(&scm);
2093         return size;
2094
2095 err_state_unlock:
2096         unix_state_unlock(other);
2097 err_unlock:
2098         mutex_unlock(&unix_sk(other)->iolock);
2099 err:
2100         kfree_skb(newskb);
2101         if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2102                 send_sig(SIGPIPE, current, 0);
2103         if (!init_scm)
2104                 scm_destroy(&scm);
2105         return err;
2106 }
2107
2108 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2109                                   size_t len)
2110 {
2111         int err;
2112         struct sock *sk = sock->sk;
2113
2114         err = sock_error(sk);
2115         if (err)
2116                 return err;
2117
2118         if (sk->sk_state != TCP_ESTABLISHED)
2119                 return -ENOTCONN;
2120
2121         if (msg->msg_namelen)
2122                 msg->msg_namelen = 0;
2123
2124         return unix_dgram_sendmsg(sock, msg, len);
2125 }
2126
2127 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2128                                   size_t size, int flags)
2129 {
2130         struct sock *sk = sock->sk;
2131
2132         if (sk->sk_state != TCP_ESTABLISHED)
2133                 return -ENOTCONN;
2134
2135         return unix_dgram_recvmsg(sock, msg, size, flags);
2136 }
2137
2138 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2139 {
2140         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2141
2142         if (addr) {
2143                 msg->msg_namelen = addr->len;
2144                 memcpy(msg->msg_name, addr->name, addr->len);
2145         }
2146 }
2147
2148 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2149                               size_t size, int flags)
2150 {
2151         struct scm_cookie scm;
2152         struct sock *sk = sock->sk;
2153         struct unix_sock *u = unix_sk(sk);
2154         struct sk_buff *skb, *last;
2155         long timeo;
2156         int skip;
2157         int err;
2158
2159         err = -EOPNOTSUPP;
2160         if (flags&MSG_OOB)
2161                 goto out;
2162
2163         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2164
2165         do {
2166                 mutex_lock(&u->iolock);
2167
2168                 skip = sk_peek_offset(sk, flags);
2169                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2170                                               &skip, &err, &last);
2171                 if (skb) {
2172                         if (!(flags & MSG_PEEK))
2173                                 scm_stat_del(sk, skb);
2174                         break;
2175                 }
2176
2177                 mutex_unlock(&u->iolock);
2178
2179                 if (err != -EAGAIN)
2180                         break;
2181         } while (timeo &&
2182                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2183                                               &err, &timeo, last));
2184
2185         if (!skb) { /* implies iolock unlocked */
2186                 unix_state_lock(sk);
2187                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2188                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2189                     (sk->sk_shutdown & RCV_SHUTDOWN))
2190                         err = 0;
2191                 unix_state_unlock(sk);
2192                 goto out;
2193         }
2194
2195         if (wq_has_sleeper(&u->peer_wait))
2196                 wake_up_interruptible_sync_poll(&u->peer_wait,
2197                                                 EPOLLOUT | EPOLLWRNORM |
2198                                                 EPOLLWRBAND);
2199
2200         if (msg->msg_name)
2201                 unix_copy_addr(msg, skb->sk);
2202
2203         if (size > skb->len - skip)
2204                 size = skb->len - skip;
2205         else if (size < skb->len - skip)
2206                 msg->msg_flags |= MSG_TRUNC;
2207
2208         err = skb_copy_datagram_msg(skb, skip, msg, size);
2209         if (err)
2210                 goto out_free;
2211
2212         if (sock_flag(sk, SOCK_RCVTSTAMP))
2213                 __sock_recv_timestamp(msg, sk, skb);
2214
2215         memset(&scm, 0, sizeof(scm));
2216
2217         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2218         unix_set_secdata(&scm, skb);
2219
2220         if (!(flags & MSG_PEEK)) {
2221                 if (UNIXCB(skb).fp)
2222                         unix_detach_fds(&scm, skb);
2223
2224                 sk_peek_offset_bwd(sk, skb->len);
2225         } else {
2226                 /* It is questionable: on PEEK we could:
2227                    - do not return fds - good, but too simple 8)
2228                    - return fds, and do not return them on read (old strategy,
2229                      apparently wrong)
2230                    - clone fds (I chose it for now, it is the most universal
2231                      solution)
2232
2233                    POSIX 1003.1g does not actually define this clearly
2234                    at all. POSIX 1003.1g doesn't define a lot of things
2235                    clearly however!
2236
2237                 */
2238
2239                 sk_peek_offset_fwd(sk, size);
2240
2241                 if (UNIXCB(skb).fp)
2242                         unix_peek_fds(&scm, skb);
2243         }
2244         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2245
2246         scm_recv(sock, msg, &scm, flags);
2247
2248 out_free:
2249         skb_free_datagram(sk, skb);
2250         mutex_unlock(&u->iolock);
2251 out:
2252         return err;
2253 }
2254
2255 /*
2256  *      Sleep until more data has arrived. But check for races..
2257  */
2258 static long unix_stream_data_wait(struct sock *sk, long timeo,
2259                                   struct sk_buff *last, unsigned int last_len,
2260                                   bool freezable)
2261 {
2262         struct sk_buff *tail;
2263         DEFINE_WAIT(wait);
2264
2265         unix_state_lock(sk);
2266
2267         for (;;) {
2268                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2269
2270                 tail = skb_peek_tail(&sk->sk_receive_queue);
2271                 if (tail != last ||
2272                     (tail && tail->len != last_len) ||
2273                     sk->sk_err ||
2274                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2275                     signal_pending(current) ||
2276                     !timeo)
2277                         break;
2278
2279                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2280                 unix_state_unlock(sk);
2281                 if (freezable)
2282                         timeo = freezable_schedule_timeout(timeo);
2283                 else
2284                         timeo = schedule_timeout(timeo);
2285                 unix_state_lock(sk);
2286
2287                 if (sock_flag(sk, SOCK_DEAD))
2288                         break;
2289
2290                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2291         }
2292
2293         finish_wait(sk_sleep(sk), &wait);
2294         unix_state_unlock(sk);
2295         return timeo;
2296 }
2297
2298 static unsigned int unix_skb_len(const struct sk_buff *skb)
2299 {
2300         return skb->len - UNIXCB(skb).consumed;
2301 }
2302
2303 struct unix_stream_read_state {
2304         int (*recv_actor)(struct sk_buff *, int, int,
2305                           struct unix_stream_read_state *);
2306         struct socket *socket;
2307         struct msghdr *msg;
2308         struct pipe_inode_info *pipe;
2309         size_t size;
2310         int flags;
2311         unsigned int splice_flags;
2312 };
2313
2314 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2315                                     bool freezable)
2316 {
2317         struct scm_cookie scm;
2318         struct socket *sock = state->socket;
2319         struct sock *sk = sock->sk;
2320         struct unix_sock *u = unix_sk(sk);
2321         int copied = 0;
2322         int flags = state->flags;
2323         int noblock = flags & MSG_DONTWAIT;
2324         bool check_creds = false;
2325         int target;
2326         int err = 0;
2327         long timeo;
2328         int skip;
2329         size_t size = state->size;
2330         unsigned int last_len;
2331
2332         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2333                 err = -EINVAL;
2334                 goto out;
2335         }
2336
2337         if (unlikely(flags & MSG_OOB)) {
2338                 err = -EOPNOTSUPP;
2339                 goto out;
2340         }
2341
2342         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2343         timeo = sock_rcvtimeo(sk, noblock);
2344
2345         memset(&scm, 0, sizeof(scm));
2346
2347         /* Lock the socket to prevent queue disordering
2348          * while sleeps in memcpy_tomsg
2349          */
2350         mutex_lock(&u->iolock);
2351
2352         skip = max(sk_peek_offset(sk, flags), 0);
2353
2354         do {
2355                 int chunk;
2356                 bool drop_skb;
2357                 struct sk_buff *skb, *last;
2358
2359 redo:
2360                 unix_state_lock(sk);
2361                 if (sock_flag(sk, SOCK_DEAD)) {
2362                         err = -ECONNRESET;
2363                         goto unlock;
2364                 }
2365                 last = skb = skb_peek(&sk->sk_receive_queue);
2366                 last_len = last ? last->len : 0;
2367 again:
2368                 if (skb == NULL) {
2369                         if (copied >= target)
2370                                 goto unlock;
2371
2372                         /*
2373                          *      POSIX 1003.1g mandates this order.
2374                          */
2375
2376                         err = sock_error(sk);
2377                         if (err)
2378                                 goto unlock;
2379                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2380                                 goto unlock;
2381
2382                         unix_state_unlock(sk);
2383                         if (!timeo) {
2384                                 err = -EAGAIN;
2385                                 break;
2386                         }
2387
2388                         mutex_unlock(&u->iolock);
2389
2390                         timeo = unix_stream_data_wait(sk, timeo, last,
2391                                                       last_len, freezable);
2392
2393                         if (signal_pending(current)) {
2394                                 err = sock_intr_errno(timeo);
2395                                 scm_destroy(&scm);
2396                                 goto out;
2397                         }
2398
2399                         mutex_lock(&u->iolock);
2400                         goto redo;
2401 unlock:
2402                         unix_state_unlock(sk);
2403                         break;
2404                 }
2405
2406                 while (skip >= unix_skb_len(skb)) {
2407                         skip -= unix_skb_len(skb);
2408                         last = skb;
2409                         last_len = skb->len;
2410                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2411                         if (!skb)
2412                                 goto again;
2413                 }
2414
2415                 unix_state_unlock(sk);
2416
2417                 if (check_creds) {
2418                         /* Never glue messages from different writers */
2419                         if (!unix_skb_scm_eq(skb, &scm))
2420                                 break;
2421                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2422                         /* Copy credentials */
2423                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2424                         unix_set_secdata(&scm, skb);
2425                         check_creds = true;
2426                 }
2427
2428                 /* Copy address just once */
2429                 if (state->msg && state->msg->msg_name) {
2430                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2431                                          state->msg->msg_name);
2432                         unix_copy_addr(state->msg, skb->sk);
2433                         sunaddr = NULL;
2434                 }
2435
2436                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2437                 skb_get(skb);
2438                 chunk = state->recv_actor(skb, skip, chunk, state);
2439                 drop_skb = !unix_skb_len(skb);
2440                 /* skb is only safe to use if !drop_skb */
2441                 consume_skb(skb);
2442                 if (chunk < 0) {
2443                         if (copied == 0)
2444                                 copied = -EFAULT;
2445                         break;
2446                 }
2447                 copied += chunk;
2448                 size -= chunk;
2449
2450                 if (drop_skb) {
2451                         /* the skb was touched by a concurrent reader;
2452                          * we should not expect anything from this skb
2453                          * anymore and assume it invalid - we can be
2454                          * sure it was dropped from the socket queue
2455                          *
2456                          * let's report a short read
2457                          */
2458                         err = 0;
2459                         break;
2460                 }
2461
2462                 /* Mark read part of skb as used */
2463                 if (!(flags & MSG_PEEK)) {
2464                         UNIXCB(skb).consumed += chunk;
2465
2466                         sk_peek_offset_bwd(sk, chunk);
2467
2468                         if (UNIXCB(skb).fp) {
2469                                 scm_stat_del(sk, skb);
2470                                 unix_detach_fds(&scm, skb);
2471                         }
2472
2473                         if (unix_skb_len(skb))
2474                                 break;
2475
2476                         skb_unlink(skb, &sk->sk_receive_queue);
2477                         consume_skb(skb);
2478
2479                         if (scm.fp)
2480                                 break;
2481                 } else {
2482                         /* It is questionable, see note in unix_dgram_recvmsg.
2483                          */
2484                         if (UNIXCB(skb).fp)
2485                                 unix_peek_fds(&scm, skb);
2486
2487                         sk_peek_offset_fwd(sk, chunk);
2488
2489                         if (UNIXCB(skb).fp)
2490                                 break;
2491
2492                         skip = 0;
2493                         last = skb;
2494                         last_len = skb->len;
2495                         unix_state_lock(sk);
2496                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2497                         if (skb)
2498                                 goto again;
2499                         unix_state_unlock(sk);
2500                         break;
2501                 }
2502         } while (size);
2503
2504         mutex_unlock(&u->iolock);
2505         if (state->msg)
2506                 scm_recv(sock, state->msg, &scm, flags);
2507         else
2508                 scm_destroy(&scm);
2509 out:
2510         return copied ? : err;
2511 }
2512
2513 static int unix_stream_read_actor(struct sk_buff *skb,
2514                                   int skip, int chunk,
2515                                   struct unix_stream_read_state *state)
2516 {
2517         int ret;
2518
2519         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2520                                     state->msg, chunk);
2521         return ret ?: chunk;
2522 }
2523
2524 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2525                                size_t size, int flags)
2526 {
2527         struct unix_stream_read_state state = {
2528                 .recv_actor = unix_stream_read_actor,
2529                 .socket = sock,
2530                 .msg = msg,
2531                 .size = size,
2532                 .flags = flags
2533         };
2534
2535         return unix_stream_read_generic(&state, true);
2536 }
2537
2538 static int unix_stream_splice_actor(struct sk_buff *skb,
2539                                     int skip, int chunk,
2540                                     struct unix_stream_read_state *state)
2541 {
2542         return skb_splice_bits(skb, state->socket->sk,
2543                                UNIXCB(skb).consumed + skip,
2544                                state->pipe, chunk, state->splice_flags);
2545 }
2546
2547 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2548                                        struct pipe_inode_info *pipe,
2549                                        size_t size, unsigned int flags)
2550 {
2551         struct unix_stream_read_state state = {
2552                 .recv_actor = unix_stream_splice_actor,
2553                 .socket = sock,
2554                 .pipe = pipe,
2555                 .size = size,
2556                 .splice_flags = flags,
2557         };
2558
2559         if (unlikely(*ppos))
2560                 return -ESPIPE;
2561
2562         if (sock->file->f_flags & O_NONBLOCK ||
2563             flags & SPLICE_F_NONBLOCK)
2564                 state.flags = MSG_DONTWAIT;
2565
2566         return unix_stream_read_generic(&state, false);
2567 }
2568
2569 static int unix_shutdown(struct socket *sock, int mode)
2570 {
2571         struct sock *sk = sock->sk;
2572         struct sock *other;
2573
2574         if (mode < SHUT_RD || mode > SHUT_RDWR)
2575                 return -EINVAL;
2576         /* This maps:
2577          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2578          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2579          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2580          */
2581         ++mode;
2582
2583         unix_state_lock(sk);
2584         sk->sk_shutdown |= mode;
2585         other = unix_peer(sk);
2586         if (other)
2587                 sock_hold(other);
2588         unix_state_unlock(sk);
2589         sk->sk_state_change(sk);
2590
2591         if (other &&
2592                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2593
2594                 int peer_mode = 0;
2595
2596                 if (mode&RCV_SHUTDOWN)
2597                         peer_mode |= SEND_SHUTDOWN;
2598                 if (mode&SEND_SHUTDOWN)
2599                         peer_mode |= RCV_SHUTDOWN;
2600                 unix_state_lock(other);
2601                 other->sk_shutdown |= peer_mode;
2602                 unix_state_unlock(other);
2603                 other->sk_state_change(other);
2604                 if (peer_mode == SHUTDOWN_MASK)
2605                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2606                 else if (peer_mode & RCV_SHUTDOWN)
2607                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2608         }
2609         if (other)
2610                 sock_put(other);
2611
2612         return 0;
2613 }
2614
2615 long unix_inq_len(struct sock *sk)
2616 {
2617         struct sk_buff *skb;
2618         long amount = 0;
2619
2620         if (sk->sk_state == TCP_LISTEN)
2621                 return -EINVAL;
2622
2623         spin_lock(&sk->sk_receive_queue.lock);
2624         if (sk->sk_type == SOCK_STREAM ||
2625             sk->sk_type == SOCK_SEQPACKET) {
2626                 skb_queue_walk(&sk->sk_receive_queue, skb)
2627                         amount += unix_skb_len(skb);
2628         } else {
2629                 skb = skb_peek(&sk->sk_receive_queue);
2630                 if (skb)
2631                         amount = skb->len;
2632         }
2633         spin_unlock(&sk->sk_receive_queue.lock);
2634
2635         return amount;
2636 }
2637 EXPORT_SYMBOL_GPL(unix_inq_len);
2638
/* Outgoing queue size of @sk, as reported by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2644
2645 static int unix_open_file(struct sock *sk)
2646 {
2647         struct path path;
2648         struct file *f;
2649         int fd;
2650
2651         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2652                 return -EPERM;
2653
2654         if (!smp_load_acquire(&unix_sk(sk)->addr))
2655                 return -ENOENT;
2656
2657         path = unix_sk(sk)->path;
2658         if (!path.dentry)
2659                 return -ENOENT;
2660
2661         path_get(&path);
2662
2663         fd = get_unused_fd_flags(O_CLOEXEC);
2664         if (fd < 0)
2665                 goto out;
2666
2667         f = dentry_open(&path, O_PATH, current_cred());
2668         if (IS_ERR(f)) {
2669                 put_unused_fd(fd);
2670                 fd = PTR_ERR(f);
2671                 goto out;
2672         }
2673
2674         fd_install(fd, f);
2675 out:
2676         path_put(&path);
2677
2678         return fd;
2679 }
2680
2681 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2682 {
2683         struct sock *sk = sock->sk;
2684         long amount = 0;
2685         int err;
2686
2687         switch (cmd) {
2688         case SIOCOUTQ:
2689                 amount = unix_outq_len(sk);
2690                 err = put_user(amount, (int __user *)arg);
2691                 break;
2692         case SIOCINQ:
2693                 amount = unix_inq_len(sk);
2694                 if (amount < 0)
2695                         err = amount;
2696                 else
2697                         err = put_user(amount, (int __user *)arg);
2698                 break;
2699         case SIOCUNIXFILE:
2700                 err = unix_open_file(sk);
2701                 break;
2702         default:
2703                 err = -ENOIOCTLCMD;
2704                 break;
2705         }
2706         return err;
2707 }
2708
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl() handler: converts @arg with compat_ptr() and forwards
 * the call to unix_ioctl().
 */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	void __user *uptr = compat_ptr(arg);

	return unix_ioctl(sock, cmd, (unsigned long)uptr);
}
#endif
2715
2716 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2717 {
2718         struct sock *sk = sock->sk;
2719         __poll_t mask;
2720
2721         sock_poll_wait(file, sock, wait);
2722         mask = 0;
2723
2724         /* exceptional events? */
2725         if (sk->sk_err)
2726                 mask |= EPOLLERR;
2727         if (sk->sk_shutdown == SHUTDOWN_MASK)
2728                 mask |= EPOLLHUP;
2729         if (sk->sk_shutdown & RCV_SHUTDOWN)
2730                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2731
2732         /* readable? */
2733         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2734                 mask |= EPOLLIN | EPOLLRDNORM;
2735
2736         /* Connection-based need to check for termination and startup */
2737         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2738             sk->sk_state == TCP_CLOSE)
2739                 mask |= EPOLLHUP;
2740
2741         /*
2742          * we set writable also when the other side has shut down the
2743          * connection. This prevents stuck sockets.
2744          */
2745         if (unix_writable(sk))
2746                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2747
2748         return mask;
2749 }
2750
2751 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
2752                                     poll_table *wait)
2753 {
2754         struct sock *sk = sock->sk, *other;
2755         unsigned int writable;
2756         __poll_t mask;
2757
2758         sock_poll_wait(file, sock, wait);
2759         mask = 0;
2760
2761         /* exceptional events? */
2762         if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
2763                 mask |= EPOLLERR |
2764                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
2765
2766         if (sk->sk_shutdown & RCV_SHUTDOWN)
2767                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2768         if (sk->sk_shutdown == SHUTDOWN_MASK)
2769                 mask |= EPOLLHUP;
2770
2771         /* readable? */
2772         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2773                 mask |= EPOLLIN | EPOLLRDNORM;
2774
2775         /* Connection-based need to check for termination and startup */
2776         if (sk->sk_type == SOCK_SEQPACKET) {
2777                 if (sk->sk_state == TCP_CLOSE)
2778                         mask |= EPOLLHUP;
2779                 /* connection hasn't started yet? */
2780                 if (sk->sk_state == TCP_SYN_SENT)
2781                         return mask;
2782         }
2783
2784         /* No write status requested, avoid expensive OUT tests. */
2785         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
2786                 return mask;
2787
2788         writable = unix_writable(sk);
2789         if (writable) {
2790                 unix_state_lock(sk);
2791
2792                 other = unix_peer(sk);
2793                 if (other && unix_peer(other) != sk &&
2794                     unix_recvq_full_lockless(other) &&
2795                     unix_dgram_peer_wake_me(sk, other))
2796                         writable = 0;
2797
2798                 unix_state_unlock(sk);
2799         }
2800
2801         if (writable)
2802                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2803         else
2804                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2805
2806         return mask;
2807 }
2808
2809 #ifdef CONFIG_PROC_FS
2810
/*
 * A seq_file position packs two values into one long: the hash bucket in
 * the bits above BUCKET_SPACE and an offset in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Bucket part of position @x. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Offset part of position @x. */
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
/* Build a position from bucket @b and offset @o. */
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
2816
2817 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2818 {
2819         unsigned long offset = get_offset(*pos);
2820         unsigned long bucket = get_bucket(*pos);
2821         struct sock *sk;
2822         unsigned long count = 0;
2823
2824         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2825                 if (sock_net(sk) != seq_file_net(seq))
2826                         continue;
2827                 if (++count == offset)
2828                         break;
2829         }
2830
2831         return sk;
2832 }
2833
2834 static struct sock *unix_next_socket(struct seq_file *seq,
2835                                      struct sock *sk,
2836                                      loff_t *pos)
2837 {
2838         unsigned long bucket;
2839
2840         while (sk > (struct sock *)SEQ_START_TOKEN) {
2841                 sk = sk_next(sk);
2842                 if (!sk)
2843                         goto next_bucket;
2844                 if (sock_net(sk) == seq_file_net(seq))
2845                         return sk;
2846         }
2847
2848         do {
2849                 sk = unix_from_bucket(seq, pos);
2850                 if (sk)
2851                         return sk;
2852
2853 next_bucket:
2854                 bucket = get_bucket(*pos) + 1;
2855                 *pos = set_bucket_offset(bucket, 1);
2856         } while (bucket < ARRAY_SIZE(unix_socket_table));
2857
2858         return NULL;
2859 }
2860
2861 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2862         __acquires(unix_table_lock)
2863 {
2864         spin_lock(&unix_table_lock);
2865
2866         if (!*pos)
2867                 return SEQ_START_TOKEN;
2868
2869         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2870                 return NULL;
2871
2872         return unix_next_socket(seq, NULL, pos);
2873 }
2874
2875 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2876 {
2877         ++*pos;
2878         return unix_next_socket(seq, v, pos);
2879 }
2880
2881 static void unix_seq_stop(struct seq_file *seq, void *v)
2882         __releases(unix_table_lock)
2883 {
2884         spin_unlock(&unix_table_lock);
2885 }
2886
2887 static int unix_seq_show(struct seq_file *seq, void *v)
2888 {
2889
2890         if (v == SEQ_START_TOKEN)
2891                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2892                          "Inode Path\n");
2893         else {
2894                 struct sock *s = v;
2895                 struct unix_sock *u = unix_sk(s);
2896                 unix_state_lock(s);
2897
2898                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2899                         s,
2900                         refcount_read(&s->sk_refcnt),
2901                         0,
2902                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2903                         s->sk_type,
2904                         s->sk_socket ?
2905                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2906                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2907                         sock_i_ino(s));
2908
2909                 if (u->addr) {  // under unix_table_lock here
2910                         int i, len;
2911                         seq_putc(seq, ' ');
2912
2913                         i = 0;
2914                         len = u->addr->len - sizeof(short);
2915                         if (!UNIX_ABSTRACT(s))
2916                                 len--;
2917                         else {
2918                                 seq_putc(seq, '@');
2919                                 i++;
2920                         }
2921                         for ( ; i < len; i++)
2922                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
2923                                          '@');
2924                 }
2925                 unix_state_unlock(s);
2926                 seq_putc(seq, '\n');
2927         }
2928
2929         return 0;
2930 }
2931
2932 static const struct seq_operations unix_seq_ops = {
2933         .start  = unix_seq_start,
2934         .next   = unix_seq_next,
2935         .stop   = unix_seq_stop,
2936         .show   = unix_seq_show,
2937 };
2938 #endif
2939
2940 static const struct net_proto_family unix_family_ops = {
2941         .family = PF_UNIX,
2942         .create = unix_create,
2943         .owner  = THIS_MODULE,
2944 };
2945
2946
2947 static int __net_init unix_net_init(struct net *net)
2948 {
2949         int error = -ENOMEM;
2950
2951         net->unx.sysctl_max_dgram_qlen = 10;
2952         if (unix_sysctl_register(net))
2953                 goto out;
2954
2955 #ifdef CONFIG_PROC_FS
2956         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
2957                         sizeof(struct seq_net_private))) {
2958                 unix_sysctl_unregister(net);
2959                 goto out;
2960         }
2961 #endif
2962         error = 0;
2963 out:
2964         return error;
2965 }
2966
2967 static void __net_exit unix_net_exit(struct net *net)
2968 {
2969         unix_sysctl_unregister(net);
2970         remove_proc_entry("unix", net->proc_net);
2971 }
2972
2973 static struct pernet_operations unix_net_ops = {
2974         .init = unix_net_init,
2975         .exit = unix_net_exit,
2976 };
2977
2978 static int __init af_unix_init(void)
2979 {
2980         int rc = -1;
2981
2982         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
2983
2984         rc = proto_register(&unix_proto, 1);
2985         if (rc != 0) {
2986                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2987                 goto out;
2988         }
2989
2990         sock_register(&unix_family_ops);
2991         register_pernet_subsys(&unix_net_ops);
2992 out:
2993         return rc;
2994 }
2995
2996 static void __exit af_unix_exit(void)
2997 {
2998         sock_unregister(PF_UNIX);
2999         proto_unregister(&unix_proto);
3000         unregister_pernet_subsys(&unix_net_ops);
3001 }
3002
3003 /* Earlier than device_initcall() so that other drivers invoking
3004    request_module() don't end up in a loop when modprobe tries
3005    to use a UNIX socket. But later than subsys_initcall() because
3006    we depend on stuff initialised there */
3007 fs_initcall(af_unix_init);
3008 module_exit(af_unix_exit);
3009
3010 MODULE_LICENSE("GPL");
3011 MODULE_ALIAS_NETPROTO(PF_UNIX);