net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
  32  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  33  *                                      by above two patches.
  34  *           Andrea Arcangeli   :       If possible we block in connect(2)
  35  *                                      if the max backlog of the listen socket
  36  *                                      has been reached. This won't break
  37  *                                      old apps and it will avoid a huge number
  38  *                                      of socks hashed (this is for unix_gc()
  39  *                                      performance reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  84
  85 #include <linux/module.h>
  86 #include <linux/kernel.h>
  87 #include <linux/signal.h>
  88 #include <linux/sched/signal.h>
  89 #include <linux/errno.h>
  90 #include <linux/string.h>
  91 #include <linux/stat.h>
  92 #include <linux/dcache.h>
  93 #include <linux/namei.h>
  94 #include <linux/socket.h>
  95 #include <linux/un.h>
  96 #include <linux/fcntl.h>
  97 #include <linux/termios.h>
  98 #include <linux/sockios.h>
  99 #include <linux/net.h>
 100 #include <linux/in.h>
 101 #include <linux/fs.h>
 102 #include <linux/slab.h>
 103 #include <linux/uaccess.h>
 104 #include <linux/skbuff.h>
 105 #include <linux/netdevice.h>
 106 #include <net/net_namespace.h>
 107 #include <net/sock.h>
 108 #include <net/tcp_states.h>
 109 #include <net/af_unix.h>
 110 #include <linux/proc_fs.h>
 111 #include <linux/seq_file.h>
 112 #include <net/scm.h>
 113 #include <linux/init.h>
 114 #include <linux/poll.h>
 115 #include <linux/rtnetlink.h>
 116 #include <linux/mount.h>
 117 #include <net/checksum.h>
 118 #include <linux/security.h>
 119 #include <linux/freezer.h>
 120 #include <linux/file.h>
 121
 122 #include "scm.h"
 123
 124 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 125 EXPORT_SYMBOL_GPL(unix_socket_table);
 126 DEFINE_SPINLOCK(unix_table_lock);
 127 EXPORT_SYMBOL_GPL(unix_table_lock);
 128 static atomic_long_t unix_nr_socks;
 129
 130
 131 static struct hlist_head *unix_sockets_unbound(void *addr)
 132 {
 133         unsigned long hash = (unsigned long)addr;
 134
 135         hash ^= hash >> 16;
 136         hash ^= hash >> 8;
 137         hash %= UNIX_HASH_SIZE;
 138         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 139 }
 140
 /*
  * Nonzero when the address hash of @sk is below UNIX_HASH_SIZE.
  * The macro dereferences unix_sk(sk)->addr unconditionally, so it must only
  * be applied to a socket that has an address.
  * NOTE(review): presumably filesystem-bound sockets carry a hash that is
  * >= UNIX_HASH_SIZE; the code assigning it is not visible here - confirm.
  */
 #define UNIX_ABSTRACT(sk) \
         (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 142
 143 #ifdef CONFIG_SECURITY_NETWORK
 144 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 145 {
 146         UNIXCB(skb).secid = scm->secid;
 147 }
 148
 149 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 150 {
 151         scm->secid = UNIXCB(skb).secid;
 152 }
 153
 154 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 155 {
 156         return (scm->secid == UNIXCB(skb).secid);
 157 }
 158 #else
 159 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 160 { }
 161
 162 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 163 { }
 164
 165 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 166 {
 167         return true;
 168 }
 169 #endif /* CONFIG_SECURITY_NETWORK */
 170
 171 /*
 172  *  SMP locking strategy:
 173  *    hash table is protected with spinlock unix_table_lock
 174  *    each socket state is protected by separate spin lock.
 175  */
 176
 177 static inline unsigned int unix_hash_fold(__wsum n)
 178 {
 179         unsigned int hash = (__force unsigned int)csum_fold(n);
 180
 181         hash ^= hash>>8;
 182         return hash&(UNIX_HASH_SIZE-1);
 183 }
 184
 /*
  * Peer pointer of a unix socket.  Expands to an lvalue: this file both
  * reads it and assigns through it (e.g. "unix_peer(sk) = NULL").
  */
 #define unix_peer(sk)   (unix_sk(sk)->peer)
 186
 /* Return nonzero when the peer recorded on @osk is @sk. */
 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 {
         struct sock *peer_of_osk = unix_peer(osk);

         return peer_of_osk == sk;
 }
 191
 192 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 193 {
 194         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 195 }
 196
 197 static inline int unix_recvq_full(const struct sock *sk)
 198 {
 199         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 200 }
 201
 202 static inline int unix_recvq_full_lockless(const struct sock *sk)
 203 {
 204         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 205                 READ_ONCE(sk->sk_max_ack_backlog);
 206 }
 207
 208 struct sock *unix_peer_get(struct sock *s)
 209 {
 210         struct sock *peer;
 211
 212         unix_state_lock(s);
 213         peer = unix_peer(s);
 214         if (peer)
 215                 sock_hold(peer);
 216         unix_state_unlock(s);
 217         return peer;
 218 }
 219 EXPORT_SYMBOL_GPL(unix_peer_get);
 220
 221 static inline void unix_release_addr(struct unix_address *addr)
 222 {
 223         if (refcount_dec_and_test(&addr->refcnt))
 224                 kfree(addr);
 225 }
 226
 227 /*
 228  *      Check unix socket name:
 229  *              - should be not zero length.
 230  *              - if started by not zero, should be NULL terminated (FS object)
 231  *              - if started by zero, it is abstract name.
 232  */
 233
 234 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 235 {
 236         *hashp = 0;
 237
 238         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 239                 return -EINVAL;
 240         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 241                 return -EINVAL;
 242         if (sunaddr->sun_path[0]) {
 243                 /*
 244                  * This may look like an off by one error but it is a bit more
 245                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 246                  * sun_path[108] doesn't as such exist.  However in kernel space
 247                  * we are guaranteed that it is a valid memory location in our
 248                  * kernel address buffer.
 249                  */
 250                 ((char *)sunaddr)[len] = 0;
 251                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 252                 return len;
 253         }
 254
 255         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 256         return len;
 257 }
 258
 /*
  * Unlink @sk from its hash chain.  Takes no lock itself; the locked
  * wrapper unix_remove_socket() calls it with unix_table_lock held.
  */
 static void __unix_remove_socket(struct sock *sk)
 {
         (void)sk_del_node_init(sk);
 }
 263
 /*
  * Link @sk onto the hash chain @list.  Warns when @sk is still hashed
  * somewhere.  Takes no lock itself; the locked wrapper
  * unix_insert_socket() calls it with unix_table_lock held.
  */
 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 {
         WARN_ON(!sk_unhashed(sk));

         sk_add_node(sk, list);
 }
 269
 270 static inline void unix_remove_socket(struct sock *sk)
 271 {
 272         spin_lock(&unix_table_lock);
 273         __unix_remove_socket(sk);
 274         spin_unlock(&unix_table_lock);
 275 }
 276
 277 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 278 {
 279         spin_lock(&unix_table_lock);
 280         __unix_insert_socket(list, sk);
 281         spin_unlock(&unix_table_lock);
 282 }
 283
 284 static struct sock *__unix_find_socket_byname(struct net *net,
 285                                               struct sockaddr_un *sunname,
 286                                               int len, int type, unsigned int hash)
 287 {
 288         struct sock *s;
 289
 290         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 291                 struct unix_sock *u = unix_sk(s);
 292
 293                 if (!net_eq(sock_net(s), net))
 294                         continue;
 295
 296                 if (u->addr->len == len &&
 297                     !memcmp(u->addr->name, sunname, len))
 298                         goto found;
 299         }
 300         s = NULL;
 301 found:
 302         return s;
 303 }
 304
 305 static inline struct sock *unix_find_socket_byname(struct net *net,
 306                                                    struct sockaddr_un *sunname,
 307                                                    int len, int type,
 308                                                    unsigned int hash)
 309 {
 310         struct sock *s;
 311
 312         spin_lock(&unix_table_lock);
 313         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 314         if (s)
 315                 sock_hold(s);
 316         spin_unlock(&unix_table_lock);
 317         return s;
 318 }
 319
 320 static struct sock *unix_find_socket_byinode(struct inode *i)
 321 {
 322         struct sock *s;
 323
 324         spin_lock(&unix_table_lock);
 325         sk_for_each(s,
 326                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 327                 struct dentry *dentry = unix_sk(s)->path.dentry;
 328
 329                 if (dentry && d_backing_inode(dentry) == i) {
 330                         sock_hold(s);
 331                         goto found;
 332                 }
 333         }
 334         s = NULL;
 335 found:
 336         spin_unlock(&unix_table_lock);
 337         return s;
 338 }
 339
 340 /* Support code for asymmetrically connected dgram sockets
 341  *
 342  * If a datagram socket is connected to a socket not itself connected
 343  * to the first socket (eg, /dev/log), clients may only enqueue more
 344  * messages if the present receive queue of the server socket is not
 345  * "too large". This means there's a second writeability condition
 346  * poll and sendmsg need to test. The dgram recv code will do a wake
 347  * up on the peer_wait wait queue of a socket upon reception of a
 348  * datagram which needs to be propagated to sleeping would-be writers
 349  * since these might not have sent anything so far. This can't be
 350  * accomplished via poll_wait because the lifetime of the server
 351  * socket might be less than that of its clients if these break their
 352  * association with it or if the server socket is closed while clients
 353  * are still connected to it and there's no way to inform "a polling
 354  * implementation" that it should let go of a certain wait queue
 355  *
 356  * In order to propagate a wake up, a wait_queue_entry_t of the client
 357  * socket is enqueued on the peer_wait queue of the server socket
 358  * whose wake function does a wake_up on the ordinary client socket
 359  * wait queue. This connection is established whenever a write (or
 360  * poll for write) hit the flow control condition and broken when the
 361  * association to the server socket is dissolved or after a wake up
 362  * was relayed.
 363  */
 364
 365 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 366                                       void *key)
 367 {
 368         struct unix_sock *u;
 369         wait_queue_head_t *u_sleep;
 370
 371         u = container_of(q, struct unix_sock, peer_wake);
 372
 373         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 374                             q);
 375         u->peer_wake.private = NULL;
 376
 377         /* relaying can only happen while the wq still exists */
 378         u_sleep = sk_sleep(&u->sk);
 379         if (u_sleep)
 380                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 381
 382         return 0;
 383 }
 384
 385 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 386 {
 387         struct unix_sock *u, *u_other;
 388         int rc;
 389
 390         u = unix_sk(sk);
 391         u_other = unix_sk(other);
 392         rc = 0;
 393         spin_lock(&u_other->peer_wait.lock);
 394
 395         if (!u->peer_wake.private) {
 396                 u->peer_wake.private = other;
 397                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 398
 399                 rc = 1;
 400         }
 401
 402         spin_unlock(&u_other->peer_wait.lock);
 403         return rc;
 404 }
 405
 406 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 407                                             struct sock *other)
 408 {
 409         struct unix_sock *u, *u_other;
 410
 411         u = unix_sk(sk);
 412         u_other = unix_sk(other);
 413         spin_lock(&u_other->peer_wait.lock);
 414
 415         if (u->peer_wake.private == other) {
 416                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 417                 u->peer_wake.private = NULL;
 418         }
 419
 420         spin_unlock(&u_other->peer_wait.lock);
 421 }
 422
 423 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 424                                                    struct sock *other)
 425 {
 426         unix_dgram_peer_wake_disconnect(sk, other);
 427         wake_up_interruptible_poll(sk_sleep(sk),
 428                                    EPOLLOUT |
 429                                    EPOLLWRNORM |
 430                                    EPOLLWRBAND);
 431 }
 432
 433 /* preconditions:
 434  *      - unix_peer(sk) == other
 435  *      - association is stable
 436  */
 437 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 438 {
 439         int connected;
 440
 441         connected = unix_dgram_peer_wake_connect(sk, other);
 442
 443         /* If other is SOCK_DEAD, we want to make sure we signal
 444          * POLLOUT, such that a subsequent write() can get a
 445          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 446          * to other and its full, we will hang waiting for POLLOUT.
 447          */
 448         if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
 449                 return 1;
 450
 451         if (connected)
 452                 unix_dgram_peer_wake_disconnect(sk, other);
 453
 454         return 0;
 455 }
 456
 457 static int unix_writable(const struct sock *sk)
 458 {
 459         return sk->sk_state != TCP_LISTEN &&
 460                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 461 }
 462
 463 static void unix_write_space(struct sock *sk)
 464 {
 465         struct socket_wq *wq;
 466
 467         rcu_read_lock();
 468         if (unix_writable(sk)) {
 469                 wq = rcu_dereference(sk->sk_wq);
 470                 if (skwq_has_sleeper(wq))
 471                         wake_up_interruptible_sync_poll(&wq->wait,
 472                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 473                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 474         }
 475         rcu_read_unlock();
 476 }
 477
 478 /* When dgram socket disconnects (or changes its peer), we clear its receive
 479  * queue of packets arrived from previous peer. First, it allows to do
 480  * flow control based only on wmem_alloc; second, sk connected to peer
 481  * may receive messages only from that peer. */
 482 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 483 {
 484         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 485                 skb_queue_purge(&sk->sk_receive_queue);
 486                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 487
 488                 /* If one link of bidirectional dgram pipe is disconnected,
 489                  * we signal error. Messages are lost. Do not make this,
 490                  * when peer was not connected to us.
 491                  */
 492                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 493                         other->sk_err = ECONNRESET;
 494                         other->sk_error_report(other);
 495                 }
 496         }
 497 }
 498
 499 static void unix_sock_destructor(struct sock *sk)
 500 {
 501         struct unix_sock *u = unix_sk(sk);
 502
 503         skb_queue_purge(&sk->sk_receive_queue);
 504
 505         WARN_ON(refcount_read(&sk->sk_wmem_alloc));
 506         WARN_ON(!sk_unhashed(sk));
 507         WARN_ON(sk->sk_socket);
 508         if (!sock_flag(sk, SOCK_DEAD)) {
 509                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 510                 return;
 511         }
 512
 513         if (u->addr)
 514                 unix_release_addr(u->addr);
 515
 516         atomic_long_dec(&unix_nr_socks);
 517         local_bh_disable();
 518         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 519         local_bh_enable();
 520 #ifdef UNIX_REFCNT_DEBUG
 521         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 522                 atomic_long_read(&unix_nr_socks));
 523 #endif
 524 }
 525
 526 static void unix_release_sock(struct sock *sk, int embrion)
 527 {
 528         struct unix_sock *u = unix_sk(sk);
 529         struct path path;
 530         struct sock *skpair;
 531         struct sk_buff *skb;
 532         int state;
 533
 534         unix_remove_socket(sk);
 535
 536         /* Clear state */
 537         unix_state_lock(sk);
 538         sock_orphan(sk);
 539         sk->sk_shutdown = SHUTDOWN_MASK;
 540         path         = u->path;
 541         u->path.dentry = NULL;
 542         u->path.mnt = NULL;
 543         state = sk->sk_state;
 544         sk->sk_state = TCP_CLOSE;
 545
 546         skpair = unix_peer(sk);
 547         unix_peer(sk) = NULL;
 548
 549         unix_state_unlock(sk);
 550
 551         wake_up_interruptible_all(&u->peer_wait);
 552
 553         if (skpair != NULL) {
 554                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 555                         unix_state_lock(skpair);
 556                         /* No more writes */
 557                         skpair->sk_shutdown = SHUTDOWN_MASK;
 558                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 559                                 skpair->sk_err = ECONNRESET;
 560                         unix_state_unlock(skpair);
 561                         skpair->sk_state_change(skpair);
 562                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 563                 }
 564
 565                 unix_dgram_peer_wake_disconnect(sk, skpair);
 566                 sock_put(skpair); /* It may now die */
 567         }
 568
 569         /* Try to flush out this socket. Throw out buffers at least */
 570
 571         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 572                 if (state == TCP_LISTEN)
 573                         unix_release_sock(skb->sk, 1);
 574                 /* passed fds are erased in the kfree_skb hook        */
 575                 UNIXCB(skb).consumed = skb->len;
 576                 kfree_skb(skb);
 577         }
 578
 579         if (path.dentry)
 580                 path_put(&path);
 581
 582         sock_put(sk);
 583
 584         /* ---- Socket is dead now and most probably destroyed ---- */
 585
 586         /*
 587          * Fixme: BSD difference: In BSD all sockets connected to us get
 588          *        ECONNRESET and we die on the spot. In Linux we behave
 589          *        like files and pipes do and wait for the last
 590          *        dereference.
 591          *
 592          * Can't we simply set sock->err?
 593          *
 594          *        What the above comment does talk about? --ANK(980817)
 595          */
 596
 597         if (unix_tot_inflight)
 598                 unix_gc();              /* Garbage collect fds */
 599 }
 600
 601 static void init_peercred(struct sock *sk)
 602 {
 603         const struct cred *old_cred;
 604         struct pid *old_pid;
 605
 606         spin_lock(&sk->sk_peer_lock);
 607         old_pid = sk->sk_peer_pid;
 608         old_cred = sk->sk_peer_cred;
 609         sk->sk_peer_pid  = get_pid(task_tgid(current));
 610         sk->sk_peer_cred = get_current_cred();
 611         spin_unlock(&sk->sk_peer_lock);
 612
 613         put_pid(old_pid);
 614         put_cred(old_cred);
 615 }
 616
 617 static void copy_peercred(struct sock *sk, struct sock *peersk)
 618 {
 619         const struct cred *old_cred;
 620         struct pid *old_pid;
 621
 622         if (sk < peersk) {
 623                 spin_lock(&sk->sk_peer_lock);
 624                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 625         } else {
 626                 spin_lock(&peersk->sk_peer_lock);
 627                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 628         }
 629         old_pid = sk->sk_peer_pid;
 630         old_cred = sk->sk_peer_cred;
 631         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 632         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 633
 634         spin_unlock(&sk->sk_peer_lock);
 635         spin_unlock(&peersk->sk_peer_lock);
 636
 637         put_pid(old_pid);
 638         put_cred(old_cred);
 639 }
 640
 641 static int unix_listen(struct socket *sock, int backlog)
 642 {
 643         int err;
 644         struct sock *sk = sock->sk;
 645         struct unix_sock *u = unix_sk(sk);
 646         struct pid *old_pid = NULL;
 647
 648         err = -EOPNOTSUPP;
 649         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 650                 goto out;       /* Only stream/seqpacket sockets accept */
 651         err = -EINVAL;
 652         if (!u->addr)
 653                 goto out;       /* No listens on an unbound socket */
 654         unix_state_lock(sk);
 655         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 656                 goto out_unlock;
 657         if (backlog > sk->sk_max_ack_backlog)
 658                 wake_up_interruptible_all(&u->peer_wait);
 659         sk->sk_max_ack_backlog  = backlog;
 660         sk->sk_state            = TCP_LISTEN;
 661         /* set credentials so connect can copy them */
 662         init_peercred(sk);
 663         err = 0;
 664
 665 out_unlock:
 666         unix_state_unlock(sk);
 667         put_pid(old_pid);
 668 out:
 669         return err;
 670 }
 671
 672 static int unix_release(struct socket *);
 673 static int unix_bind(struct socket *, struct sockaddr *, int);
 674 static int unix_stream_connect(struct socket *, struct sockaddr *,
 675                                int addr_len, int flags);
 676 static int unix_socketpair(struct socket *, struct socket *);
 677 static int unix_accept(struct socket *, struct socket *, int, bool);
 678 static int unix_getname(struct socket *, struct sockaddr *, int);
 679 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
 680 static __poll_t unix_dgram_poll(struct file *, struct socket *,
 681                                     poll_table *);
 682 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 683 #ifdef CONFIG_COMPAT
 684 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 685 #endif
 686 static int unix_shutdown(struct socket *, int);
 687 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 688 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 689 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
 690                                     size_t size, int flags);
 691 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 692                                        struct pipe_inode_info *, size_t size,
 693                                        unsigned int flags);
 694 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 695 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 696 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 697                               int, int);
 698 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 699 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 700                                   int);
 701
 702 static int unix_set_peek_off(struct sock *sk, int val)
 703 {
 704         struct unix_sock *u = unix_sk(sk);
 705
 706         if (mutex_lock_interruptible(&u->iolock))
 707                 return -EINTR;
 708
 709         sk->sk_peek_off = val;
 710         mutex_unlock(&u->iolock);
 711
 712         return 0;
 713 }
 714
 715
 716 static const struct proto_ops unix_stream_ops = {
 717         .family =       PF_UNIX,
 718         .owner =        THIS_MODULE,
 719         .release =      unix_release,
 720         .bind =         unix_bind,
 721         .connect =      unix_stream_connect,
 722         .socketpair =   unix_socketpair,
 723         .accept =       unix_accept,
 724         .getname =      unix_getname,
 725         .poll =         unix_poll,
 726         .ioctl =        unix_ioctl,
 727 #ifdef CONFIG_COMPAT
 728         .compat_ioctl = unix_compat_ioctl,
 729 #endif
 730         .listen =       unix_listen,
 731         .shutdown =     unix_shutdown,
 732         .setsockopt =   sock_no_setsockopt,
 733         .getsockopt =   sock_no_getsockopt,
 734         .sendmsg =      unix_stream_sendmsg,
 735         .recvmsg =      unix_stream_recvmsg,
 736         .mmap =         sock_no_mmap,
 737         .sendpage =     unix_stream_sendpage,
 738         .splice_read =  unix_stream_splice_read,
 739         .set_peek_off = unix_set_peek_off,
 740 };
 741
 742 static const struct proto_ops unix_dgram_ops = {
 743         .family =       PF_UNIX,
 744         .owner =        THIS_MODULE,
 745         .release =      unix_release,
 746         .bind =         unix_bind,
 747         .connect =      unix_dgram_connect,
 748         .socketpair =   unix_socketpair,
 749         .accept =       sock_no_accept,
 750         .getname =      unix_getname,
 751         .poll =         unix_dgram_poll,
 752         .ioctl =        unix_ioctl,
 753 #ifdef CONFIG_COMPAT
 754         .compat_ioctl = unix_compat_ioctl,
 755 #endif
 756         .listen =       sock_no_listen,
 757         .shutdown =     unix_shutdown,
 758         .setsockopt =   sock_no_setsockopt,
 759         .getsockopt =   sock_no_getsockopt,
 760         .sendmsg =      unix_dgram_sendmsg,
 761         .recvmsg =      unix_dgram_recvmsg,
 762         .mmap =         sock_no_mmap,
 763         .sendpage =     sock_no_sendpage,
 764         .set_peek_off = unix_set_peek_off,
 765 };
 766
 767 static const struct proto_ops unix_seqpacket_ops = {
 768         .family =       PF_UNIX,
 769         .owner =        THIS_MODULE,
 770         .release =      unix_release,
 771         .bind =         unix_bind,
 772         .connect =      unix_stream_connect,
 773         .socketpair =   unix_socketpair,
 774         .accept =       unix_accept,
 775         .getname =      unix_getname,
 776         .poll =         unix_dgram_poll,
 777         .ioctl =        unix_ioctl,
 778 #ifdef CONFIG_COMPAT
 779         .compat_ioctl = unix_compat_ioctl,
 780 #endif
 781         .listen =       unix_listen,
 782         .shutdown =     unix_shutdown,
 783         .setsockopt =   sock_no_setsockopt,
 784         .getsockopt =   sock_no_getsockopt,
 785         .sendmsg =      unix_seqpacket_sendmsg,
 786         .recvmsg =      unix_seqpacket_recvmsg,
 787         .mmap =         sock_no_mmap,
 788         .sendpage =     sock_no_sendpage,
 789         .set_peek_off = unix_set_peek_off,
 790 };
 791
 792 static struct proto unix_proto = {
 793         .name                   = "UNIX",
 794         .owner                  = THIS_MODULE,
 795         .obj_size               = sizeof(struct unix_sock),
 796 };
 797
 798 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 799 {
             /*
              * Allocate and initialise a new AF_UNIX sock in @net and hand it
              * to sock_init_data() together with @sock.  unix_stream_connect()
              * calls this with a NULL @sock.  @kern is forwarded to sk_alloc().
              *
              * Returns the new sock, or NULL when the global socket count
              * exceeds 2 * get_max_files() or sk_alloc() fails.
              */
 800         struct sock *sk = NULL;
 801         struct unix_sock *u;
 802
             /* Count the socket first; the "out" path undoes this on failure. */
 803         atomic_long_inc(&unix_nr_socks);
 804         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 805                 goto out;
 806
 807         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
 808         if (!sk)
 809                 goto out;
 810
 811         sock_init_data(sock, sk);
 812
 813         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 814         sk->sk_write_space      = unix_write_space;
             /* The accept/receive backlog limit comes from the per-netns sysctl. */
 815         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 816         sk->sk_destruct         = unix_sock_destructor;
 817         u         = unix_sk(sk);
             /* A fresh socket has no filesystem path and nothing in flight. */
 818         u->path.dentry = NULL;
 819         u->path.mnt = NULL;
 820         spin_lock_init(&u->lock);
 821         atomic_long_set(&u->inflight, 0);
 822         INIT_LIST_HEAD(&u->link);
 823         mutex_init(&u->iolock); /* single task reading lock */
 824         mutex_init(&u->bindlock); /* single task binding lock */
 825         init_waitqueue_head(&u->peer_wait);
 826         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
             /* Until it is bound, the socket lives on the list chosen by unix_sockets_unbound(). */
 827         unix_insert_socket(unix_sockets_unbound(sk), sk);
 828 out:
 829         if (sk == NULL)
 830                 atomic_long_dec(&unix_nr_socks);
 831         else {
                     /* Success: bump the per-protocol in-use counter with BHs disabled. */
 832                 local_bh_disable();
 833                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 834                 local_bh_enable();
 835         }
 836         return sk;
 837 }
 838
 839 static int unix_create(struct net *net, struct socket *sock, int protocol,
 840                        int kern)
 841 {
             /*
              * Set up @sock as an AF_UNIX socket: pick the proto_ops table that
              * matches sock->type and create the underlying sock.
              *
              * Returns 0 on success, -EPROTONOSUPPORT if @protocol is neither 0
              * nor PF_UNIX, -ESOCKTNOSUPPORT for an unsupported socket type, and
              * -ENOMEM for any unix_create1() failure (which includes hitting
              * the global socket-count limit, since unix_create1() only
              * reports NULL).
              */
 842         if (protocol && protocol != PF_UNIX)
 843                 return -EPROTONOSUPPORT;
 844
 845         sock->state = SS_UNCONNECTED;
 846
 847         switch (sock->type) {
 848         case SOCK_STREAM:
 849                 sock->ops = &unix_stream_ops;
 850                 break;
 851                 /*
 852                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 853                  *      nothing uses it.
 854                  */
 855         case SOCK_RAW:
                     /* SOCK_RAW is silently rewritten to SOCK_DGRAM. */
 856                 sock->type = SOCK_DGRAM;
 857                 /* fall through */
 858         case SOCK_DGRAM:
 859                 sock->ops = &unix_dgram_ops;
 860                 break;
 861         case SOCK_SEQPACKET:
 862                 sock->ops = &unix_seqpacket_ops;
 863                 break;
 864         default:
 865                 return -ESOCKTNOSUPPORT;
 866         }
 867
 868         return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
 869 }
 870
 871 static int unix_release(struct socket *sock)
 872 {
             /*
              * proto_ops .release handler: tear down the sock behind @sock via
              * unix_release_sock() and clear sock->sk.  A socket with no sock
              * attached is a no-op.  Always returns 0.
              */
 873         struct sock *sk = sock->sk;
 874
 875         if (!sk)
 876                 return 0;
 877
 878         unix_release_sock(sk, 0);
 879         sock->sk = NULL;
 880
 881         return 0;
 882 }
 883
 884 static int unix_autobind(struct socket *sock)
 885 {
             /*
              * Bind @sock to an automatically generated name.  The name has a
              * zero first byte in sun_path (left over from kzalloc()) followed
              * by five hex digits taken from the file-wide counter "ordernum".
              *
              * Returns 0 on success or if the socket already has an address,
              * the error from mutex_lock_interruptible() if taking bindlock
              * fails, -ENOMEM if the address cannot be allocated, and -ENOSPC
              * once the retry counter reaches 0xFFFFF.
              */
 886         struct sock *sk = sock->sk;
 887         struct net *net = sock_net(sk);
 888         struct unix_sock *u = unix_sk(sk);
 889         static u32 ordernum = 1;
 890         struct unix_address *addr;
 891         int err;
 892         unsigned int retries = 0;
 893
 894         err = mutex_lock_interruptible(&u->bindlock);
 895         if (err)
 896                 return err;
 897
 898         err = 0;
 899         if (u->addr)
 900                 goto out;
 901
 902         err = -ENOMEM;
             /* Room for the header, the address family and 16 bytes of path. */
 903         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 904         if (!addr)
 905                 goto out;
 906
 907         addr->name->sun_family = AF_UNIX;
 908         refcount_set(&addr->refcnt, 1);
 909
 910 retry:
             /*
              * Length = hex digits + the leading zero byte + the family field.
              * NOTE(review): ordernum is read here without unix_table_lock but
              * is updated under it below; uniqueness is enforced by the lookup
              * done under the lock, not by this read.
              */
 911         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 912         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 913
 914         spin_lock(&unix_table_lock);
             /* Advance the counter for the next attempt; it wraps at 20 bits. */
 915         ordernum = (ordernum+1)&0xFFFFF;
 916
 917         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 918                                       addr->hash)) {
 919                 spin_unlock(&unix_table_lock);
 920                 /*
 921                  * __unix_find_socket_byname() may take long time if many names
 922                  * are already in use.
 923                  */
 924                 cond_resched();
 925                 /* Give up if all names seems to be in use. */
 926                 if (retries++ == 0xFFFFF) {
 927                         err = -ENOSPC;
 928                         kfree(addr);
 929                         goto out;
 930                 }
 931                 goto retry;
 932         }
             /* Name is free: fold the socket type into the hash, as unix_bind() does. */
 933         addr->hash ^= sk->sk_type;
 934
             /*
              * Move the socket from its current list to the bucket for the new
              * address; u->addr is published with release semantics in between.
              */
 935         __unix_remove_socket(sk);
 936         smp_store_release(&u->addr, addr);
 937         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 938         spin_unlock(&unix_table_lock);
 939         err = 0;
 940
 941 out:    mutex_unlock(&u->bindlock);
 942         return err;
 943 }
 944
 945 static struct sock *unix_find_other(struct net *net,
 946                                     struct sockaddr_un *sunname, int len,
 947                                     int type, unsigned int hash, int *error)
 948 {
             /*
              * Look up the socket bound to @sunname.  A non-zero first byte in
              * sun_path selects a filesystem lookup; otherwise the name is
              * looked up by (@net, name, @len, @type, @hash).
              *
              * Returns the socket on success; *error is left untouched in that
              * case.  The callers in this file drop the result with sock_put().
              * On failure returns NULL and stores a negative errno in *error:
              * the kern_path()/inode_permission() error, -ECONNREFUSED if no
              * socket is bound there, or -EPROTOTYPE on a socket-type mismatch.
              */
 949         struct sock *u;
 950         struct path path;
 951         int err = 0;
 952
 953         if (sunname->sun_path[0]) {
 954                 struct inode *inode;
 955                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 956                 if (err)
 957                         goto fail;
 958                 inode = d_backing_inode(path.dentry);
                     /* The caller needs write permission on the socket inode. */
 959                 err = inode_permission(inode, MAY_WRITE);
 960                 if (err)
 961                         goto put_fail;
 962
 963                 err = -ECONNREFUSED;
 964                 if (!S_ISSOCK(inode->i_mode))
 965                         goto put_fail;
 966                 u = unix_find_socket_byinode(inode);
 967                 if (!u)
 968                         goto put_fail;
 969
                     /* Only update the access time when the lookup will succeed. */
 970                 if (u->sk_type == type)
 971                         touch_atime(&path);
 972
 973                 path_put(&path);
 974
 975                 err = -EPROTOTYPE;
 976                 if (u->sk_type != type) {
 977                         sock_put(u);
 978                         goto fail;
 979                 }
 980         } else {
 981                 err = -ECONNREFUSED;
 982                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 983                 if (u) {
 984                         struct dentry *dentry;
                             /* If the found socket has a path recorded, touch its atime too. */
 985                         dentry = unix_sk(u)->path.dentry;
 986                         if (dentry)
 987                                 touch_atime(&unix_sk(u)->path);
 988                 } else
 989                         goto fail;
 990         }
 991         return u;
 992
 993 put_fail:
 994         path_put(&path);
 995 fail:
 996         *error = err;
 997         return NULL;
 998 }
 999
1000 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
1001 {
             /*
              * Create a filesystem node named @sun_path with mode @mode.  On
              * success @res receives the mount and dentry of the new node, each
              * with its own reference (mntget()/dget()).
              *
              * Returns 0 on success, or the negative errno from
              * kern_path_create(), security_path_mknod() or vfs_mknod().
              * @res is not written on failure.
              */
1002         struct dentry *dentry;
1003         struct path path;
1004         int err = 0;
1005         /*
1006          * Get the parent directory, calculate the hash for last
1007          * component.
1008          */
1009         dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
1010         err = PTR_ERR(dentry);
1011         if (IS_ERR(dentry))
1012                 return err;
1013
1014         /*
1015          * All right, let's create it.
1016          */
1017         err = security_path_mknod(&path, dentry, mode, 0);
1018         if (!err) {
1019                 err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
1020                 if (!err) {
1021                         res->mnt = mntget(path.mnt);
1022                         res->dentry = dget(dentry);
1023                 }
1024         }
             /* Runs on success and failure alike to finish the create operation. */
1025         done_path_create(&path, dentry);
1026         return err;
1027 }
1028
1029 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1030 {
             /*
              * proto_ops .bind handler.
              *
              * - addr_len == sizeof(short) (family only): delegate to
              *   unix_autobind().
              * - sun_path[0] != 0: create a filesystem node via unix_mknod() and
              *   hash the socket by the node's inode number.
              * - sun_path[0] == 0: the name must not already be present in the
              *   hash table; the socket is hashed by name hash ^ socket type.
              *
              * Returns 0 on success.  Errors: -EINVAL (address too short, wrong
              * family, or socket already bound), the unix_mkname() error,
              * -EADDRINUSE (name taken, or unix_mknod() reported -EEXIST), other
              * unix_mknod() errors, the mutex_lock_interruptible() error, and
              * -ENOMEM.
              */
1031         struct sock *sk = sock->sk;
1032         struct net *net = sock_net(sk);
1033         struct unix_sock *u = unix_sk(sk);
1034         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1035         char *sun_path = sunaddr->sun_path;
1036         int err;
1037         unsigned int hash;
1038         struct unix_address *addr;
1039         struct hlist_head *list;
1040         struct path path = { };
1041
1042         err = -EINVAL;
1043         if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
1044             sunaddr->sun_family != AF_UNIX)
1045                 goto out;
1046
1047         if (addr_len == sizeof(short)) {
1048                 err = unix_autobind(sock);
1049                 goto out;
1050         }
1051
             /* On success unix_mkname() returns the address length to use from here on. */
1052         err = unix_mkname(sunaddr, addr_len, &hash);
1053         if (err < 0)
1054                 goto out;
1055         addr_len = err;
1056
             /*
              * NOTE(review): the filesystem node is created before the
              * "already bound" check below.  On the later failure paths only
              * path_put() is done here, so the node appears to stay on disk;
              * confirm whether that is intended.
              */
1057         if (sun_path[0]) {
1058                 umode_t mode = S_IFSOCK |
1059                        (SOCK_INODE(sock)->i_mode & ~current_umask());
1060                 err = unix_mknod(sun_path, mode, &path);
1061                 if (err) {
1062                         if (err == -EEXIST)
1063                                 err = -EADDRINUSE;
1064                         goto out;
1065                 }
1066         }
1067
1068         err = mutex_lock_interruptible(&u->bindlock);
1069         if (err)
1070                 goto out_put;
1071
             /* A socket can only be bound once. */
1072         err = -EINVAL;
1073         if (u->addr)
1074                 goto out_up;
1075
1076         err = -ENOMEM;
1077         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1078         if (!addr)
1079                 goto out_up;
1080
1081         memcpy(addr->name, sunaddr, addr_len);
1082         addr->len = addr_len;
1083         addr->hash = hash ^ sk->sk_type;
1084         refcount_set(&addr->refcnt, 1);
1085
1086         if (sun_path[0]) {
                     /*
                      * Filesystem socket: addr->hash is set to UNIX_HASH_SIZE and
                      * the bucket is chosen from the inode number instead.
                      */
1087                 addr->hash = UNIX_HASH_SIZE;
1088                 hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1089                 spin_lock(&unix_table_lock);
1090                 u->path = path;
1091                 list = &unix_socket_table[hash];
1092         } else {
1093                 spin_lock(&unix_table_lock);
1094                 err = -EADDRINUSE;
1095                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
1096                                               sk->sk_type, hash)) {
1097                         unix_release_addr(addr);
1098                         goto out_unlock;
1099                 }
1100
1101                 list = &unix_socket_table[addr->hash];
1102         }
1103
             /*
              * Move the socket to its new bucket; u->addr is published with
              * release semantics between the remove and the insert.
              */
1104         err = 0;
1105         __unix_remove_socket(sk);
1106         smp_store_release(&u->addr, addr);
1107         __unix_insert_socket(list, sk);
1108
1109 out_unlock:
1110         spin_unlock(&unix_table_lock);
1111 out_up:
1112         mutex_unlock(&u->bindlock);
1113 out_put:
             /* On success the path references now belong to u->path. */
1114         if (err)
1115                 path_put(&path);
1116 out:
1117         return err;
1118 }
1119
1120 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1121 {
             /*
              * Take the state locks of @sk1 and @sk2.  When @sk2 is NULL or the
              * same socket as @sk1, only @sk1 is locked.  Otherwise the socket
              * at the lower address is locked first, so every caller locking
              * the same pair uses the same order.
              */
1122         if (unlikely(sk1 == sk2) || !sk2) {
1123                 unix_state_lock(sk1);
1124                 return;
1125         }
1126         if (sk1 < sk2) {
1127                 unix_state_lock(sk1);
1128                 unix_state_lock_nested(sk2);
1129         } else {
1130                 unix_state_lock(sk2);
1131                 unix_state_lock_nested(sk1);
1132         }
1133 }
1134
1135 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1136 {
             /*
              * Counterpart of unix_state_double_lock(): release only @sk1 when
              * @sk2 is NULL or the same socket, otherwise release both.
              */
1137         if (unlikely(sk1 == sk2) || !sk2) {
1138                 unix_state_unlock(sk1);
1139                 return;
1140         }
1141         unix_state_unlock(sk1);
1142         unix_state_unlock(sk2);
1143 }
1144
1145 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1146                               int alen, int flags)
1147 {
             /*
              * proto_ops .connect handler for datagram sockets: set, replace or
              * clear the default peer of @sock.
              *
              * An address family of AF_UNSPEC clears the peer.  Any other
              * family goes through unix_mkname() and unix_find_other(), and the
              * result must pass unix_may_send() and security_unix_may_send().
              * @flags is not used.
              *
              * Returns 0 on success, -EINVAL if @alen does not cover sa_family,
              * -EPERM if unix_may_send() refuses, or the error from
              * unix_mkname(), unix_autobind(), unix_find_other() or the
              * security hook.
              */
1148         struct sock *sk = sock->sk;
1149         struct net *net = sock_net(sk);
1150         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1151         struct sock *other;
1152         unsigned int hash;
1153         int err;
1154
1155         err = -EINVAL;
1156         if (alen < offsetofend(struct sockaddr, sa_family))
1157                 goto out;
1158
1159         if (addr->sa_family != AF_UNSPEC) {
1160                 err = unix_mkname(sunaddr, alen, &hash);
1161                 if (err < 0)
1162                         goto out;
1163                 alen = err;
1164
                     /* With SOCK_PASSCRED set, an unbound socket is autobound first. */
1165                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1166                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1167                         goto out;
1168
1169 restart:
1170                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1171                 if (!other)
1172                         goto out;
1173
1174                 unix_state_double_lock(sk, other);
1175
1176                 /* Apparently VFS overslept socket death. Retry. */
1177                 if (sock_flag(other, SOCK_DEAD)) {
1178                         unix_state_double_unlock(sk, other);
1179                         sock_put(other);
1180                         goto restart;
1181                 }
1182
1183                 err = -EPERM;
1184                 if (!unix_may_send(sk, other))
1185                         goto out_unlock;
1186
1187                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1188                 if (err)
1189                         goto out_unlock;
1190
1191         } else {
1192                 /*
1193                  *      1003.1g breaking connected state with AF_UNSPEC
1194                  */
1195                 other = NULL;
                     /* With a NULL second socket only sk's state lock is taken. */
1196                 unix_state_double_lock(sk, other);
1197         }
1198
1199         /*
1200          * If it was connected, reconnect.
1201          */
1202         if (unix_peer(sk)) {
1203                 struct sock *old_peer = unix_peer(sk);
1204                 unix_peer(sk) = other;
1205                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1206
1207                 unix_state_double_unlock(sk, other);
1208
                     /*
                      * Notify about the disconnect only if the peer really changed,
                      * then drop the reference that was held on the old peer.
                      */
1209                 if (other != old_peer)
1210                         unix_dgram_disconnected(sk, old_peer);
1211                 sock_put(old_peer);
1212         } else {
1213                 unix_peer(sk) = other;
1214                 unix_state_double_unlock(sk, other);
1215         }
             /*
              * On success "other" is not released with sock_put(); it stays
              * referenced as unix_peer(sk).
              */
1216         return 0;
1217
1218 out_unlock:
1219         unix_state_double_unlock(sk, other);
1220         sock_put(other);
1221 out:
1222         return err;
1223 }
1224
1225 static long unix_wait_for_peer(struct sock *other, long timeo)
1226 {
             /*
              * Wait on @other's peer_wait queue for up to @timeo.  The function
              * releases @other's state lock itself; its caller in this file
              * takes that lock before calling.
              *
              * The task only sleeps if @other is not dead, has not shut down
              * receiving, and its receive queue is full.  Returns the value
              * from schedule_timeout(), or @timeo unchanged if no sleep was
              * needed.
              */
1227         struct unix_sock *u = unix_sk(other);
1228         int sched;
1229         DEFINE_WAIT(wait);
1230
             /* Queue ourselves before sampling the condition and dropping the lock. */
1231         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1232
1233         sched = !sock_flag(other, SOCK_DEAD) &&
1234                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1235                 unix_recvq_full(other);
1236
1237         unix_state_unlock(other);
1238
1239         if (sched)
1240                 timeo = schedule_timeout(timeo);
1241
1242         finish_wait(&u->peer_wait, &wait);
1243         return timeo;
1244 }
1245
1246 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1247                                int addr_len, int flags)
1248 {
             /*
              * proto_ops .connect handler for stream and seqpacket sockets.
              *
              * A new sock (newsk) and a one-byte skb are allocated up front.
              * Once the listener is found and both state locks are held, newsk
              * is made the established peer of sk and the skb is queued on the
              * listener's receive queue, where unix_accept() dequeues it.
              *
              * Returns 0 on success.  Errors: the unix_mkname(),
              * unix_autobind() or unix_find_other() error, -ENOMEM,
              * -ECONNREFUSED (target not listening or receive-shutdown),
              * -EAGAIN (backlog full and no timeout left), the
              * sock_intr_errno() value when interrupted by a signal, -EISCONN,
              * -EINVAL (sk in an unexpected state), or the security hook error.
              */
1249         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1250         struct sock *sk = sock->sk;
1251         struct net *net = sock_net(sk);
1252         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1253         struct sock *newsk = NULL;
1254         struct sock *other = NULL;
1255         struct sk_buff *skb = NULL;
1256         unsigned int hash;
1257         int st;
1258         int err;
1259         long timeo;
1260
1261         err = unix_mkname(sunaddr, addr_len, &hash);
1262         if (err < 0)
1263                 goto out;
1264         addr_len = err;
1265
             /* With SOCK_PASSCRED set, an unbound socket is autobound first. */
1266         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1267             (err = unix_autobind(sock)) != 0)
1268                 goto out;
1269
1270         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1271
1272         /* First of all allocate resources.
1273            If we will make it after state is locked,
1274            we will have to recheck all again in any case.
1275          */
1276
1277         err = -ENOMEM;
1278
1279         /* create new sock for complete connection */
1280         newsk = unix_create1(sock_net(sk), NULL, 0);
1281         if (newsk == NULL)
1282                 goto out;
1283
1284         /* Allocate skb for sending to listening sock */
1285         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1286         if (skb == NULL)
1287                 goto out;
1288
1289 restart:
1290         /*  Find listening sock. */
1291         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1292         if (!other)
1293                 goto out;
1294
1295         /* Latch state of peer */
1296         unix_state_lock(other);
1297
1298         /* Apparently VFS overslept socket death. Retry. */
1299         if (sock_flag(other, SOCK_DEAD)) {
1300                 unix_state_unlock(other);
1301                 sock_put(other);
1302                 goto restart;
1303         }
1304
1305         err = -ECONNREFUSED;
1306         if (other->sk_state != TCP_LISTEN)
1307                 goto out_unlock;
1308         if (other->sk_shutdown & RCV_SHUTDOWN)
1309                 goto out_unlock;
1310
1311         if (unix_recvq_full(other)) {
1312                 err = -EAGAIN;
1313                 if (!timeo)
1314                         goto out_unlock;
1315
                     /* unix_wait_for_peer() drops other's state lock for us. */
1316                 timeo = unix_wait_for_peer(other, timeo);
1317
1318                 err = sock_intr_errno(timeo);
1319                 if (signal_pending(current))
1320                         goto out;
1321                 sock_put(other);
1322                 goto restart;
1323         }
1324
1325         /* Latch our state.
1326
1327            It is tricky place. We need to grab our state lock and cannot
1328            drop lock on peer. It is dangerous because deadlock is
1329            possible. Connect to self case and simultaneous
1330            attempt to connect are eliminated by checking socket
1331            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1332            check this before attempt to grab lock.
1333
1334            Well, and we have to recheck the state after socket locked.
1335          */
1336         st = sk->sk_state;
1337
1338         switch (st) {
1339         case TCP_CLOSE:
1340                 /* This is ok... continue with connect */
1341                 break;
1342         case TCP_ESTABLISHED:
1343                 /* Socket is already connected */
1344                 err = -EISCONN;
1345                 goto out_unlock;
1346         default:
1347                 err = -EINVAL;
1348                 goto out_unlock;
1349         }
1350
1351         unix_state_lock_nested(sk);
1352
             /* State changed between the unlocked read and taking the lock: retry. */
1353         if (sk->sk_state != st) {
1354                 unix_state_unlock(sk);
1355                 unix_state_unlock(other);
1356                 sock_put(other);
1357                 goto restart;
1358         }
1359
1360         err = security_unix_stream_connect(sk, other, newsk);
1361         if (err) {
1362                 unix_state_unlock(sk);
1363                 goto out_unlock;
1364         }
1365
1366         /* The way is open! Fastly set all the necessary fields... */
1367
             /* newsk's peer pointer holds a reference on sk. */
1368         sock_hold(sk);
1369         unix_peer(newsk)        = sk;
1370         newsk->sk_state         = TCP_ESTABLISHED;
1371         newsk->sk_type          = sk->sk_type;
1372         init_peercred(newsk);
1373         newu = unix_sk(newsk);
1374         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1375         otheru = unix_sk(other);
1376
1377         /* copy address information from listening to new sock
1378          *
1379          * The contents of *(otheru->addr) and otheru->path
1380          * are seen fully set up here, since we have found
1381          * otheru in hash under unix_table_lock.  Insertion
1382          * into the hash chain we'd found it in had been done
1383          * in an earlier critical area protected by unix_table_lock,
1384          * the same one where we'd set *(otheru->addr) contents,
1385          * as well as otheru->path and otheru->addr itself.
1386          *
1387          * Using smp_store_release() here to set newu->addr
1388          * is enough to make those stores, as well as stores
1389          * to newu->path visible to anyone who gets newu->addr
1390          * by smp_load_acquire().  IOW, the same warranties
1391          * as for unix_sock instances bound in unix_bind() or
1392          * in unix_autobind().
1393          */
1394         if (otheru->path.dentry) {
1395                 path_get(&otheru->path);
1396                 newu->path = otheru->path;
1397         }
1398         refcount_inc(&otheru->addr->refcnt);
1399         smp_store_release(&newu->addr, otheru->addr);
1400
1401         /* Set credentials */
1402         copy_peercred(sk, other);
1403
1404         sock->state     = SS_CONNECTED;
1405         sk->sk_state    = TCP_ESTABLISHED;
             /* sk's peer pointer holds a reference on newsk. */
1406         sock_hold(newsk);
1407
1408         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1409         unix_peer(sk)   = newsk;
1410
1411         unix_state_unlock(sk);
1412
1413         /* take ten and send info to listening sock */
1414         spin_lock(&other->sk_receive_queue.lock);
1415         __skb_queue_tail(&other->sk_receive_queue, skb);
1416         spin_unlock(&other->sk_receive_queue.lock);
1417         unix_state_unlock(other);
1418         other->sk_data_ready(other);
1419         sock_put(other);
1420         return 0;
1421
1422 out_unlock:
1423         if (other)
1424                 unix_state_unlock(other);
1425
1426 out:
             /* Common failure exit: free the skb and newsk, drop the ref on other. */
1427         kfree_skb(skb);
1428         if (newsk)
1429                 unix_release_sock(newsk, 0);
1430         if (other)
1431                 sock_put(other);
1432         return err;
1433 }
1434
1435 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1436 {
             /*
              * proto_ops .socketpair handler: make the two socks each other's
              * peer, with each side holding a reference on the other, and
              * initialise peer credentials on both.  For every type except
              * SOCK_DGRAM both ends are also marked established/connected.
              * Always returns 0.
              */
1437         struct sock *ska = socka->sk, *skb = sockb->sk;
1438
1439         /* Join our sockets back to back */
1440         sock_hold(ska);
1441         sock_hold(skb);
1442         unix_peer(ska) = skb;
1443         unix_peer(skb) = ska;
1444         init_peercred(ska);
1445         init_peercred(skb);
1446
1447         if (ska->sk_type != SOCK_DGRAM) {
1448                 ska->sk_state = TCP_ESTABLISHED;
1449                 skb->sk_state = TCP_ESTABLISHED;
1450                 socka->state  = SS_CONNECTED;
1451                 sockb->state  = SS_CONNECTED;
1452         }
1453         return 0;
1454 }
1455
1456 static void unix_sock_inherit_flags(const struct socket *old,
1457                                     struct socket *new)
1458 {
             /*
              * Copy SOCK_PASSCRED and SOCK_PASSSEC from @old to @new.  Flags
              * are only ever set on @new, never cleared.  unix_accept() uses
              * this to pass the listener's settings to the accepted socket.
              */
1459         if (test_bit(SOCK_PASSCRED, &old->flags))
1460                 set_bit(SOCK_PASSCRED, &new->flags);
1461         if (test_bit(SOCK_PASSSEC, &old->flags))
1462                 set_bit(SOCK_PASSSEC, &new->flags);
1463 }
1464
1465 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1466                        bool kern)
1467 {
             /*
              * proto_ops .accept handler: take one pending connection off the
              * listening socket and attach its sock to @newsock.  @kern is not
              * used.
              *
              * Returns 0 on success, -EOPNOTSUPP for socket types other than
              * SOCK_STREAM/SOCK_SEQPACKET, -EINVAL if the socket is not
              * listening or skb_recv_datagram() returned nothing without an
              * error, otherwise the skb_recv_datagram() error.
              */
1468         struct sock *sk = sock->sk;
1469         struct sock *tsk;
1470         struct sk_buff *skb;
1471         int err;
1472
1473         err = -EOPNOTSUPP;
1474         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1475                 goto out;
1476
1477         err = -EINVAL;
1478         if (sk->sk_state != TCP_LISTEN)
1479                 goto out;
1480
1481         /* If socket state is TCP_LISTEN it cannot change (for now...),
1482          * so that no locks are necessary.
1483          */
1484
1485         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1486         if (!skb) {
1487                 /* This means receive shutdown. */
1488                 if (err == 0)
1489                         err = -EINVAL;
1490                 goto out;
1491         }
1492
             /*
              * skb->sk is presumably the sock that unix_stream_connect()
              * allocated the skb against (confirm against sock_wmalloc()).
              */
1493         tsk = skb->sk;
1494         skb_free_datagram(sk, skb);
             /* A backlog slot is free again: wake tasks sleeping on peer_wait. */
1495         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1496
1497         /* attach accepted sock to socket */
1498         unix_state_lock(tsk);
1499         newsock->state = SS_CONNECTED;
1500         unix_sock_inherit_flags(sock, newsock);
1501         sock_graft(tsk, newsock);
1502         unix_state_unlock(tsk);
1503         return 0;
1504
1505 out:
1506         return err;
1507 }
1508
1509
1510 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1511 {
             /*
              * proto_ops .getname handler: copy the address of @sock (or of its
              * peer when @peer is non-zero) into @uaddr.
              *
              * Returns the number of address bytes written, or -ENOTCONN if the
              * peer was requested and there is none.  A socket with no address
              * reports only the family (length sizeof(short)) and sets
              * sun_path[0] to 0.
              */
1512         struct sock *sk = sock->sk;
1513         struct unix_address *addr;
1514         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1515         int err = 0;
1516
             /* Either branch leaves one reference on sk, dropped by sock_put() below. */
1517         if (peer) {
1518                 sk = unix_peer_get(sk);
1519
1520                 err = -ENOTCONN;
1521                 if (!sk)
1522                         goto out;
1523                 err = 0;
1524         } else {
1525                 sock_hold(sk);
1526         }
1527
             /* Acquire load pairs with the smp_store_release() that publishes ->addr. */
1528         addr = smp_load_acquire(&unix_sk(sk)->addr);
1529         if (!addr) {
1530                 sunaddr->sun_family = AF_UNIX;
1531                 sunaddr->sun_path[0] = 0;
1532                 err = sizeof(short);
1533         } else {
1534                 err = addr->len;
1535                 memcpy(sunaddr, addr->name, addr->len);
1536         }
1537         sock_put(sk);
1538 out:
1539         return err;
1540 }
1541
1542 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1543 {
             /*
              * Give @scm its own copy (scm_fp_dup()) of the file list attached
              * to @skb, then take and release unix_gc_lock; the reason for the
              * lock/unlock pair is explained below.
              */
1544         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1545
1546         /*
1547          * Garbage collection of unix sockets starts by selecting a set of
1548          * candidate sockets which have reference only from being in flight
1549          * (total_refs == inflight_refs).  This condition is checked once during
1550          * the candidate collection phase, and candidates are marked as such, so
1551          * that non-candidates can later be ignored.  While inflight_refs is
1552          * protected by unix_gc_lock, total_refs (file count) is not, hence this
1553          * is an instantaneous decision.
1554          *
1555          * Once a candidate, however, the socket must not be reinstalled into a
1556          * file descriptor while the garbage collection is in progress.
1557          *
1558          * If the above conditions are met, then the directed graph of
1559          * candidates (*) does not change while unix_gc_lock is held.
1560          *
1561          * Any operation that changes the file count through file descriptors
1562          * (dup, close, sendmsg) does not change the graph since candidates are
1563          * not installed in fds.
1564          *
1565          * Dequeuing a candidate via recvmsg would install it into an fd, but
1566          * that takes unix_gc_lock to decrement the inflight count, so it's
1567          * serialized with garbage collection.
1568          *
1569          * MSG_PEEK is special in that it does not change the inflight count,
1570          * yet does install the socket into an fd.  The following lock/unlock
1571          * pair is to ensure serialization with garbage collection.  It must be
1572          * done between incrementing the file count and installing the file into
1573          * an fd.
1574          *
1575          * If garbage collection starts after the barrier provided by the
1576          * lock/unlock, then it will see the elevated refcount and not mark this
1577          * as a candidate.  If a garbage collection is already in progress
1578          * before the file count was incremented, then the lock/unlock pair will
1579          * ensure that garbage collection is finished before progressing to
1580          * installing the fd.
1581          *
1582          * (*) A -> B where B is on the queue of A or B is on the queue of C
1583          * which is on the queue of listening socket A.
1584          */
1585         spin_lock(&unix_gc_lock);
1586         spin_unlock(&unix_gc_lock);
1587 }
1588
1589 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1590 {
             /*
              * Copy the sender's pid (taking a reference), uid, gid and security
              * data from @scm into the control block of @skb.  If @scm carries
              * files and @send_fds is true, they are attached as well.
              *
              * Returns 0, or the error from unix_attach_fds().  The skb
              * destructor is set to unix_destruct_scm in both cases.
              */
1591         int err = 0;
1592
1593         UNIXCB(skb).pid  = get_pid(scm->pid);
1594         UNIXCB(skb).uid = scm->creds.uid;
1595         UNIXCB(skb).gid = scm->creds.gid;
1596         UNIXCB(skb).fp = NULL;
1597         unix_get_secdata(scm, skb);
1598         if (scm->fp && send_fds)
1599                 err = unix_attach_fds(scm, skb);
1600
1601         skb->destructor = unix_destruct_scm;
1602         return err;
1603 }
1604
1605 static bool unix_passcred_enabled(const struct socket *sock,
1606                                   const struct sock *other)
1607 {
             /*
              * True when credentials should accompany data sent from @sock to
              * @other: either side has SOCK_PASSCRED set, or @other has no
              * struct socket attached (so its flags cannot be checked).
              */
1608         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1609                !other->sk_socket ||
1610                test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1611 }
1612
1613 /*
1614  * Some apps rely on write() giving SCM_CREDENTIALS
1615  * We include credentials if source or destination socket
1616  * asserted SOCK_PASSCRED.
1617  */
1618 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1619                             const struct sock *other)
1620 {
             /*
              * Fill the credentials in @skb's control block with the current
              * task's tgid, uid and gid, but only if no pid is recorded there
              * yet and unix_passcred_enabled() says credentials are wanted.
              */
1621         if (UNIXCB(skb).pid)
1622                 return;
1623         if (unix_passcred_enabled(sock, other)) {
1624                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1625                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1626         }
1627 }
1628
1629 static int maybe_init_creds(struct scm_cookie *scm,
1630                             struct socket *socket,
1631                             const struct sock *other)
1632 {
             /*
              * Initialise @scm through scm_send() with an empty control message
              * (msg_controllen == 0), then, if unix_passcred_enabled() says
              * credentials are wanted, store the current task's tgid, uid and
              * gid in it.
              *
              * Returns 0, or the non-zero result of scm_send().
              */
1633         int err;
1634         struct msghdr msg = { .msg_controllen = 0 };
1635
1636         err = scm_send(socket, &msg, scm, false);
1637         if (err)
1638                 return err;
1639
1640         if (unix_passcred_enabled(socket, other)) {
1641                 scm->pid = get_pid(task_tgid(current));
1642                 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1643         }
1644         return err;
1645 }
1646
1647 static bool unix_skb_scm_eq(struct sk_buff *skb,
1648                             struct scm_cookie *scm)
1649 {
             /*
              * True when the credentials recorded in @skb's control block match
              * those in @scm: same pid pointer, equal uid and gid, and equal
              * security data according to unix_secdata_eq().
              */
1650         const struct unix_skb_parms *u = &UNIXCB(skb);
1651
1652         return u->pid == scm->pid &&
1653                uid_eq(u->uid, scm->creds.uid) &&
1654                gid_eq(u->gid, scm->creds.gid) &&
1655                unix_secdata_eq(scm, skb);
1656 }
1657
1658 /*
1659  *      Send AF_UNIX data.
      *
      *      Datagram send path.  unix_seqpacket_sendmsg() also lands here
      *      after clearing msg_namelen, so that the connected peer is used.
      *
      *      The destination is looked up from the address in @msg when
      *      msg_namelen is non-zero, otherwise it is the connected peer.
      *      The whole message is copied into one skb, which is appended to
      *      the destination's sk_receive_queue.
      *
      *      Returns @len on success (also when the destination's socket
      *      filter drops the skb), otherwise a negative errno.
1660  */
1661
1662 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1663                               size_t len)
1664 {
1665         struct sock *sk = sock->sk;
1666         struct net *net = sock_net(sk);
1667         struct unix_sock *u = unix_sk(sk);
1668         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1669         struct sock *other = NULL;
1670         int namelen = 0; /* fake GCC */
1671         int err;
1672         unsigned int hash;
1673         struct sk_buff *skb;
1674         long timeo;
1675         struct scm_cookie scm;
1676         int data_len = 0;
1677         int sk_locked;
1678
1679         wait_for_unix_gc();
1680         err = scm_send(sock, msg, &scm, false);
1681         if (err < 0)
1682                 return err;
1683
             /* From here on every exit runs scm_destroy(&scm), either via the
              * "out" label or at the end of the success path.
              */
1684         err = -EOPNOTSUPP;
1685         if (msg->msg_flags&MSG_OOB)
1686                 goto out;
1687
1688         if (msg->msg_namelen) {
                     /* Explicit destination: unix_mkname() returns the name
                      * length (or a negative errno) and fills in hash.  The
                      * lookup itself happens at "restart", after the skb has
                      * been built.
                      */
1689                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1690                 if (err < 0)
1691                         goto out;
1692                 namelen = err;
1693         } else {
                     /* No address: send to the connected peer.  The pointer
                      * obtained from unix_peer_get() is released with
                      * sock_put() on every exit path below.
                      */
1694                 sunaddr = NULL;
1695                 err = -ENOTCONN;
1696                 other = unix_peer_get(sk);
1697                 if (!other)
1698                         goto out;
1699         }
1700
             /* A sender with SOCK_PASSCRED set and no address of its own goes
              * through unix_autobind() before it sends.
              */
1701         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1702             && (err = unix_autobind(sock)) != 0)
1703                 goto out;
1704
1705         err = -EMSGSIZE;
1706         if (len > sk->sk_sndbuf - 32)
1707                 goto out;
1708
             /* For large messages, data_len is the part that goes into page
              * fragments instead of the linear area: what exceeds
              * SKB_MAX_ALLOC, capped at MAX_SKB_FRAGS pages and page aligned.
              */
1709         if (len > SKB_MAX_ALLOC) {
1710                 data_len = min_t(size_t,
1711                                  len - SKB_MAX_ALLOC,
1712                                  MAX_SKB_FRAGS * PAGE_SIZE);
1713                 data_len = PAGE_ALIGN(data_len);
1714
1715                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1716         }
1717
1718         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1719                                    msg->msg_flags & MSG_DONTWAIT, &err,
1720                                    PAGE_ALLOC_COSTLY_ORDER);
1721         if (skb == NULL)
1722                 goto out;
1723
             /* Record the sender's credentials in the skb; the last argument
              * asks for passed file descriptors to be attached as well.
              */
1724         err = unix_scm_to_skb(&scm, skb, true);
1725         if (err < 0)
1726                 goto out_free;
1727
             /* The linear area holds len - data_len bytes, the remainder is
              * accounted as paged data before the payload is copied in.
              */
1728         skb_put(skb, len - data_len);
1729         skb->data_len = data_len;
1730         skb->len = len;
1731         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1732         if (err)
1733                 goto out_free;
1734
1735         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1736
     /* Reached with other == NULL when sending to an address, or again after
      * a dead destination was dropped below.  Without an address there is
      * nothing to look up and the send fails with -ECONNRESET.
      */
1737 restart:
1738         if (!other) {
1739                 err = -ECONNRESET;
1740                 if (sunaddr == NULL)
1741                         goto out_free;
1742
1743                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1744                                         hash, &err);
1745                 if (other == NULL)
1746                         goto out_free;
1747         }
1748
1749         if (sk_filter(other, skb) < 0) {
1750                 /* Toss the packet but do not return any error to the sender */
1751                 err = len;
1752                 goto out_free;
1753         }
1754
             /* sk_locked tracks whether sk's state lock is held in addition to
              * other's; the unlock paths below consult it.
              */
1755         sk_locked = 0;
1756         unix_state_lock(other);
1757 restart_locked:
1758         err = -EPERM;
1759         if (!unix_may_send(sk, other))
1760                 goto out_unlock;
1761
1762         if (unlikely(sock_flag(other, SOCK_DEAD))) {
1763                 /*
1764                  *      Check with 1003.1g - what should
1765                  *      datagram error
                      *
                      *      The destination is dead: let go of it.  If it
                      *      was our connected peer, disconnect from it and
                      *      fail with -ECONNREFUSED; otherwise go back to
                      *      "restart" and look the address up again.
1766                  */
1767                 unix_state_unlock(other);
1768                 sock_put(other);
1769
1770                 if (!sk_locked)
1771                         unix_state_lock(sk);
1772
1773                 err = 0;
1774                 if (unix_peer(sk) == other) {
1775                         unix_peer(sk) = NULL;
1776                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1777
1778                         unix_state_unlock(sk);
1779
1780                         unix_dgram_disconnected(sk, other);
                             /* NOTE(review): second put, presumably for the
                              * reference the cleared peer pointer held -
                              * confirm against unix_dgram_connect().
                              */
1781                         sock_put(other);
1782                         err = -ECONNREFUSED;
1783                 } else {
1784                         unix_state_unlock(sk);
1785                 }
1786
1787                 other = NULL;
1788                 if (err)
1789                         goto out_free;
1790                 goto restart;
1791         }
1792
1793         err = -EPIPE;
1794         if (other->sk_shutdown & RCV_SHUTDOWN)
1795                 goto out_unlock;
1796
1797         if (sk->sk_type != SOCK_SEQPACKET) {
1798                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1799                 if (err)
1800                         goto out_unlock;
1801         }
1802
1803         /* other == sk && unix_peer(other) != sk if
1804          * - unix_peer(sk) == NULL, destination address bound to sk
1805          * - unix_peer(sk) == sk by time of get but disconnected before lock
1806          */
1807         if (other != sk &&
1808             unlikely(unix_peer(other) != sk &&
1809             unix_recvq_full_lockless(other))) {
1810                 if (timeo) {
                             /* Blocking send: wait for room and start over.
                              * NOTE(review): neither exit from this branch
                              * unlocks other, so unix_wait_for_peer() is
                              * presumably expected to return with other's
                              * state lock released - confirm.
                              */
1811                         timeo = unix_wait_for_peer(other, timeo);
1812
1813                         err = sock_intr_errno(timeo);
1814                         if (signal_pending(current))
1815                                 goto out_free;
1816
1817                         goto restart;
1818                 }
1819
                     /* Non-blocking send: take sk's state lock as well, in the
                      * order chosen by unix_state_double_lock(), which means
                      * dropping other's lock first.
                      */
1820                 if (!sk_locked) {
1821                         unix_state_unlock(other);
1822                         unix_state_double_lock(sk, other);
1823                 }
1824
1825                 if (unix_peer(sk) != other ||
1826                     unix_dgram_peer_wake_me(sk, other)) {
1827                         err = -EAGAIN;
1828                         sk_locked = 1;
1829                         goto out_unlock;
1830                 }
1831
                     /* other's lock was dropped above, so all checks made
                      * under it are repeated once with both locks held.
                      */
1832                 if (!sk_locked) {
1833                         sk_locked = 1;
1834                         goto restart_locked;
1835                 }
1836         }
1837
1838         if (unlikely(sk_locked))
1839                 unix_state_unlock(sk);
1840
             /* Deliver: queue the skb under other's state lock, then notify
              * the receiver and release our reference to it.
              */
1841         if (sock_flag(other, SOCK_RCVTSTAMP))
1842                 __net_timestamp(skb);
1843         maybe_add_creds(skb, sock, other);
1844         skb_queue_tail(&other->sk_receive_queue, skb);
1845         unix_state_unlock(other);
1846         other->sk_data_ready(other);
1847         sock_put(other);
1848         scm_destroy(&scm);
1849         return len;
1850
     /* Error unwinding: out_unlock expects other's state lock (and sk's when
      * sk_locked is set) to be held, out_free drops the unsent skb, out
      * releases the destination reference and the scm data.
      */
1851 out_unlock:
1852         if (sk_locked)
1853                 unix_state_unlock(sk);
1854         unix_state_unlock(other);
1855 out_free:
1856         kfree_skb(skb);
1857 out:
1858         if (other)
1859                 sock_put(other);
1860         scm_destroy(&scm);
1861         return err;
1862 }
1863
1864 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1865  * bytes, and a minimum of a full page.
      *
      * The value is PAGE_SIZE shifted left by get_order(32768).  It bounds the
      * paged part of each skb built by unix_stream_sendmsg(), which also
      * passes get_order(UNIX_SKB_FRAGS_SZ) to sock_alloc_send_pskb().
1866  */
1867 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1868
     /*
      * Send @len bytes from @msg on a connected stream socket.
      *
      * The data is cut into skbs which are appended to the peer's
      * sk_receive_queue one at a time.  Passed file descriptors, if any, are
      * attached to the first skb only.
      *
      * Returns the number of bytes queued when that is non-zero, otherwise a
      * negative errno.  SIGPIPE is raised for -EPIPE only if nothing was queued
      * by this call and MSG_NOSIGNAL is not set.
      */
1869 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1870                                size_t len)
1871 {
1872         struct sock *sk = sock->sk;
1873         struct sock *other = NULL;
1874         int err, size;
1875         struct sk_buff *skb;
1876         int sent = 0;
1877         struct scm_cookie scm;
1878         bool fds_sent = false;
1879         int data_len;
1880
1881         wait_for_unix_gc();
1882         err = scm_send(sock, msg, &scm, false);
1883         if (err < 0)
1884                 return err;
1885
1886         err = -EOPNOTSUPP;
1887         if (msg->msg_flags&MSG_OOB)
1888                 goto out_err;
1889
             /* A destination address is never accepted here: -EISCONN if the
              * socket is connected, -EOPNOTSUPP otherwise.  Without an address
              * the peer pointer is used as is; unlike the datagram path no
              * reference is taken on it in this function.
              */
1890         if (msg->msg_namelen) {
1891                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1892                 goto out_err;
1893         } else {
1894                 err = -ENOTCONN;
1895                 other = unix_peer(sk);
1896                 if (!other)
1897                         goto out_err;
1898         }
1899
1900         if (sk->sk_shutdown & SEND_SHUTDOWN)
1901                 goto pipe_err;
1902
1903         while (sent < len) {
1904                 size = len - sent;
1905
1906                 /* Keep two messages in the pipe so it schedules better */
1907                 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1908
1909                 /* allow fallback to order-0 allocations */
1910                 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1911
                     /* data_len is the part placed in page fragments: whatever
                      * exceeds SKB_MAX_HEAD(0), rounded up to whole pages but
                      * never more than size.
                      */
1912                 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1913
1914                 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1915
1916                 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1917                                            msg->msg_flags & MSG_DONTWAIT, &err,
1918                                            get_order(UNIX_SKB_FRAGS_SZ));
1919                 if (!skb)
1920                         goto out_err;
1921
1922                 /* Only send the fds in the first buffer */
1923                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
1924                 if (err < 0) {
1925                         kfree_skb(skb);
1926                         goto out_err;
1927                 }
1928                 fds_sent = true;
1929
1930                 skb_put(skb, size - data_len);
1931                 skb->data_len = data_len;
1932                 skb->len = size;
1933                 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1934                 if (err) {
1935                         kfree_skb(skb);
1936                         goto out_err;
1937                 }
1938
                     /* The skb is complete.  Queue it under the peer's state
                      * lock unless the peer has died or shut down its receive
                      * side in the meantime.
                      */
1939                 unix_state_lock(other);
1940
1941                 if (sock_flag(other, SOCK_DEAD) ||
1942                     (other->sk_shutdown & RCV_SHUTDOWN))
1943                         goto pipe_err_free;
1944
1945                 maybe_add_creds(skb, sock, other);
1946                 skb_queue_tail(&other->sk_receive_queue, skb);
1947                 unix_state_unlock(other);
1948                 other->sk_data_ready(other);
1949                 sent += size;
1950         }
1951
1952         scm_destroy(&scm);
1953
1954         return sent;
1955
     /* pipe_err_free is entered with the peer's state lock held and an skb that
      * was not queued; pipe_err turns the failure into -EPIPE.
      */
1956 pipe_err_free:
1957         unix_state_unlock(other);
1958         kfree_skb(skb);
1959 pipe_err:
1960         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1961                 send_sig(SIGPIPE, current, 0);
1962         err = -EPIPE;
1963 out_err:
1964         scm_destroy(&scm);
             /* A partial send reports the bytes queued, not the error. */
1965         return sent ? : err;
1966 }
1967
     /*
      * Queue @size bytes of @page, starting at @offset, on the peer's receive
      * queue by attaching the page as an skb fragment.
      *
      * The fragment is added to the skb at the tail of the peer's queue when
      * that skb carries the same credentials (unix_skb_scm_eq()).  Otherwise,
      * or when skb_append_pagefrags() refuses, a fresh empty skb is allocated
      * and queued instead.  The allocation is done with all locks dropped, so
      * every check is repeated afterwards; @tail remembers the skb that could
      * not be used.
      *
      * Locks, in acquisition order: the peer's iolock, the peer's state lock,
      * the peer's sk_receive_queue.lock.  The queue lock is held from the
      * moment the tail skb is looked up until it has been extended (and a new
      * skb queued), so that the skb cannot be unlinked from the queue and
      * freed by someone else while it is being modified.
      *
      * Returns @size on success, otherwise a negative errno.  SIGPIPE is
      * raised for -EPIPE unless MSG_NOSIGNAL is set in @flags.
      */
1968 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1969                                     int offset, size_t size, int flags)
1970 {
1971         int err;
1972         bool send_sigpipe = false;
1973         bool init_scm = true;
1974         struct scm_cookie scm;
1975         struct sock *other, *sk = socket->sk;
1976         struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1977
1978         if (flags & MSG_OOB)
1979                 return -EOPNOTSUPP;
1980
1981         other = unix_peer(sk);
1982         if (!other || sk->sk_state != TCP_ESTABLISHED)
1983                 return -ENOTCONN;
1984
             /* Only entered through "goto alloc_skb", with the queue lock, the
              * peer's state lock and the peer's iolock held: drop all three,
              * allocate an empty skb, then fall through to retake the locks
              * and redo the checks.
              */
1985         if (false) {
1986 alloc_skb:
                     spin_unlock(&other->sk_receive_queue.lock);
1987                 unix_state_unlock(other);
1988                 mutex_unlock(&unix_sk(other)->iolock);
1989                 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1990                                               &err, 0);
1991                 if (!newskb)
1992                         goto err;
1993         }
1994
1995         /* we must acquire iolock as we modify already present
1996          * skbs in the sk_receive_queue and mess with skb->len
1997          */
1998         err = mutex_lock_interruptible(&unix_sk(other)->iolock);
1999         if (err) {
2000                 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2001                 goto err;
2002         }
2003
2004         if (sk->sk_shutdown & SEND_SHUTDOWN) {
2005                 err = -EPIPE;
2006                 send_sigpipe = true;
2007                 goto err_unlock;
2008         }
2009
2010         unix_state_lock(other);
2011
2012         if (sock_flag(other, SOCK_DEAD) ||
2013             other->sk_shutdown & RCV_SHUTDOWN) {
2014                 err = -EPIPE;
2015                 send_sigpipe = true;
2016                 goto err_state_unlock;
2017         }
2018
             /* scm is set up once, even if the allocation path brings us
              * through here a second time; init_scm also tells the error path
              * whether scm_destroy() is needed.
              */
2019         if (init_scm) {
2020                 err = maybe_init_creds(&scm, socket, other);
2021                 if (err)
2022                         goto err_state_unlock;
2023                 init_scm = false;
2024         }
2025
             /* The tail skb must not be unlinked and freed while it is being
              * inspected and extended, so take the queue lock before peeking
              * and keep it until the skb has been updated.  Every path leaving
              * this region releases it again.
              */
             spin_lock(&other->sk_receive_queue.lock);
2026         skb = skb_peek_tail(&other->sk_receive_queue);
2027         if (tail && tail == skb) {
                     /* The tail is still the skb we could not use: take the
                      * freshly allocated one.
                      */
2028                 skb = newskb;
2029         } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
                     /* Empty queue, or tail data from a different writer. */
2030                 if (newskb) {
2031                         skb = newskb;
2032                 } else {
2033                         tail = skb;
2034                         goto alloc_skb;
2035                 }
2036         } else if (newskb) {
2037                 /* this is fast path, we don't necessarily need to
2038                  * call to kfree_skb even though with newskb == NULL
2039                  * this - does no harm
2040                  */
2041                 consume_skb(newskb);
2042                 newskb = NULL;
2043         }
2044
2045         if (skb_append_pagefrags(skb, page, offset, size)) {
2046                 tail = skb;
2047                 goto alloc_skb;
2048         }
2049
             /* Account the new fragment in the skb and in our write memory. */
2050         skb->len += size;
2051         skb->data_len += size;
2052         skb->truesize += size;
2053         refcount_add(size, &sk->sk_wmem_alloc);
2054
2055         if (newskb) {
2056                 err = unix_scm_to_skb(&scm, skb, false);
                     if (err) {
                             spin_unlock(&other->sk_receive_queue.lock);
2058                         goto err_state_unlock;
                     }
2060                 __skb_queue_tail(&other->sk_receive_queue, newskb);
2062         }
2063
             spin_unlock(&other->sk_receive_queue.lock);
2064         unix_state_unlock(other);
2065         mutex_unlock(&unix_sk(other)->iolock);
2066
2067         other->sk_data_ready(other);
2068         scm_destroy(&scm);
2069         return size;
2070
     /* Error unwinding: each label expects exactly the locks that are released
      * from that point on to be held; the queue lock is never held here.
      */
2071 err_state_unlock:
2072         unix_state_unlock(other);
2073 err_unlock:
2074         mutex_unlock(&unix_sk(other)->iolock);
2075 err:
2076         kfree_skb(newskb);
2077         if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2078                 send_sig(SIGPIPE, current, 0);
2079         if (!init_scm)
2080                 scm_destroy(&scm);
2081         return err;
2082 }
2083
     /*
      * sendmsg for SOCK_SEQPACKET sockets.
      *
      * Reports a pending socket error first, then requires the socket to be in
      * TCP_ESTABLISHED state (-ENOTCONN otherwise).  Any destination address
      * supplied by the caller is ignored by zeroing msg_namelen, so that
      * unix_dgram_sendmsg() sends to the connected peer.
      *
      * Returns whatever unix_dgram_sendmsg() returns, or the error found here.
      */
2084 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2085                                   size_t len)
2086 {
2087         int err;
2088         struct sock *sk = sock->sk;
2089
2090         err = sock_error(sk);
2091         if (err)
2092                 return err;
2093
2094         if (sk->sk_state != TCP_ESTABLISHED)
2095                 return -ENOTCONN;
2096
2097         if (msg->msg_namelen)
2098                 msg->msg_namelen = 0;
2099
2100         return unix_dgram_sendmsg(sock, msg, len);
2101 }
2102
     /*
      * recvmsg for SOCK_SEQPACKET sockets: fail with -ENOTCONN unless the
      * socket is in TCP_ESTABLISHED state, otherwise receive exactly like a
      * datagram socket via unix_dgram_recvmsg().
      */
2103 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2104                                   size_t size, int flags)
2105 {
2106         struct sock *sk = sock->sk;
2107
2108         if (sk->sk_state != TCP_ESTABLISHED)
2109                 return -ENOTCONN;
2110
2111         return unix_dgram_recvmsg(sock, msg, size, flags);
2112 }
2113
     /*
      * Copy the address of @sk, if it has one, into msg->msg_name and store its
      * length in msg->msg_namelen.  @msg is left untouched when @sk has no
      * address.
      *
      * msg->msg_name is written without a NULL check; the callers in this file
      * test it before calling.
      */
2114 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2115 {
             /* NOTE(review): the acquire load presumably pairs with a release
              * store where the address is published (bind/autobind) - confirm.
              */
2116         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2117
2118         if (addr) {
2119                 msg->msg_namelen = addr->len;
2120                 memcpy(msg->msg_name, addr->name, addr->len);
2121         }
2122 }
2123
     /*
      * Receive one datagram (also used for SOCK_SEQPACKET through
      * unix_seqpacket_recvmsg()).
      *
      * Takes one skb from the receive queue, or only looks at it when
      * MSG_PEEK is set, copies up to @size bytes of it into @msg and passes the
      * sender's address, credentials and file descriptors along.
      *
      * Returns the number of bytes copied, or the full remaining datagram
      * length when MSG_TRUNC is set in @flags; 0 for an empty non-blocking
      * SEQPACKET socket whose receive side is shut down; otherwise a negative
      * errno.
      */
2124 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2125                               size_t size, int flags)
2126 {
2127         struct scm_cookie scm;
2128         struct sock *sk = sock->sk;
2129         struct unix_sock *u = unix_sk(sk);
2130         struct sk_buff *skb, *last;
2131         long timeo;
2132         int err;
2133         int peeked, skip;
2134
2135         err = -EOPNOTSUPP;
2136         if (flags&MSG_OOB)
2137                 goto out;
2138
2139         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2140
             /* Try to get a datagram with iolock held.  When none is there the
              * lock is dropped again and, if the error is -EAGAIN and time is
              * left, we wait for more packets and retry.  Leaving the loop with
              * an skb means iolock is still held.
              */
2141         do {
2142                 mutex_lock(&u->iolock);
2143
2144                 skip = sk_peek_offset(sk, flags);
2145                 skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
2146                                               &err, &last);
2147                 if (skb)
2148                         break;
2149
2150                 mutex_unlock(&u->iolock);
2151
2152                 if (err != -EAGAIN)
2153                         break;
2154         } while (timeo &&
2155                  !__skb_wait_for_more_packets(sk, &err, &timeo, last));
2156
2157         if (!skb) { /* implies iolock unlocked */
2158                 unix_state_lock(sk);
2159                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2160                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2161                     (sk->sk_shutdown & RCV_SHUTDOWN))
2162                         err = 0;
2163                 unix_state_unlock(sk);
2164                 goto out;
2165         }
2166
             /* Wake anybody sleeping on our peer_wait queue for write space. */
2167         if (wq_has_sleeper(&u->peer_wait))
2168                 wake_up_interruptible_sync_poll(&u->peer_wait,
2169                                                 EPOLLOUT | EPOLLWRNORM |
2170                                                 EPOLLWRBAND);
2171
2172         if (msg->msg_name)
2173                 unix_copy_addr(msg, skb->sk);
2174
             /* Copy at most what is left of the datagram after the peek
              * offset; a shorter user buffer is flagged with MSG_TRUNC.
              */
2175         if (size > skb->len - skip)
2176                 size = skb->len - skip;
2177         else if (size < skb->len - skip)
2178                 msg->msg_flags |= MSG_TRUNC;
2179
2180         err = skb_copy_datagram_msg(skb, skip, msg, size);
2181         if (err)
2182                 goto out_free;
2183
2184         if (sock_flag(sk, SOCK_RCVTSTAMP))
2185                 __sock_recv_timestamp(msg, sk, skb);
2186
             /* Build the control data for scm_recv() from what the sender
              * stored in the skb.
              */
2187         memset(&scm, 0, sizeof(scm));
2188
2189         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2190         unix_set_secdata(&scm, skb);
2191
2192         if (!(flags & MSG_PEEK)) {
2193                 if (UNIXCB(skb).fp)
2194                         unix_detach_fds(&scm, skb);
2195
2196                 sk_peek_offset_bwd(sk, skb->len);
2197         } else {
2198                 /* It is questionable: on PEEK we could:
2199                    - do not return fds - good, but too simple 8)
2200                    - return fds, and do not return them on read (old strategy,
2201                      apparently wrong)
2202                    - clone fds (I chose it for now, it is the most universal
2203                      solution)
2204
2205                    POSIX 1003.1g does not actually define this clearly
2206                    at all. POSIX 1003.1g doesn't define a lot of things
2207                    clearly however!
2208
2209                 */
2210
2211                 sk_peek_offset_fwd(sk, size);
2212
2213                 if (UNIXCB(skb).fp)
2214                         unix_peek_fds(&scm, skb);
2215         }
             /* With MSG_TRUNC in the caller's flags the remaining datagram
              * length is reported instead of the number of bytes copied.
              */
2216         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2217
2218         scm_recv(sock, msg, &scm, flags);
2219
     /* Also reached on success: the skb is released and iolock dropped in
      * either case.
      */
2220 out_free:
2221         skb_free_datagram(sk, skb);
2222         mutex_unlock(&u->iolock);
2223 out:
2224         return err;
2225 }
2226
2227 /*
2228  *      Sleep until more data has arrived. But check for races..
      *
      *      @last and @last_len describe the skb the caller examined last and
      *      its length at that time.  The wait ends as soon as the tail of
      *      sk_receive_queue is a different skb or has a different length,
      *      or when a socket error, RCV_SHUTDOWN, a pending signal or an
      *      exhausted timeout is seen.  It also ends when the socket is found
      *      dead after sleeping.
      *
      *      @freezable selects freezable_schedule_timeout() over
      *      schedule_timeout().
      *
      *      Returns the timeout that is left.
2229  */
2230 static long unix_stream_data_wait(struct sock *sk, long timeo,
2231                                   struct sk_buff *last, unsigned int last_len,
2232                                   bool freezable)
2233 {
2234         struct sk_buff *tail;
2235         DEFINE_WAIT(wait);
2236
2237         unix_state_lock(sk);
2238
2239         for (;;) {
                     /* Register on the wait queue before testing the
                      * conditions, all under the state lock, which is only
                      * dropped around the actual sleep.
                      */
2240                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2241
2242                 tail = skb_peek_tail(&sk->sk_receive_queue);
2243                 if (tail != last ||
2244                     (tail && tail->len != last_len) ||
2245                     sk->sk_err ||
2246                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2247                     signal_pending(current) ||
2248                     !timeo)
2249                         break;
2250
2251                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2252                 unix_state_unlock(sk);
2253                 if (freezable)
2254                         timeo = freezable_schedule_timeout(timeo);
2255                 else
2256                         timeo = schedule_timeout(timeo);
2257                 unix_state_lock(sk);
2258
                     /* Note that SOCKWQ_ASYNC_WAITDATA is not cleared when
                      * leaving through this break.
                      */
2259                 if (sock_flag(sk, SOCK_DEAD))
2260                         break;
2261
2262                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2263         }
2264
2265         finish_wait(sk_sleep(sk), &wait);
2266         unix_state_unlock(sk);
2267         return timeo;
2268 }
2269
     /*
      * Number of bytes of @skb that have not been consumed yet: skb->len minus
      * the consumed count kept in the skb's control block, which the stream
      * receive path advances as data is read.
      */
2270 static unsigned int unix_skb_len(const struct sk_buff *skb)
2271 {
2272         return skb->len - UNIXCB(skb).consumed;
2273 }
2274
     /*
      * Parameters of one stream receive, shared between
      * unix_stream_read_generic() and the per-skb actor it calls.
      *
      * @recv_actor:   called as recv_actor(skb, skip, chunk, state); returns
      *                the number of bytes it handled, negative on failure
      * @socket:       socket being read
      * @msg:          destination message; set by the recvmsg path, left
      *                unset by the splice path
      * @pipe:         destination pipe; set by the splice path only
      * @size:         maximum number of bytes to deliver
      * @flags:        MSG_* flags of the read
      * @splice_flags: flags handed to skb_splice_bits() by the splice actor
      */
2275 struct unix_stream_read_state {
2276         int (*recv_actor)(struct sk_buff *, int, int,
2277                           struct unix_stream_read_state *);
2278         struct socket *socket;
2279         struct msghdr *msg;
2280         struct pipe_inode_info *pipe;
2281         size_t size;
2282         int flags;
2283         unsigned int splice_flags;
2284 };
2285
     /*
      * Common receive loop for stream sockets, used by both recvmsg and
      * splice.
      *
      * Walks sk_receive_queue and hands each skb to state->recv_actor() until
      * state->size bytes were delivered, the queue is exhausted with at least
      * "target" bytes delivered, or one of the conditions below ends the read
      * early.  Fully consumed skbs are unlinked unless MSG_PEEK is set.
      *
      * u->iolock is held across the loop except while sleeping for data.
      * @freezable is passed through to unix_stream_data_wait().
      *
      * Returns "copied" when it is non-zero (the byte count, or -EFAULT when
      * the actor failed before anything was delivered), otherwise err, which is
      * 0 or a negative errno.
      */
2286 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2287                                     bool freezable)
2288 {
2289         struct scm_cookie scm;
2290         struct socket *sock = state->socket;
2291         struct sock *sk = sock->sk;
2292         struct unix_sock *u = unix_sk(sk);
2293         int copied = 0;
2294         int flags = state->flags;
2295         int noblock = flags & MSG_DONTWAIT;
2296         bool check_creds = false;
2297         int target;
2298         int err = 0;
2299         long timeo;
2300         int skip;
2301         size_t size = state->size;
2302         unsigned int last_len;
2303
2304         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2305                 err = -EINVAL;
2306                 goto out;
2307         }
2308
2309         if (unlikely(flags & MSG_OOB)) {
2310                 err = -EOPNOTSUPP;
2311                 goto out;
2312         }
2313
             /* target: bytes that must be delivered before an empty queue may
              * end the read; timeo: how long we may sleep (per noblock).
              */
2314         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2315         timeo = sock_rcvtimeo(sk, noblock);
2316
2317         memset(&scm, 0, sizeof(scm));
2318
2319         /* Lock the socket to prevent queue disordering
2320          * while sleeps in memcpy_tomsg
2321          */
2322         mutex_lock(&u->iolock);
2323
             /* Bytes at the head of the queue to step over (peek offset),
              * never negative.
              */
2324         skip = max(sk_peek_offset(sk, flags), 0);
2325
2326         do {
2327                 int chunk;
2328                 bool drop_skb;
2329                 struct sk_buff *skb, *last;
2330
2331 redo:
2332                 unix_state_lock(sk);
2333                 if (sock_flag(sk, SOCK_DEAD)) {
2334                         err = -ECONNRESET;
2335                         goto unlock;
2336                 }
                     /* last/last_len remember the skb looked at last and its
                      * length, so that unix_stream_data_wait() can tell
                      * whether anything new has been queued.
                      */
2337                 last = skb = skb_peek(&sk->sk_receive_queue);
2338                 last_len = last ? last->len : 0;
2339 again:
2340                 if (skb == NULL) {
2341                         if (copied >= target)
2342                                 goto unlock;
2343
2344                         /*
2345                          *      POSIX 1003.1g mandates this order.
2346                          */
2347
2348                         err = sock_error(sk);
2349                         if (err)
2350                                 goto unlock;
2351                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2352                                 goto unlock;
2353
2354                         unix_state_unlock(sk);
2355                         if (!timeo) {
2356                                 err = -EAGAIN;
2357                                 break;
2358                         }
2359
                             /* Sleep without iolock; both locks are taken
                              * again at "redo".
                              */
2360                         mutex_unlock(&u->iolock);
2361
2362                         timeo = unix_stream_data_wait(sk, timeo, last,
2363                                                       last_len, freezable);
2364
2365                         if (signal_pending(current)) {
2366                                 err = sock_intr_errno(timeo);
2367                                 scm_destroy(&scm);
2368                                 goto out;
2369                         }
2370
2371                         mutex_lock(&u->iolock);
2372                         goto redo;
     /* Jumped to with the state lock held; leaves the loop. */
2373 unlock:
2374                         unix_state_unlock(sk);
2375                         break;
2376                 }
2377
                     /* Step over skbs that lie entirely before the offset. */
2378                 while (skip >= unix_skb_len(skb)) {
2379                         skip -= unix_skb_len(skb);
2380                         last = skb;
2381                         last_len = skb->len;
2382                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2383                         if (!skb)
2384                                 goto again;
2385                 }
2386
2387                 unix_state_unlock(sk);
2388
2389                 if (check_creds) {
2390                         /* Never glue messages from different writers */
2391                         if (!unix_skb_scm_eq(skb, &scm))
2392                                 break;
2393                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2394                         /* Copy credentials */
2395                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2396                         unix_set_secdata(&scm, skb);
2397                         check_creds = true;
2398                 }
2399
2400                 /* Copy address just once */
                     /* NOTE(review): the assignment to sunaddr below only
                      * changes the local variable; msg_name itself stays set,
                      * so this block runs again for every skb - confirm that
                      * this matches the intent of "just once".
                      */
2401                 if (state->msg && state->msg->msg_name) {
2402                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2403                                          state->msg->msg_name);
2404                         unix_copy_addr(state->msg, skb->sk);
2405                         sunaddr = NULL;
2406                 }
2407
                     /* Hold an extra reference on the skb across the actor
                      * call; drop_skb is sampled before that reference is
                      * released again.
                      */
2408                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2409                 skb_get(skb);
2410                 chunk = state->recv_actor(skb, skip, chunk, state);
2411                 drop_skb = !unix_skb_len(skb);
2412                 /* skb is only safe to use if !drop_skb */
2413                 consume_skb(skb);
2414                 if (chunk < 0) {
2415                         if (copied == 0)
2416                                 copied = -EFAULT;
2417                         break;
2418                 }
2419                 copied += chunk;
2420                 size -= chunk;
2421
2422                 if (drop_skb) {
2423                         /* the skb was touched by a concurrent reader;
2424                          * we should not expect anything from this skb
2425                          * anymore and assume it invalid - we can be
2426                          * sure it was dropped from the socket queue
2427                          *
2428                          * let's report a short read
2429                          */
2430                         err = 0;
2431                         break;
2432                 }
2433
2434                 /* Mark read part of skb as used */
2435                 if (!(flags & MSG_PEEK)) {
2436                         UNIXCB(skb).consumed += chunk;
2437
2438                         sk_peek_offset_bwd(sk, chunk);
2439
2440                         if (UNIXCB(skb).fp)
2441                                 unix_detach_fds(&scm, skb);
2442
                             /* Data left in this skb: stop here. */
2443                         if (unix_skb_len(skb))
2444                                 break;
2445
2446                         skb_unlink(skb, &sk->sk_receive_queue);
2447                         consume_skb(skb);
2448
                             /* Stop once file descriptors were picked up. */
2449                         if (scm.fp)
2450                                 break;
2451                 } else {
2452                         /* It is questionable, see note in unix_dgram_recvmsg.
2453                          */
2454                         if (UNIXCB(skb).fp)
2455                                 unix_peek_fds(&scm, skb);
2456
2457                         sk_peek_offset_fwd(sk, chunk);
2458
2459                         if (UNIXCB(skb).fp)
2460                                 break;
2461
                             /* Peeking leaves the skb queued: move on to the
                              * next one by hand, under the state lock.
                              */
2462                         skip = 0;
2463                         last = skb;
2464                         last_len = skb->len;
2465                         unix_state_lock(sk);
2466                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2467                         if (skb)
2468                                 goto again;
2469                         unix_state_unlock(sk);
2470                         break;
2471                 }
2472         } while (size);
2473
2474         mutex_unlock(&u->iolock);
             /* Hand the collected control data to the message if there is one,
              * otherwise just release it.
              */
2475         if (state->msg)
2476                 scm_recv(sock, state->msg, &scm, flags);
2477         else
2478                 scm_destroy(&scm);
2479 out:
2480         return copied ? : err;
2481 }
2482
     /*
      * recv_actor for recvmsg: copy @chunk bytes of @skb into state->msg,
      * starting @skip bytes past what has already been consumed from the skb.
      *
      * Returns @chunk on success, otherwise the non-zero result of
      * skb_copy_datagram_msg().
      */
2483 static int unix_stream_read_actor(struct sk_buff *skb,
2484                                   int skip, int chunk,
2485                                   struct unix_stream_read_state *state)
2486 {
2487         int ret;
2488
2489         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2490                                     state->msg, chunk);
2491         return ret ?: chunk;
2492 }
2493
     /*
      * recvmsg for stream sockets: run the generic stream read loop with
      * unix_stream_read_actor() copying into @msg.  The read is marked
      * freezable (second argument of unix_stream_read_generic()).
      */
2494 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2495                                size_t size, int flags)
2496 {
2497         struct unix_stream_read_state state = {
2498                 .recv_actor = unix_stream_read_actor,
2499                 .socket = sock,
2500                 .msg = msg,
2501                 .size = size,
2502                 .flags = flags
2503         };
2504
2505         return unix_stream_read_generic(&state, true);
2506 }
2507
     /*
      * recv_actor for splice: pass @chunk bytes of @skb, starting @skip bytes
      * past what has already been consumed, to skb_splice_bits() with
      * state->pipe as the target and state->splice_flags as flags.
      *
      * Returns the result of skb_splice_bits() unchanged.
      */
2508 static int unix_stream_splice_actor(struct sk_buff *skb,
2509                                     int skip, int chunk,
2510                                     struct unix_stream_read_state *state)
2511 {
2512         return skb_splice_bits(skb, state->socket->sk,
2513                                UNIXCB(skb).consumed + skip,
2514                                state->pipe, chunk, state->splice_flags);
2515 }
2516
     /*
      * splice_read for stream sockets: run the generic stream read loop with
      * unix_stream_splice_actor() feeding @pipe.
      *
      * A non-zero *@ppos is rejected with -ESPIPE.  state.msg stays unset, so
      * the generic loop has no message to deliver control data to.  The only
      * MSG_* flag ever set is MSG_DONTWAIT, when the file is O_NONBLOCK or
      * SPLICE_F_NONBLOCK was requested.  The read is not freezable.
      */
2517 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2518                                        struct pipe_inode_info *pipe,
2519                                        size_t size, unsigned int flags)
2520 {
2521         struct unix_stream_read_state state = {
2522                 .recv_actor = unix_stream_splice_actor,
2523                 .socket = sock,
2524                 .pipe = pipe,
2525                 .size = size,
2526                 .splice_flags = flags,
2527         };
2528
2529         if (unlikely(*ppos))
2530                 return -ESPIPE;
2531
2532         if (sock->file->f_flags & O_NONBLOCK ||
2533             flags & SPLICE_F_NONBLOCK)
2534                 state.flags = MSG_DONTWAIT;
2535
2536         return unix_stream_read_generic(&state, false);
2537 }
2538
     /*
      * shutdown() for AF_UNIX sockets.
      *
      * Records the requested direction(s) in sk->sk_shutdown.  For SOCK_STREAM
      * and SOCK_SEQPACKET sockets that have a peer, the mirrored bits are also
      * set on the peer (our receive side is its send side and vice versa) and
      * the peer is notified.
      *
      * Returns 0, or -EINVAL when @mode is outside SHUT_RD..SHUT_RDWR.
      */
2539 static int unix_shutdown(struct socket *sock, int mode)
2540 {
2541         struct sock *sk = sock->sk;
2542         struct sock *other;
2543
2544         if (mode < SHUT_RD || mode > SHUT_RDWR)
2545                 return -EINVAL;
2546         /* This maps:
2547          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2548          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2549          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2550          */
2551         ++mode;
2552
             /* The peer is used after the state lock is dropped, so hold a
              * reference on it until the end of the function.
              */
2553         unix_state_lock(sk);
2554         sk->sk_shutdown |= mode;
2555         other = unix_peer(sk);
2556         if (other)
2557                 sock_hold(other);
2558         unix_state_unlock(sk);
2559         sk->sk_state_change(sk);
2560
2561         if (other &&
2562                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2563
2564                 int peer_mode = 0;
2565
2566                 if (mode&RCV_SHUTDOWN)
2567                         peer_mode |= SEND_SHUTDOWN;
2568                 if (mode&SEND_SHUTDOWN)
2569                         peer_mode |= RCV_SHUTDOWN;
2570                 unix_state_lock(other);
2571                 other->sk_shutdown |= peer_mode;
2572                 unix_state_unlock(other);
2573                 other->sk_state_change(other);
                     /* Async notification for the peer: POLL_HUP when both
                      * directions are closed, POLL_IN when only its receive
                      * side was shut down.
                      */
2574                 if (peer_mode == SHUTDOWN_MASK)
2575                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2576                 else if (peer_mode & RCV_SHUTDOWN)
2577                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2578         }
2579         if (other)
2580                 sock_put(other);
2581
2582         return 0;
2583 }
2584
2585 long unix_inq_len(struct sock *sk)
2586 {
2587         struct sk_buff *skb;
2588         long amount = 0;
2589
2590         if (sk->sk_state == TCP_LISTEN)
2591                 return -EINVAL;
2592
2593         spin_lock(&sk->sk_receive_queue.lock);
2594         if (sk->sk_type == SOCK_STREAM ||
2595             sk->sk_type == SOCK_SEQPACKET) {
2596                 skb_queue_walk(&sk->sk_receive_queue, skb)
2597                         amount += unix_skb_len(skb);
2598         } else {
2599                 skb = skb_peek(&sk->sk_receive_queue);
2600                 if (skb)
2601                         amount = skb->len;
2602         }
2603         spin_unlock(&sk->sk_receive_queue.lock);
2604
2605         return amount;
2606 }
2607 EXPORT_SYMBOL_GPL(unix_inq_len);
2608
/*
 * Report the outgoing queue size of @sk, i.e. the value returned by
 * sk_wmem_alloc_get() for it.
 */
long unix_outq_len(struct sock *sk)
{
        long pending = sk_wmem_alloc_get(sk);

        return pending;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2614
2615 static int unix_open_file(struct sock *sk)
2616 {
2617         struct path path;
2618         struct file *f;
2619         int fd;
2620
2621         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2622                 return -EPERM;
2623
2624         if (!smp_load_acquire(&unix_sk(sk)->addr))
2625                 return -ENOENT;
2626
2627         path = unix_sk(sk)->path;
2628         if (!path.dentry)
2629                 return -ENOENT;
2630
2631         path_get(&path);
2632
2633         fd = get_unused_fd_flags(O_CLOEXEC);
2634         if (fd < 0)
2635                 goto out;
2636
2637         f = dentry_open(&path, O_PATH, current_cred());
2638         if (IS_ERR(f)) {
2639                 put_unused_fd(fd);
2640                 fd = PTR_ERR(f);
2641                 goto out;
2642         }
2643
2644         fd_install(fd, f);
2645 out:
2646         path_put(&path);
2647
2648         return fd;
2649 }
2650
2651 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2652 {
2653         struct sock *sk = sock->sk;
2654         long amount = 0;
2655         int err;
2656
2657         switch (cmd) {
2658         case SIOCOUTQ:
2659                 amount = unix_outq_len(sk);
2660                 err = put_user(amount, (int __user *)arg);
2661                 break;
2662         case SIOCINQ:
2663                 amount = unix_inq_len(sk);
2664                 if (amount < 0)
2665                         err = amount;
2666                 else
2667                         err = put_user(amount, (int __user *)arg);
2668                 break;
2669         case SIOCUNIXFILE:
2670                 err = unix_open_file(sk);
2671                 break;
2672         default:
2673                 err = -ENOIOCTLCMD;
2674                 break;
2675         }
2676         return err;
2677 }
2678
2679 #ifdef CONFIG_COMPAT
2680 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2681 {
2682         return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
2683 }
2684 #endif
2685
2686 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2687 {
2688         struct sock *sk = sock->sk;
2689         __poll_t mask;
2690
2691         sock_poll_wait(file, sock, wait);
2692         mask = 0;
2693
2694         /* exceptional events? */
2695         if (sk->sk_err)
2696                 mask |= EPOLLERR;
2697         if (sk->sk_shutdown == SHUTDOWN_MASK)
2698                 mask |= EPOLLHUP;
2699         if (sk->sk_shutdown & RCV_SHUTDOWN)
2700                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2701
2702         /* readable? */
2703         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2704                 mask |= EPOLLIN | EPOLLRDNORM;
2705
2706         /* Connection-based need to check for termination and startup */
2707         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2708             sk->sk_state == TCP_CLOSE)
2709                 mask |= EPOLLHUP;
2710
2711         /*
2712          * we set writable also when the other side has shut down the
2713          * connection. This prevents stuck sockets.
2714          */
2715         if (unix_writable(sk))
2716                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2717
2718         return mask;
2719 }
2720
2721 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
2722                                     poll_table *wait)
2723 {
2724         struct sock *sk = sock->sk, *other;
2725         unsigned int writable;
2726         __poll_t mask;
2727
2728         sock_poll_wait(file, sock, wait);
2729         mask = 0;
2730
2731         /* exceptional events? */
2732         if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
2733                 mask |= EPOLLERR |
2734                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
2735
2736         if (sk->sk_shutdown & RCV_SHUTDOWN)
2737                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2738         if (sk->sk_shutdown == SHUTDOWN_MASK)
2739                 mask |= EPOLLHUP;
2740
2741         /* readable? */
2742         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2743                 mask |= EPOLLIN | EPOLLRDNORM;
2744
2745         /* Connection-based need to check for termination and startup */
2746         if (sk->sk_type == SOCK_SEQPACKET) {
2747                 if (sk->sk_state == TCP_CLOSE)
2748                         mask |= EPOLLHUP;
2749                 /* connection hasn't started yet? */
2750                 if (sk->sk_state == TCP_SYN_SENT)
2751                         return mask;
2752         }
2753
2754         /* No write status requested, avoid expensive OUT tests. */
2755         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
2756                 return mask;
2757
2758         writable = unix_writable(sk);
2759         if (writable) {
2760                 unix_state_lock(sk);
2761
2762                 other = unix_peer(sk);
2763                 if (other && unix_peer(other) != sk &&
2764                     unix_recvq_full_lockless(other) &&
2765                     unix_dgram_peer_wake_me(sk, other))
2766                         writable = 0;
2767
2768                 unix_state_unlock(sk);
2769         }
2770
2771         if (writable)
2772                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2773         else
2774                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2775
2776         return mask;
2777 }
2778
2779 #ifdef CONFIG_PROC_FS
2780
/*
 * Encoding of the seq_file position used by the iterator below: the
 * low BUCKET_SPACE bits hold a 1-based offset within a hash bucket and
 * the bits above them hold the bucket index.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Bucket index stored in position x. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Offset within the bucket stored in position x. */
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
/* Build a position from bucket index b and offset o. */
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2786
2787 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2788 {
2789         unsigned long offset = get_offset(*pos);
2790         unsigned long bucket = get_bucket(*pos);
2791         struct sock *sk;
2792         unsigned long count = 0;
2793
2794         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2795                 if (sock_net(sk) != seq_file_net(seq))
2796                         continue;
2797                 if (++count == offset)
2798                         break;
2799         }
2800
2801         return sk;
2802 }
2803
/*
 * Advance the iterator to the next socket visible to @seq.
 *
 * @seq: seq_file being read; only sockets in its network namespace
 *       (seq_file_net()) are returned
 * @sk:  current position: a socket, SEQ_START_TOKEN or NULL
 * @pos: encoded bucket/offset position; rewritten whenever the search
 *       moves on to another bucket
 *
 * Returns the next matching socket, or NULL once the bucket index has
 * run past the end of unix_socket_table.
 */
static struct sock *unix_next_socket(struct seq_file *seq,
                                     struct sock *sk,
                                     loff_t *pos)
{
        unsigned long bucket;

        /*
         * @sk is a real socket (anything above SEQ_START_TOKEN): keep
         * walking its hash chain for the next one in our namespace.
         */
        while (sk > (struct sock *)SEQ_START_TOKEN) {
                sk = sk_next(sk);
                /* Chain exhausted: skip the lookup and go to the next bucket. */
                if (!sk)
                        goto next_bucket;
                if (sock_net(sk) == seq_file_net(seq))
                        return sk;
        }

        do {
                /* Look up the socket at the offset encoded in *pos. */
                sk = unix_from_bucket(seq, pos);
                if (sk)
                        return sk;

next_bucket:
                /* Move to the following bucket, restarting at offset 1. */
                bucket = get_bucket(*pos) + 1;
                *pos = set_bucket_offset(bucket, 1);
        } while (bucket < ARRAY_SIZE(unix_socket_table));

        return NULL;
}
2830
2831 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2832         __acquires(unix_table_lock)
2833 {
2834         spin_lock(&unix_table_lock);
2835
2836         if (!*pos)
2837                 return SEQ_START_TOKEN;
2838
2839         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2840                 return NULL;
2841
2842         return unix_next_socket(seq, NULL, pos);
2843 }
2844
2845 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2846 {
2847         ++*pos;
2848         return unix_next_socket(seq, v, pos);
2849 }
2850
/* seq_file ->stop: release the unix_table_lock taken by unix_seq_start(). */
static void unix_seq_stop(struct seq_file *seq, void *v)
        __releases(unix_table_lock)
{
        spin_unlock(&unix_table_lock);
}
2856
2857 static int unix_seq_show(struct seq_file *seq, void *v)
2858 {
2859
2860         if (v == SEQ_START_TOKEN)
2861                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2862                          "Inode Path\n");
2863         else {
2864                 struct sock *s = v;
2865                 struct unix_sock *u = unix_sk(s);
2866                 unix_state_lock(s);
2867
2868                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2869                         s,
2870                         refcount_read(&s->sk_refcnt),
2871                         0,
2872                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2873                         s->sk_type,
2874                         s->sk_socket ?
2875                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2876                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2877                         sock_i_ino(s));
2878
2879                 if (u->addr) {  // under unix_table_lock here
2880                         int i, len;
2881                         seq_putc(seq, ' ');
2882
2883                         i = 0;
2884                         len = u->addr->len - sizeof(short);
2885                         if (!UNIX_ABSTRACT(s))
2886                                 len--;
2887                         else {
2888                                 seq_putc(seq, '@');
2889                                 i++;
2890                         }
2891                         for ( ; i < len; i++)
2892                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
2893                                          '@');
2894                 }
2895                 unix_state_unlock(s);
2896                 seq_putc(seq, '\n');
2897         }
2898
2899         return 0;
2900 }
2901
2902 static const struct seq_operations unix_seq_ops = {
2903         .start  = unix_seq_start,
2904         .next   = unix_seq_next,
2905         .stop   = unix_seq_stop,
2906         .show   = unix_seq_show,
2907 };
2908 #endif
2909
2910 static const struct net_proto_family unix_family_ops = {
2911         .family = PF_UNIX,
2912         .create = unix_create,
2913         .owner  = THIS_MODULE,
2914 };
2915
2916
2917 static int __net_init unix_net_init(struct net *net)
2918 {
2919         int error = -ENOMEM;
2920
2921         net->unx.sysctl_max_dgram_qlen = 10;
2922         if (unix_sysctl_register(net))
2923                 goto out;
2924
2925 #ifdef CONFIG_PROC_FS
2926         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
2927                         sizeof(struct seq_net_private))) {
2928                 unix_sysctl_unregister(net);
2929                 goto out;
2930         }
2931 #endif
2932         error = 0;
2933 out:
2934         return error;
2935 }
2936
2937 static void __net_exit unix_net_exit(struct net *net)
2938 {
2939         unix_sysctl_unregister(net);
2940         remove_proc_entry("unix", net->proc_net);
2941 }
2942
2943 static struct pernet_operations unix_net_ops = {
2944         .init = unix_net_init,
2945         .exit = unix_net_exit,
2946 };
2947
2948 static int __init af_unix_init(void)
2949 {
2950         int rc = -1;
2951
2952         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2953
2954         rc = proto_register(&unix_proto, 1);
2955         if (rc != 0) {
2956                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2957                 goto out;
2958         }
2959
2960         sock_register(&unix_family_ops);
2961         register_pernet_subsys(&unix_net_ops);
2962 out:
2963         return rc;
2964 }
2965
/*
 * Module unload: unregister the PF_UNIX socket family, the protocol
 * and the per-namespace operations, in that order.
 */
static void __exit af_unix_exit(void)
{
        sock_unregister(PF_UNIX);
        proto_unregister(&unix_proto);
        unregister_pernet_subsys(&unix_net_ops);
}
2972
/*
 * Initialise earlier than device_initcall() so that other drivers
 * invoking request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket, but later than subsys_initcall() because we
 * depend on things initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);