net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
  32  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  33  *                                      by the above two patches.
  34  *           Andrea Arcangeli   :       If possible we block in connect(2)
  35  *                                      if the max backlog of the listen socket
  36  *                                      has been reached. This won't break
  37  *                                      old apps and it will avoid a huge amount
  38  *                                      of socks hashed (this is for unix_gc()
  39  *                                      performance reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  84
  85 #include <linux/module.h>
  86 #include <linux/kernel.h>
  87 #include <linux/signal.h>
  88 #include <linux/sched/signal.h>
  89 #include <linux/errno.h>
  90 #include <linux/string.h>
  91 #include <linux/stat.h>
  92 #include <linux/dcache.h>
  93 #include <linux/namei.h>
  94 #include <linux/socket.h>
  95 #include <linux/un.h>
  96 #include <linux/fcntl.h>
  97 #include <linux/termios.h>
  98 #include <linux/sockios.h>
  99 #include <linux/net.h>
 100 #include <linux/in.h>
 101 #include <linux/fs.h>
 102 #include <linux/slab.h>
 103 #include <linux/uaccess.h>
 104 #include <linux/skbuff.h>
 105 #include <linux/netdevice.h>
 106 #include <net/net_namespace.h>
 107 #include <net/sock.h>
 108 #include <net/tcp_states.h>
 109 #include <net/af_unix.h>
 110 #include <linux/proc_fs.h>
 111 #include <linux/seq_file.h>
 112 #include <net/scm.h>
 113 #include <linux/init.h>
 114 #include <linux/poll.h>
 115 #include <linux/rtnetlink.h>
 116 #include <linux/mount.h>
 117 #include <net/checksum.h>
 118 #include <linux/security.h>
 119 #include <linux/freezer.h>
 120 #include <linux/file.h>
 121
 122 #include "scm.h"
 123
 124 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 125 EXPORT_SYMBOL_GPL(unix_socket_table);
 126 DEFINE_SPINLOCK(unix_table_lock);
 127 EXPORT_SYMBOL_GPL(unix_table_lock);
 128 static atomic_long_t unix_nr_socks;
 129
 130
 131 static struct hlist_head *unix_sockets_unbound(void *addr)
 132 {
 133         unsigned long hash = (unsigned long)addr;
 134
 135         hash ^= hash >> 16;
 136         hash ^= hash >> 8;
 137         hash %= UNIX_HASH_SIZE;
 138         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 139 }
 140
 141 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 142
 143 #ifdef CONFIG_SECURITY_NETWORK
 144 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 145 {
 146         UNIXCB(skb).secid = scm->secid;
 147 }
 148
 149 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 150 {
 151         scm->secid = UNIXCB(skb).secid;
 152 }
 153
 154 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 155 {
 156         return (scm->secid == UNIXCB(skb).secid);
 157 }
 158 #else
 159 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 160 { }
 161
 162 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 163 { }
 164
 165 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 166 {
 167         return true;
 168 }
 169 #endif /* CONFIG_SECURITY_NETWORK */
 170
 171 /*
 172  *  SMP locking strategy:
 173  *    hash table is protected with spinlock unix_table_lock
 174  *    each socket state is protected by separate spin lock.
 175  */
 176
 177 static inline unsigned int unix_hash_fold(__wsum n)
 178 {
 179         unsigned int hash = (__force unsigned int)csum_fold(n);
 180
 181         hash ^= hash>>8;
 182         return hash&(UNIX_HASH_SIZE-1);
 183 }
 184
 185 #define unix_peer(sk) (unix_sk(sk)->peer)
 186
 187 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 188 {
 189         return unix_peer(osk) == sk;
 190 }
 191
 192 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 193 {
 194         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 195 }
 196
 197 static inline int unix_recvq_full(const struct sock *sk)
 198 {
 199         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 200 }
 201
 202 static inline int unix_recvq_full_lockless(const struct sock *sk)
 203 {
 204         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 205                 READ_ONCE(sk->sk_max_ack_backlog);
 206 }
 207
 208 struct sock *unix_peer_get(struct sock *s)
 209 {
 210         struct sock *peer;
 211
 212         unix_state_lock(s);
 213         peer = unix_peer(s);
 214         if (peer)
 215                 sock_hold(peer);
 216         unix_state_unlock(s);
 217         return peer;
 218 }
 219 EXPORT_SYMBOL_GPL(unix_peer_get);
 220
 221 static inline void unix_release_addr(struct unix_address *addr)
 222 {
 223         if (refcount_dec_and_test(&addr->refcnt))
 224                 kfree(addr);
 225 }
 226
 227 /*
 228  *      Check unix socket name:
 229  *              - should be not zero length.
 230  *              - if started by not zero, should be NULL terminated (FS object)
 231  *              - if started by zero, it is abstract name.
 232  */
 233
 234 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 235 {
 236         *hashp = 0;
 237
 238         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 239                 return -EINVAL;
 240         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 241                 return -EINVAL;
 242         if (sunaddr->sun_path[0]) {
 243                 /*
 244                  * This may look like an off by one error but it is a bit more
 245                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 246                  * sun_path[108] doesn't as such exist.  However in kernel space
 247                  * we are guaranteed that it is a valid memory location in our
 248                  * kernel address buffer.
 249                  */
 250                 ((char *)sunaddr)[len] = 0;
 251                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 252                 return len;
 253         }
 254
 255         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 256         return len;
 257 }
 258
 259 static void __unix_remove_socket(struct sock *sk)
 260 {
 261         sk_del_node_init(sk);
 262 }
 263
 264 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 265 {
 266         WARN_ON(!sk_unhashed(sk));
 267         sk_add_node(sk, list);
 268 }
 269
 270 static inline void unix_remove_socket(struct sock *sk)
 271 {
 272         spin_lock(&unix_table_lock);
 273         __unix_remove_socket(sk);
 274         spin_unlock(&unix_table_lock);
 275 }
 276
 277 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 278 {
 279         spin_lock(&unix_table_lock);
 280         __unix_insert_socket(list, sk);
 281         spin_unlock(&unix_table_lock);
 282 }
 283
 284 static struct sock *__unix_find_socket_byname(struct net *net,
 285                                               struct sockaddr_un *sunname,
 286                                               int len, int type, unsigned int hash)
 287 {
 288         struct sock *s;
 289
 290         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 291                 struct unix_sock *u = unix_sk(s);
 292
 293                 if (!net_eq(sock_net(s), net))
 294                         continue;
 295
 296                 if (u->addr->len == len &&
 297                     !memcmp(u->addr->name, sunname, len))
 298                         goto found;
 299         }
 300         s = NULL;
 301 found:
 302         return s;
 303 }
 304
 305 static inline struct sock *unix_find_socket_byname(struct net *net,
 306                                                    struct sockaddr_un *sunname,
 307                                                    int len, int type,
 308                                                    unsigned int hash)
 309 {
 310         struct sock *s;
 311
 312         spin_lock(&unix_table_lock);
 313         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 314         if (s)
 315                 sock_hold(s);
 316         spin_unlock(&unix_table_lock);
 317         return s;
 318 }
 319
 320 static struct sock *unix_find_socket_byinode(struct inode *i)
 321 {
 322         struct sock *s;
 323
 324         spin_lock(&unix_table_lock);
 325         sk_for_each(s,
 326                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 327                 struct dentry *dentry = unix_sk(s)->path.dentry;
 328
 329                 if (dentry && d_backing_inode(dentry) == i) {
 330                         sock_hold(s);
 331                         goto found;
 332                 }
 333         }
 334         s = NULL;
 335 found:
 336         spin_unlock(&unix_table_lock);
 337         return s;
 338 }
 339
 340 /* Support code for asymmetrically connected dgram sockets
 341  *
 342  * If a datagram socket is connected to a socket not itself connected
 343  * to the first socket (eg, /dev/log), clients may only enqueue more
 344  * messages if the present receive queue of the server socket is not
 345  * "too large". This means there's a second writeability condition
 346  * poll and sendmsg need to test. The dgram recv code will do a wake
 347  * up on the peer_wait wait queue of a socket upon reception of a
 348  * datagram which needs to be propagated to sleeping would-be writers
 349  * since these might not have sent anything so far. This can't be
 350  * accomplished via poll_wait because the lifetime of the server
 351  * socket might be less than that of its clients if these break their
 352  * association with it or if the server socket is closed while clients
 353  * are still connected to it and there's no way to inform "a polling
 354  * implementation" that it should let go of a certain wait queue
 355  *
 356  * In order to propagate a wake up, a wait_queue_entry_t of the client
 357  * socket is enqueued on the peer_wait queue of the server socket
 358  * whose wake function does a wake_up on the ordinary client socket
 359  * wait queue. This connection is established whenever a write (or
 360  * poll for write) hit the flow control condition and broken when the
 361  * association to the server socket is dissolved or after a wake up
 362  * was relayed.
 363  */
 364
 365 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 366                                       void *key)
 367 {
 368         struct unix_sock *u;
 369         wait_queue_head_t *u_sleep;
 370
 371         u = container_of(q, struct unix_sock, peer_wake);
 372
 373         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 374                             q);
 375         u->peer_wake.private = NULL;
 376
 377         /* relaying can only happen while the wq still exists */
 378         u_sleep = sk_sleep(&u->sk);
 379         if (u_sleep)
 380                 wake_up_interruptible_poll(u_sleep, key);
 381
 382         return 0;
 383 }
 384
 385 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 386 {
 387         struct unix_sock *u, *u_other;
 388         int rc;
 389
 390         u = unix_sk(sk);
 391         u_other = unix_sk(other);
 392         rc = 0;
 393         spin_lock(&u_other->peer_wait.lock);
 394
 395         if (!u->peer_wake.private) {
 396                 u->peer_wake.private = other;
 397                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 398
 399                 rc = 1;
 400         }
 401
 402         spin_unlock(&u_other->peer_wait.lock);
 403         return rc;
 404 }
 405
 406 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 407                                             struct sock *other)
 408 {
 409         struct unix_sock *u, *u_other;
 410
 411         u = unix_sk(sk);
 412         u_other = unix_sk(other);
 413         spin_lock(&u_other->peer_wait.lock);
 414
 415         if (u->peer_wake.private == other) {
 416                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 417                 u->peer_wake.private = NULL;
 418         }
 419
 420         spin_unlock(&u_other->peer_wait.lock);
 421 }
 422
 423 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 424                                                    struct sock *other)
 425 {
 426         unix_dgram_peer_wake_disconnect(sk, other);
 427         wake_up_interruptible_poll(sk_sleep(sk),
 428                                    POLLOUT |
 429                                    POLLWRNORM |
 430                                    POLLWRBAND);
 431 }
 432
 433 /* preconditions:
 434  *      - unix_peer(sk) == other
 435  *      - association is stable
 436  */
 437 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 438 {
 439         int connected;
 440
 441         connected = unix_dgram_peer_wake_connect(sk, other);
 442
 443         if (unix_recvq_full(other))
 444                 return 1;
 445
 446         if (connected)
 447                 unix_dgram_peer_wake_disconnect(sk, other);
 448
 449         return 0;
 450 }
 451
 452 static int unix_writable(const struct sock *sk)
 453 {
 454         return sk->sk_state != TCP_LISTEN &&
 455                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 456 }
 457
 458 static void unix_write_space(struct sock *sk)
 459 {
 460         struct socket_wq *wq;
 461
 462         rcu_read_lock();
 463         if (unix_writable(sk)) {
 464                 wq = rcu_dereference(sk->sk_wq);
 465                 if (skwq_has_sleeper(wq))
 466                         wake_up_interruptible_sync_poll(&wq->wait,
 467                                 POLLOUT | POLLWRNORM | POLLWRBAND);
 468                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 469         }
 470         rcu_read_unlock();
 471 }
 472
 473 /* When dgram socket disconnects (or changes its peer), we clear its receive
 474  * queue of packets arrived from previous peer. First, it allows to do
 475  * flow control based only on wmem_alloc; second, sk connected to peer
 476  * may receive messages only from that peer. */
 477 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 478 {
 479         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 480                 skb_queue_purge(&sk->sk_receive_queue);
 481                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 482
 483                 /* If one link of bidirectional dgram pipe is disconnected,
 484                  * we signal error. Messages are lost. Do not make this,
 485                  * when peer was not connected to us.
 486                  */
 487                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 488                         other->sk_err = ECONNRESET;
 489                         other->sk_error_report(other);
 490                 }
 491         }
 492 }
 493
 494 static void unix_sock_destructor(struct sock *sk)
 495 {
 496         struct unix_sock *u = unix_sk(sk);
 497
 498         skb_queue_purge(&sk->sk_receive_queue);
 499
 500         WARN_ON(refcount_read(&sk->sk_wmem_alloc));
 501         WARN_ON(!sk_unhashed(sk));
 502         WARN_ON(sk->sk_socket);
 503         if (!sock_flag(sk, SOCK_DEAD)) {
 504                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 505                 return;
 506         }
 507
 508         if (u->addr)
 509                 unix_release_addr(u->addr);
 510
 511         atomic_long_dec(&unix_nr_socks);
 512         local_bh_disable();
 513         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 514         local_bh_enable();
 515 #ifdef UNIX_REFCNT_DEBUG
 516         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 517                 atomic_long_read(&unix_nr_socks));
 518 #endif
 519 }
 520
 521 static void unix_release_sock(struct sock *sk, int embrion)
 522 {
 523         struct unix_sock *u = unix_sk(sk);
 524         struct path path;
 525         struct sock *skpair;
 526         struct sk_buff *skb;
 527         int state;
 528
 529         unix_remove_socket(sk);
 530
 531         /* Clear state */
 532         unix_state_lock(sk);
 533         sock_orphan(sk);
 534         sk->sk_shutdown = SHUTDOWN_MASK;
 535         path         = u->path;
 536         u->path.dentry = NULL;
 537         u->path.mnt = NULL;
 538         state = sk->sk_state;
 539         sk->sk_state = TCP_CLOSE;
 540
 541         skpair = unix_peer(sk);
 542         unix_peer(sk) = NULL;
 543
 544         unix_state_unlock(sk);
 545
 546         wake_up_interruptible_all(&u->peer_wait);
 547
 548         if (skpair != NULL) {
 549                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 550                         unix_state_lock(skpair);
 551                         /* No more writes */
 552                         skpair->sk_shutdown = SHUTDOWN_MASK;
 553                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 554                                 skpair->sk_err = ECONNRESET;
 555                         unix_state_unlock(skpair);
 556                         skpair->sk_state_change(skpair);
 557                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 558                 }
 559
 560                 unix_dgram_peer_wake_disconnect(sk, skpair);
 561                 sock_put(skpair); /* It may now die */
 562         }
 563
 564         /* Try to flush out this socket. Throw out buffers at least */
 565
 566         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 567                 if (state == TCP_LISTEN)
 568                         unix_release_sock(skb->sk, 1);
 569                 /* passed fds are erased in the kfree_skb hook        */
 570                 UNIXCB(skb).consumed = skb->len;
 571                 kfree_skb(skb);
 572         }
 573
 574         if (path.dentry)
 575                 path_put(&path);
 576
 577         sock_put(sk);
 578
 579         /* ---- Socket is dead now and most probably destroyed ---- */
 580
 581         /*
 582          * Fixme: BSD difference: In BSD all sockets connected to us get
 583          *        ECONNRESET and we die on the spot. In Linux we behave
 584          *        like files and pipes do and wait for the last
 585          *        dereference.
 586          *
 587          * Can't we simply set sock->err?
 588          *
 589          *        What the above comment does talk about? --ANK(980817)
 590          */
 591
 592         if (READ_ONCE(unix_tot_inflight))
 593                 unix_gc();              /* Garbage collect fds */
 594 }
 595
 596 static void init_peercred(struct sock *sk)
 597 {
 598         const struct cred *old_cred;
 599         struct pid *old_pid;
 600
 601         spin_lock(&sk->sk_peer_lock);
 602         old_pid = sk->sk_peer_pid;
 603         old_cred = sk->sk_peer_cred;
 604         sk->sk_peer_pid  = get_pid(task_tgid(current));
 605         sk->sk_peer_cred = get_current_cred();
 606         spin_unlock(&sk->sk_peer_lock);
 607
 608         put_pid(old_pid);
 609         put_cred(old_cred);
 610 }
 611
 612 static void copy_peercred(struct sock *sk, struct sock *peersk)
 613 {
 614         const struct cred *old_cred;
 615         struct pid *old_pid;
 616
 617         if (sk < peersk) {
 618                 spin_lock(&sk->sk_peer_lock);
 619                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 620         } else {
 621                 spin_lock(&peersk->sk_peer_lock);
 622                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 623         }
 624         old_pid = sk->sk_peer_pid;
 625         old_cred = sk->sk_peer_cred;
 626         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 627         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 628
 629         spin_unlock(&sk->sk_peer_lock);
 630         spin_unlock(&peersk->sk_peer_lock);
 631
 632         put_pid(old_pid);
 633         put_cred(old_cred);
 634 }
 635
 636 static int unix_listen(struct socket *sock, int backlog)
 637 {
 638         int err;
 639         struct sock *sk = sock->sk;
 640         struct unix_sock *u = unix_sk(sk);
 641         struct pid *old_pid = NULL;
 642
 643         err = -EOPNOTSUPP;
 644         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 645                 goto out;       /* Only stream/seqpacket sockets accept */
 646         err = -EINVAL;
 647         if (!u->addr)
 648                 goto out;       /* No listens on an unbound socket */
 649         unix_state_lock(sk);
 650         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 651                 goto out_unlock;
 652         if (backlog > sk->sk_max_ack_backlog)
 653                 wake_up_interruptible_all(&u->peer_wait);
 654         sk->sk_max_ack_backlog  = backlog;
 655         sk->sk_state            = TCP_LISTEN;
 656         /* set credentials so connect can copy them */
 657         init_peercred(sk);
 658         err = 0;
 659
 660 out_unlock:
 661         unix_state_unlock(sk);
 662         put_pid(old_pid);
 663 out:
 664         return err;
 665 }
 666
 667 static int unix_release(struct socket *);
 668 static int unix_bind(struct socket *, struct sockaddr *, int);
 669 static int unix_stream_connect(struct socket *, struct sockaddr *,
 670                                int addr_len, int flags);
 671 static int unix_socketpair(struct socket *, struct socket *);
 672 static int unix_accept(struct socket *, struct socket *, int, bool);
 673 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 674 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
 675 static unsigned int unix_dgram_poll(struct file *, struct socket *,
 676                                     poll_table *);
 677 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 678 #ifdef CONFIG_COMPAT
 679 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 680 #endif
 681 static int unix_shutdown(struct socket *, int);
 682 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 683 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 684 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
 685                                     size_t size, int flags);
 686 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 687                                        struct pipe_inode_info *, size_t size,
 688                                        unsigned int flags);
 689 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 690 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 691 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 692                               int, int);
 693 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 694 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 695                                   int);
 696
 697 static int unix_set_peek_off(struct sock *sk, int val)
 698 {
 699         struct unix_sock *u = unix_sk(sk);
 700
 701         if (mutex_lock_interruptible(&u->iolock))
 702                 return -EINTR;
 703
 704         WRITE_ONCE(sk->sk_peek_off, val);
 705         mutex_unlock(&u->iolock);
 706
 707         return 0;
 708 }
 709
 710
 711 static const struct proto_ops unix_stream_ops = {
 712         .family =       PF_UNIX,
 713         .owner =        THIS_MODULE,
 714         .release =      unix_release,
 715         .bind =         unix_bind,
 716         .connect =      unix_stream_connect,
 717         .socketpair =   unix_socketpair,
 718         .accept =       unix_accept,
 719         .getname =      unix_getname,
 720         .poll =         unix_poll,
 721         .ioctl =        unix_ioctl,
 722 #ifdef CONFIG_COMPAT
 723         .compat_ioctl = unix_compat_ioctl,
 724 #endif
 725         .listen =       unix_listen,
 726         .shutdown =     unix_shutdown,
 727         .setsockopt =   sock_no_setsockopt,
 728         .getsockopt =   sock_no_getsockopt,
 729         .sendmsg =      unix_stream_sendmsg,
 730         .recvmsg =      unix_stream_recvmsg,
 731         .mmap =         sock_no_mmap,
 732         .sendpage =     unix_stream_sendpage,
 733         .splice_read =  unix_stream_splice_read,
 734         .set_peek_off = unix_set_peek_off,
 735 };
 736
 737 static const struct proto_ops unix_dgram_ops = {
 738         .family =       PF_UNIX,
 739         .owner =        THIS_MODULE,
 740         .release =      unix_release,
 741         .bind =         unix_bind,
 742         .connect =      unix_dgram_connect,
 743         .socketpair =   unix_socketpair,
 744         .accept =       sock_no_accept,
 745         .getname =      unix_getname,
 746         .poll =         unix_dgram_poll,
 747         .ioctl =        unix_ioctl,
 748 #ifdef CONFIG_COMPAT
 749         .compat_ioctl = unix_compat_ioctl,
 750 #endif
 751         .listen =       sock_no_listen,
 752         .shutdown =     unix_shutdown,
 753         .setsockopt =   sock_no_setsockopt,
 754         .getsockopt =   sock_no_getsockopt,
 755         .sendmsg =      unix_dgram_sendmsg,
 756         .recvmsg =      unix_dgram_recvmsg,
 757         .mmap =         sock_no_mmap,
 758         .sendpage =     sock_no_sendpage,
 759         .set_peek_off = unix_set_peek_off,
 760 };
 761
 762 static const struct proto_ops unix_seqpacket_ops = {
 763         .family =       PF_UNIX,
 764         .owner =        THIS_MODULE,
 765         .release =      unix_release,
 766         .bind =         unix_bind,
 767         .connect =      unix_stream_connect,
 768         .socketpair =   unix_socketpair,
 769         .accept =       unix_accept,
 770         .getname =      unix_getname,
 771         .poll =         unix_dgram_poll,
 772         .ioctl =        unix_ioctl,
 773 #ifdef CONFIG_COMPAT
 774         .compat_ioctl = unix_compat_ioctl,
 775 #endif
 776         .listen =       unix_listen,
 777         .shutdown =     unix_shutdown,
 778         .setsockopt =   sock_no_setsockopt,
 779         .getsockopt =   sock_no_getsockopt,
 780         .sendmsg =      unix_seqpacket_sendmsg,
 781         .recvmsg =      unix_seqpacket_recvmsg,
 782         .mmap =         sock_no_mmap,
 783         .sendpage =     sock_no_sendpage,
 784         .set_peek_off = unix_set_peek_off,
 785 };
 786
 787 static struct proto unix_proto = {
 788         .name                   = "UNIX",
 789         .owner                  = THIS_MODULE,
 790         .obj_size               = sizeof(struct unix_sock),
 791 };
 792
 793 /*
 794  * AF_UNIX sockets do not interact with hardware, hence they
 795  * dont trigger interrupts - so it's safe for them to have
 796  * bh-unsafe locking for their sk_receive_queue.lock. Split off
 797  * this special lock-class by reinitializing the spinlock key:
 798  */
 799 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 800
 801 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 802 {
 803         struct sock *sk = NULL;
 804         struct unix_sock *u;
 805
 806         atomic_long_inc(&unix_nr_socks);
 807         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 808                 goto out;
 809
 810         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
 811         if (!sk)
 812                 goto out;
 813
 814         sock_init_data(sock, sk);
 815         lockdep_set_class(&sk->sk_receive_queue.lock,
 816                                 &af_unix_sk_receive_queue_lock_key);
 817
 818         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 819         sk->sk_write_space      = unix_write_space;
 820         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 821         sk->sk_destruct         = unix_sock_destructor;
 822         u         = unix_sk(sk);
 823         u->path.dentry = NULL;
 824         u->path.mnt = NULL;
 825         spin_lock_init(&u->lock);
 826         atomic_long_set(&u->inflight, 0);
 827         INIT_LIST_HEAD(&u->link);
 828         mutex_init(&u->iolock); /* single task reading lock */
 829         mutex_init(&u->bindlock); /* single task binding lock */
 830         init_waitqueue_head(&u->peer_wait);
 831         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 832         unix_insert_socket(unix_sockets_unbound(sk), sk);
 833 out:
 834         if (sk == NULL)
 835                 atomic_long_dec(&unix_nr_socks);
 836         else {
 837                 local_bh_disable();
 838                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 839                 local_bh_enable();
 840         }
 841         return sk;
 842 }
 843
 844 static int unix_create(struct net *net, struct socket *sock, int protocol,
 845                        int kern)
 846 {
 847         if (protocol && protocol != PF_UNIX)
 848                 return -EPROTONOSUPPORT;
 849
 850         sock->state = SS_UNCONNECTED;
 851
 852         switch (sock->type) {
 853         case SOCK_STREAM:
 854                 sock->ops = &unix_stream_ops;
 855                 break;
 856                 /*
 857                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 858                  *      nothing uses it.
 859                  */
 860         case SOCK_RAW:
 861                 sock->type = SOCK_DGRAM;
 862         case SOCK_DGRAM:
 863                 sock->ops = &unix_dgram_ops;
 864                 break;
 865         case SOCK_SEQPACKET:
 866                 sock->ops = &unix_seqpacket_ops;
 867                 break;
 868         default:
 869                 return -ESOCKTNOSUPPORT;
 870         }
 871
 872         return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
 873 }
 874
 875 static int unix_release(struct socket *sock)
 876 {
 877         struct sock *sk = sock->sk;
 878
 879         if (!sk)
 880                 return 0;
 881
 882         unix_release_sock(sk, 0);
 883         sock->sk = NULL;
 884
 885         return 0;
 886 }
 887
 888 static int unix_autobind(struct socket *sock)
 889 {
 890         struct sock *sk = sock->sk;
 891         struct net *net = sock_net(sk);
 892         struct unix_sock *u = unix_sk(sk);
 893         static u32 ordernum = 1;
 894         struct unix_address *addr;
 895         int err;
 896         unsigned int retries = 0;
 897
 898         err = mutex_lock_interruptible(&u->bindlock);
 899         if (err)
 900                 return err;
 901
 902         err = 0;
 903         if (u->addr)
 904                 goto out;
 905
 906         err = -ENOMEM;
 907         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 908         if (!addr)
 909                 goto out;
 910
 911         addr->name->sun_family = AF_UNIX;
 912         refcount_set(&addr->refcnt, 1);
 913
 914 retry:
 915         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 916         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 917
 918         spin_lock(&unix_table_lock);
 919         ordernum = (ordernum+1)&0xFFFFF;
 920
 921         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 922                                       addr->hash)) {
 923                 spin_unlock(&unix_table_lock);
 924                 /*
 925                  * __unix_find_socket_byname() may take long time if many names
 926                  * are already in use.
 927                  */
 928                 cond_resched();
 929                 /* Give up if all names seems to be in use. */
 930                 if (retries++ == 0xFFFFF) {
 931                         err = -ENOSPC;
 932                         kfree(addr);
 933                         goto out;
 934                 }
 935                 goto retry;
 936         }
 937         addr->hash ^= sk->sk_type;
 938
 939         __unix_remove_socket(sk);
 940         smp_store_release(&u->addr, addr);
 941         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 942         spin_unlock(&unix_table_lock);
 943         err = 0;
 944
 945 out:    mutex_unlock(&u->bindlock);
 946         return err;
 947 }
 948
 949 static struct sock *unix_find_other(struct net *net,
 950                                     struct sockaddr_un *sunname, int len,
 951                                     int type, unsigned int hash, int *error)
 952 {
 953         struct sock *u;
 954         struct path path;
 955         int err = 0;
 956
 957         if (sunname->sun_path[0]) {
 958                 struct inode *inode;
 959                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 960                 if (err)
 961                         goto fail;
 962                 inode = d_backing_inode(path.dentry);
 963                 err = inode_permission(inode, MAY_WRITE);
 964                 if (err)
 965                         goto put_fail;
 966
 967                 err = -ECONNREFUSED;
 968                 if (!S_ISSOCK(inode->i_mode))
 969                         goto put_fail;
 970                 u = unix_find_socket_byinode(inode);
 971                 if (!u)
 972                         goto put_fail;
 973
 974                 if (u->sk_type == type)
 975                         touch_atime(&path);
 976
 977                 path_put(&path);
 978
 979                 err = -EPROTOTYPE;
 980                 if (u->sk_type != type) {
 981                         sock_put(u);
 982                         goto fail;
 983                 }
 984         } else {
 985                 err = -ECONNREFUSED;
 986                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 987                 if (u) {
 988                         struct dentry *dentry;
 989                         dentry = unix_sk(u)->path.dentry;
 990                         if (dentry)
 991                                 touch_atime(&unix_sk(u)->path);
 992                 } else
 993                         goto fail;
 994         }
 995         return u;
 996
 997 put_fail:
 998         path_put(&path);
 999 fail:
1000         *error = err;
1001         return NULL;
1002 }
1003
1004 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
1005 {
1006         struct dentry *dentry;
1007         struct path path;
1008         int err = 0;
1009         /*
1010          * Get the parent directory, calculate the hash for last
1011          * component.
1012          */
1013         dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
1014         err = PTR_ERR(dentry);
1015         if (IS_ERR(dentry))
1016                 return err;
1017
1018         /*
1019          * All right, let's create it.
1020          */
1021         err = security_path_mknod(&path, dentry, mode, 0);
1022         if (!err) {
1023                 err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
1024                 if (!err) {
1025                         res->mnt = mntget(path.mnt);
1026                         res->dentry = dget(dentry);
1027                 }
1028         }
1029         done_path_create(&path, dentry);
1030         return err;
1031 }
1032
/*
 * Bind handler for AF_UNIX sockets.
 *
 * @sock:     socket to bind
 * @uaddr:    user supplied address, interpreted as struct sockaddr_un
 * @addr_len: length of @uaddr
 *
 * An address consisting of the family only (addr_len == sizeof(short))
 * is delegated to unix_autobind().  A name whose first path byte is
 * non-zero is a filesystem name: the node is created with unix_mknod()
 * and the sock is hashed by inode number.  Otherwise the name is
 * abstract and is hashed by name after checking that it is not in use.
 *
 * Returns 0 on success, -EINVAL for a malformed address or a socket that
 * already has an address, -EADDRINUSE if the name is taken, -ENOMEM on
 * allocation failure, or an error propagated from unix_mkname(),
 * unix_mknod() or mutex_lock_interruptible().
 */
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct unix_sock *u = unix_sk(sk);
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        char *sun_path = sunaddr->sun_path;
        int err;
        unsigned int hash;
        struct unix_address *addr;
        struct hlist_head *list;
        struct path path = { };

        err = -EINVAL;
        if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
            sunaddr->sun_family != AF_UNIX)
                goto out;

        /* Family only, no name: pick an abstract name automatically. */
        if (addr_len == sizeof(short)) {
                err = unix_autobind(sock);
                goto out;
        }

        /* On success unix_mkname() returns the length to use from here on. */
        err = unix_mkname(sunaddr, addr_len, &hash);
        if (err < 0)
                goto out;
        addr_len = err;

        /*
         * Filesystem name: create the node before bindlock is taken.
         * NOTE(review): when a later step fails (socket already bound,
         * kmalloc failure, interrupted lock) only the path reference is
         * dropped below; nothing in this function removes the node that
         * was just created - confirm whether that is intended.
         */
        if (sun_path[0]) {
                umode_t mode = S_IFSOCK |
                       (SOCK_INODE(sock)->i_mode & ~current_umask());
                err = unix_mknod(sun_path, mode, &path);
                if (err) {
                        /* An existing node means the address is taken. */
                        if (err == -EEXIST)
                                err = -EADDRINUSE;
                        goto out;
                }
        }

        err = mutex_lock_interruptible(&u->bindlock);
        if (err)
                goto out_put;

        /* A socket can only be bound once. */
        err = -EINVAL;
        if (u->addr)
                goto out_up;

        err = -ENOMEM;
        addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
        if (!addr)
                goto out_up;

        memcpy(addr->name, sunaddr, addr_len);
        addr->len = addr_len;
        addr->hash = hash ^ sk->sk_type;
        refcount_set(&addr->refcnt, 1);

        if (sun_path[0]) {
                /*
                 * Filesystem sockets are chained by inode number;
                 * addr->hash is set to UNIX_HASH_SIZE, which no name
                 * hash chosen in the abstract branch below uses.
                 */
                addr->hash = UNIX_HASH_SIZE;
                hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
                spin_lock(&unix_table_lock);
                u->path = path;
                list = &unix_socket_table[hash];
        } else {
                spin_lock(&unix_table_lock);
                err = -EADDRINUSE;
                if (__unix_find_socket_byname(net, sunaddr, addr_len,
                                              sk->sk_type, hash)) {
                        unix_release_addr(addr);
                        goto out_unlock;
                }

                list = &unix_socket_table[addr->hash];
        }

        /*
         * Publish the address and rehash the sock while unix_table_lock is
         * held.  The release store pairs with smp_load_acquire() readers of
         * u->addr such as unix_getname().
         */
        err = 0;
        __unix_remove_socket(sk);
        smp_store_release(&u->addr, addr);
        __unix_insert_socket(list, sk);

out_unlock:
        spin_unlock(&unix_table_lock);
out_up:
        mutex_unlock(&u->bindlock);
out_put:
        /* On failure drop the references unix_mknod() took (no-op for an empty path). */
        if (err)
                path_put(&path);
out:
        return err;
}
1123
/*
 * Take the unix state lock of two socks.
 *
 * When @sk2 is NULL or the same sock as @sk1 only @sk1 is locked.
 * Otherwise both are locked, lower address first, with the second lock
 * taken through the _nested variant.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
        struct sock *first, *second;

        if (!sk2 || unlikely(sk1 == sk2)) {
                unix_state_lock(sk1);
                return;
        }

        /* Fixed address order regardless of argument order. */
        first = sk1 < sk2 ? sk1 : sk2;
        second = sk1 < sk2 ? sk2 : sk1;

        unix_state_lock(first);
        unix_state_lock_nested(second);
}
1138
/*
 * Counterpart of unix_state_double_lock(): @sk1 is always unlocked,
 * @sk2 only when it is a distinct, non-NULL sock.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
        unix_state_unlock(sk1);

        if (!sk2 || unlikely(sk1 == sk2))
                return;

        unix_state_unlock(sk2);
}
1148
/*
 * Connect handler for datagram sockets: set, replace or clear the peer.
 *
 * @sock:  socket being connected
 * @addr:  destination address; a family of AF_UNSPEC clears the peer
 * @alen:  length of @addr
 * @flags: not used by this function
 *
 * Returns 0 on success, -EINVAL for a too-short address, -EPERM if
 * unix_may_send() refuses the peer, or an error propagated from
 * unix_mkname(), unix_autobind(), unix_find_other() or
 * security_unix_may_send().
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
                              int alen, int flags)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
        struct sock *other;
        unsigned int hash;
        int err;

        err = -EINVAL;
        if (alen < offsetofend(struct sockaddr, sa_family))
                goto out;

        if (addr->sa_family != AF_UNSPEC) {
                err = unix_mkname(sunaddr, alen, &hash);
                if (err < 0)
                        goto out;
                alen = err;

                /* With SOCK_PASSCRED set, an unbound socket is autobound first. */
                if (test_bit(SOCK_PASSCRED, &sock->flags) &&
                    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
                        goto out;

restart:
                other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
                if (!other)
                        goto out;

                unix_state_double_lock(sk, other);

                /* Apparently VFS overslept socket death. Retry. */
                if (sock_flag(other, SOCK_DEAD)) {
                        unix_state_double_unlock(sk, other);
                        sock_put(other);
                        goto restart;
                }

                err = -EPERM;
                if (!unix_may_send(sk, other))
                        goto out_unlock;

                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
                if (err)
                        goto out_unlock;

        } else {
                /*
                 *      1003.1g breaking connected state with AF_UNSPEC
                 */
                other = NULL;
                /* With a NULL second argument only sk is locked. */
                unix_state_double_lock(sk, other);
        }

        /*
         * If it was connected, reconnect.
         */
        if (unix_peer(sk)) {
                struct sock *old_peer = unix_peer(sk);
                unix_peer(sk) = other;
                unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

                unix_state_double_unlock(sk, other);

                /* Disconnect handling runs after the state locks are dropped. */
                if (other != old_peer)
                        unix_dgram_disconnected(sk, old_peer);
                /* Drop the reference that was held on the previous peer. */
                sock_put(old_peer);
        } else {
                unix_peer(sk) = other;
                unix_state_double_unlock(sk, other);
        }
        return 0;

out_unlock:
        unix_state_double_unlock(sk, other);
        sock_put(other);
out:
        return err;
}
1228
1229 static long unix_wait_for_peer(struct sock *other, long timeo)
1230 {
1231         struct unix_sock *u = unix_sk(other);
1232         int sched;
1233         DEFINE_WAIT(wait);
1234
1235         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1236
1237         sched = !sock_flag(other, SOCK_DEAD) &&
1238                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1239                 unix_recvq_full_lockless(other);
1240
1241         unix_state_unlock(other);
1242
1243         if (sched)
1244                 timeo = schedule_timeout(timeo);
1245
1246         finish_wait(&u->peer_wait, &wait);
1247         return timeo;
1248 }
1249
/*
 * Connect handler for stream and seqpacket sockets.
 *
 * @sock:     connecting socket
 * @uaddr:    address of the listening socket
 * @addr_len: length of @uaddr
 * @flags:    O_NONBLOCK selects a zero send timeout via sock_sndtimeo()
 *
 * A new sock (newsk) for the far end of the connection and a one-byte skb
 * owned by it are allocated up front.  Once the listener has been found
 * and both state locks are held, newsk is linked to sk as its peer, sk is
 * marked established and the skb is queued on the listener's receive
 * queue, from where unix_accept() picks newsk up through skb->sk.
 *
 * Returns 0 on success.  Errors set in this function are -ENOMEM,
 * -ECONNREFUSED, -EAGAIN, -EISCONN and -EINVAL; errors from
 * unix_mkname(), unix_autobind(), unix_find_other(), sock_intr_errno()
 * and security_unix_stream_connect() are passed through.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                               int addr_len, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct unix_sock *u = unix_sk(sk), *newu, *otheru;
        struct sock *newsk = NULL;
        struct sock *other = NULL;
        struct sk_buff *skb = NULL;
        unsigned int hash;
        int st;
        int err;
        long timeo;

        err = unix_mkname(sunaddr, addr_len, &hash);
        if (err < 0)
                goto out;
        addr_len = err;

        /* With SOCK_PASSCRED set, an unbound socket is autobound first. */
        if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
            (err = unix_autobind(sock)) != 0)
                goto out;

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /* First of all allocate resources.
           If we will make it after state is locked,
           we will have to recheck all again in any case.
         */

        err = -ENOMEM;

        /* create new sock for complete connection */
        newsk = unix_create1(sock_net(sk), NULL, 0);
        if (newsk == NULL)
                goto out;

        /* Allocate skb for sending to listening sock */
        skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
        if (skb == NULL)
                goto out;

restart:
        /*  Find listening sock. */
        other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
        if (!other)
                goto out;

        /* Latch state of peer */
        unix_state_lock(other);

        /* Apparently VFS overslept socket death. Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = -ECONNREFUSED;
        if (other->sk_state != TCP_LISTEN)
                goto out_unlock;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (unix_recvq_full(other)) {
                err = -EAGAIN;
                if (!timeo)
                        goto out_unlock;

                /* unix_wait_for_peer() releases other's state lock. */
                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
                sock_put(other);
                goto restart;
        }

        /* Latch our state.

           It is tricky place. We need to grab our state lock and cannot
           drop lock on peer. It is dangerous because deadlock is
           possible. Connect to self case and simultaneous
           attempt to connect are eliminated by checking socket
           state. other is TCP_LISTEN, if sk is TCP_LISTEN we
           check this before attempt to grab lock.

           Well, and we have to recheck the state after socket locked.
         */
        st = sk->sk_state;

        switch (st) {
        case TCP_CLOSE:
                /* This is ok... continue with connect */
                break;
        case TCP_ESTABLISHED:
                /* Socket is already connected */
                err = -EISCONN;
                goto out_unlock;
        default:
                err = -EINVAL;
                goto out_unlock;
        }

        unix_state_lock_nested(sk);

        /* State changed between the unlocked read and taking the lock: retry. */
        if (sk->sk_state != st) {
                unix_state_unlock(sk);
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = security_unix_stream_connect(sk, other, newsk);
        if (err) {
                unix_state_unlock(sk);
                goto out_unlock;
        }

        /* The way is open! Quickly set all the necessary fields... */

        /* newsk's peer pointer holds a reference on sk. */
        sock_hold(sk);
        unix_peer(newsk)        = sk;
        newsk->sk_state         = TCP_ESTABLISHED;
        newsk->sk_type          = sk->sk_type;
        init_peercred(newsk);
        newu = unix_sk(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
        otheru = unix_sk(other);

        /* copy address information from listening to new sock
         *
         * The contents of *(otheru->addr) and otheru->path
         * are seen fully set up here, since we have found
         * otheru in hash under unix_table_lock.  Insertion
         * into the hash chain we'd found it in had been done
         * in an earlier critical area protected by unix_table_lock,
         * the same one where we'd set *(otheru->addr) contents,
         * as well as otheru->path and otheru->addr itself.
         *
         * Using smp_store_release() here to set newu->addr
         * is enough to make those stores, as well as stores
         * to newu->path visible to anyone who gets newu->addr
         * by smp_load_acquire().  IOW, the same warranties
         * as for unix_sock instances bound in unix_bind() or
         * in unix_autobind().
         */
        if (otheru->path.dentry) {
                path_get(&otheru->path);
                newu->path = otheru->path;
        }
        refcount_inc(&otheru->addr->refcnt);
        smp_store_release(&newu->addr, otheru->addr);

        /* Set credentials */
        copy_peercred(sk, other);

        sock->state     = SS_CONNECTED;
        sk->sk_state    = TCP_ESTABLISHED;
        /* sk's peer pointer holds a reference on newsk. */
        sock_hold(newsk);

        smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
        unix_peer(sk)   = newsk;

        unix_state_unlock(sk);

        /* take ten and send info to listening sock */
        spin_lock(&other->sk_receive_queue.lock);
        __skb_queue_tail(&other->sk_receive_queue, skb);
        spin_unlock(&other->sk_receive_queue.lock);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        return 0;

out_unlock:
        if (other)
                unix_state_unlock(other);

out:
        /* Common cleanup: free the preallocated skb and sock, drop the listener ref. */
        kfree_skb(skb);
        if (newsk)
                unix_release_sock(newsk, 0);
        if (other)
                sock_put(other);
        return err;
}
1438
1439 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1440 {
1441         struct sock *ska = socka->sk, *skb = sockb->sk;
1442
1443         /* Join our sockets back to back */
1444         sock_hold(ska);
1445         sock_hold(skb);
1446         unix_peer(ska) = skb;
1447         unix_peer(skb) = ska;
1448         init_peercred(ska);
1449         init_peercred(skb);
1450
1451         if (ska->sk_type != SOCK_DGRAM) {
1452                 ska->sk_state = TCP_ESTABLISHED;
1453                 skb->sk_state = TCP_ESTABLISHED;
1454                 socka->state  = SS_CONNECTED;
1455                 sockb->state  = SS_CONNECTED;
1456         }
1457         return 0;
1458 }
1459
1460 static void unix_sock_inherit_flags(const struct socket *old,
1461                                     struct socket *new)
1462 {
1463         if (test_bit(SOCK_PASSCRED, &old->flags))
1464                 set_bit(SOCK_PASSCRED, &new->flags);
1465         if (test_bit(SOCK_PASSSEC, &old->flags))
1466                 set_bit(SOCK_PASSSEC, &new->flags);
1467 }
1468
1469 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1470                        bool kern)
1471 {
1472         struct sock *sk = sock->sk;
1473         struct sock *tsk;
1474         struct sk_buff *skb;
1475         int err;
1476
1477         err = -EOPNOTSUPP;
1478         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1479                 goto out;
1480
1481         err = -EINVAL;
1482         if (sk->sk_state != TCP_LISTEN)
1483                 goto out;
1484
1485         /* If socket state is TCP_LISTEN it cannot change (for now...),
1486          * so that no locks are necessary.
1487          */
1488
1489         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1490         if (!skb) {
1491                 /* This means receive shutdown. */
1492                 if (err == 0)
1493                         err = -EINVAL;
1494                 goto out;
1495         }
1496
1497         tsk = skb->sk;
1498         skb_free_datagram(sk, skb);
1499         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1500
1501         /* attach accepted sock to socket */
1502         unix_state_lock(tsk);
1503         newsock->state = SS_CONNECTED;
1504         unix_sock_inherit_flags(sock, newsock);
1505         sock_graft(tsk, newsock);
1506         unix_state_unlock(tsk);
1507         return 0;
1508
1509 out:
1510         return err;
1511 }
1512
1513
1514 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1515 {
1516         struct sock *sk = sock->sk;
1517         struct unix_address *addr;
1518         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1519         int err = 0;
1520
1521         if (peer) {
1522                 sk = unix_peer_get(sk);
1523
1524                 err = -ENOTCONN;
1525                 if (!sk)
1526                         goto out;
1527                 err = 0;
1528         } else {
1529                 sock_hold(sk);
1530         }
1531
1532         addr = smp_load_acquire(&unix_sk(sk)->addr);
1533         if (!addr) {
1534                 sunaddr->sun_family = AF_UNIX;
1535                 sunaddr->sun_path[0] = 0;
1536                 *uaddr_len = sizeof(short);
1537         } else {
1538                 *uaddr_len = addr->len;
1539                 memcpy(sunaddr, addr->name, *uaddr_len);
1540         }
1541         sock_put(sk);
1542 out:
1543         return err;
1544 }
1545
/*
 * Duplicate the file list attached to @skb into @scm for a peeking
 * receiver, then synchronise with the garbage collector by taking and
 * immediately releasing unix_gc_lock (see the comment in the body).
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
        scm->fp = scm_fp_dup(UNIXCB(skb).fp);

        /*
         * Garbage collection of unix sockets starts by selecting a set of
         * candidate sockets which have reference only from being in flight
         * (total_refs == inflight_refs).  This condition is checked once during
         * the candidate collection phase, and candidates are marked as such, so
         * that non-candidates can later be ignored.  While inflight_refs is
         * protected by unix_gc_lock, total_refs (file count) is not, hence this
         * is an instantaneous decision.
         *
         * Once a candidate, however, the socket must not be reinstalled into a
         * file descriptor while the garbage collection is in progress.
         *
         * If the above conditions are met, then the directed graph of
         * candidates (*) does not change while unix_gc_lock is held.
         *
         * Any operation that changes the file count through file descriptors
         * (dup, close, sendmsg) does not change the graph since candidates are
         * not installed in fds.
         *
         * Dequeuing a candidate via recvmsg would install it into an fd, but
         * that takes unix_gc_lock to decrement the inflight count, so it's
         * serialized with garbage collection.
         *
         * MSG_PEEK is special in that it does not change the inflight count,
         * yet does install the socket into an fd.  The following lock/unlock
         * pair is to ensure serialization with garbage collection.  It must be
         * done between incrementing the file count and installing the file into
         * an fd.
         *
         * If garbage collection starts after the barrier provided by the
         * lock/unlock, then it will see the elevated refcount and not mark this
         * as a candidate.  If a garbage collection is already in progress
         * before the file count was incremented, then the lock/unlock pair will
         * ensure that garbage collection is finished before progressing to
         * installing the fd.
         *
         * (*) A -> B where B is on the queue of A or B is on the queue of C
         * which is on the queue of listening socket A.
         */
        spin_lock(&unix_gc_lock);
        spin_unlock(&unix_gc_lock);
}
1592
1593 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1594 {
1595         int err = 0;
1596
1597         UNIXCB(skb).pid  = get_pid(scm->pid);
1598         UNIXCB(skb).uid = scm->creds.uid;
1599         UNIXCB(skb).gid = scm->creds.gid;
1600         UNIXCB(skb).fp = NULL;
1601         unix_get_secdata(scm, skb);
1602         if (scm->fp && send_fds)
1603                 err = unix_attach_fds(scm, skb);
1604
1605         skb->destructor = unix_destruct_scm;
1606         return err;
1607 }
1608
1609 static bool unix_passcred_enabled(const struct socket *sock,
1610                                   const struct sock *other)
1611 {
1612         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1613                !other->sk_socket ||
1614                test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1615 }
1616
1617 /*
1618  * Some apps rely on write() giving SCM_CREDENTIALS
1619  * We include credentials if source or destination socket
1620  * asserted SOCK_PASSCRED.
1621  */
1622 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1623                             const struct sock *other)
1624 {
1625         if (UNIXCB(skb).pid)
1626                 return;
1627         if (unix_passcred_enabled(sock, other)) {
1628                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1629                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1630         }
1631 }
1632
1633 static int maybe_init_creds(struct scm_cookie *scm,
1634                             struct socket *socket,
1635                             const struct sock *other)
1636 {
1637         int err;
1638         struct msghdr msg = { .msg_controllen = 0 };
1639
1640         err = scm_send(socket, &msg, scm, false);
1641         if (err)
1642                 return err;
1643
1644         if (unix_passcred_enabled(socket, other)) {
1645                 scm->pid = get_pid(task_tgid(current));
1646                 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1647         }
1648         return err;
1649 }
1650
1651 static bool unix_skb_scm_eq(struct sk_buff *skb,
1652                             struct scm_cookie *scm)
1653 {
1654         const struct unix_skb_parms *u = &UNIXCB(skb);
1655
1656         return u->pid == scm->pid &&
1657                uid_eq(u->uid, scm->creds.uid) &&
1658                gid_eq(u->gid, scm->creds.gid) &&
1659                unix_secdata_eq(scm, skb);
1660 }
1661
1662 /*
1663  *      Send AF_UNIX data.
      *
      *      Sends @len bytes from @msg as one skb.  The receiver is looked up
      *      from @msg->msg_name when msg_namelen is non-zero; otherwise the
      *      connected peer of @sock is used (-ENOTCONN if there is none).
      *
      *      Returns @len on success, and also when sk_filter() rejects the
      *      skb (the packet is dropped without reporting an error).
      *      Otherwise returns a negative errno, e.g. -EOPNOTSUPP for MSG_OOB,
      *      -EMSGSIZE when @len exceeds sk_sndbuf - 32, -EPERM when
      *      unix_may_send() refuses, -ECONNREFUSED when the connected peer
      *      turned out to be dead, -EPIPE when the receiver has RCV_SHUTDOWN
      *      set, -EAGAIN on the non-blocking queue-full path, or an error
      *      propagated from one of the helpers called below.
1664  */
1665
1666 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1667                               size_t len)
1668 {
1669         struct sock *sk = sock->sk;
1670         struct net *net = sock_net(sk);
1671         struct unix_sock *u = unix_sk(sk);
1672         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1673         struct sock *other = NULL;
1674         int namelen = 0; /* fake GCC */
1675         int err;
1676         unsigned int hash;
1677         struct sk_buff *skb;
1678         long timeo;
1679         struct scm_cookie scm;
1680         int data_len = 0;
1681         int sk_locked;
1682
1683         wait_for_unix_gc();
1684         err = scm_send(sock, msg, &scm, false);
1685         if (err < 0)
1686                 return err;
1687
             /* From here on every exit must go through 'out' so scm is destroyed. */
1688         err = -EOPNOTSUPP;
1689         if (msg->msg_flags&MSG_OOB)
1690                 goto out;
1691
1692         if (msg->msg_namelen) {
                     /* Explicit destination: validate it now, resolve it at 'restart'. */
1693                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1694                 if (err < 0)
1695                         goto out;
1696                 namelen = err;
1697         } else {
                     /* No destination given: use the connected peer. */
1698                 sunaddr = NULL;
1699                 err = -ENOTCONN;
1700                 other = unix_peer_get(sk);
1701                 if (!other)
1702                         goto out;
1703         }
1704
             /* With SOCK_PASSCRED set, an unbound sender is autobound first. */
1705         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1706             && (err = unix_autobind(sock)) != 0)
1707                 goto out;
1708
1709         err = -EMSGSIZE;
1710         if (len > sk->sk_sndbuf - 32)
1711                 goto out;
1712
             /*
              * Only the part above SKB_MAX_ALLOC is requested as paged data;
              * it is capped at MAX_SKB_FRAGS pages and rounded up to a page.
              */
1713         if (len > SKB_MAX_ALLOC) {
1714                 data_len = min_t(size_t,
1715                                  len - SKB_MAX_ALLOC,
1716                                  MAX_SKB_FRAGS * PAGE_SIZE);
1717                 data_len = PAGE_ALIGN(data_len);
1718
1719                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1720         }
1721
1722         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1723                                    msg->msg_flags & MSG_DONTWAIT, &err,
1724                                    PAGE_ALLOC_COSTLY_ORDER);
1725         if (skb == NULL)
1726                 goto out;
1727
1728         err = unix_scm_to_skb(&scm, skb, true);
1729         if (err < 0)
1730                 goto out_free;
1731
1732         skb_put(skb, len - data_len);
1733         skb->data_len = data_len;
1734         skb->len = len;
1735         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1736         if (err)
1737                 goto out_free;
1738
1739         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1740
1741 restart:
             /* Reached with no locks held; 'other' is NULL unless it is still valid. */
1742         if (!other) {
1743                 err = -ECONNRESET;
1744                 if (sunaddr == NULL)
1745                         goto out_free;
1746
1747                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1748                                         hash, &err);
1749                 if (other == NULL)
1750                         goto out_free;
1751         }
1752
1753         if (sk_filter(other, skb) < 0) {
1754                 /* Toss the packet but do not return any error to the sender */
1755                 err = len;
1756                 goto out_free;
1757         }
1758
             /* sk_locked records whether sk's state lock is held besides other's. */
1759         sk_locked = 0;
1760         unix_state_lock(other);
1761 restart_locked:
1762         err = -EPERM;
1763         if (!unix_may_send(sk, other))
1764                 goto out_unlock;
1765
1766         if (unlikely(sock_flag(other, SOCK_DEAD))) {
1767                 /*
1768                  *      Check with 1003.1g - what should
1769                  *      datagram error
1770                  */
1771                 unix_state_unlock(other);
1772                 sock_put(other);
1773
1774                 if (!sk_locked)
1775                         unix_state_lock(sk);
1776
                     /*
                      * If the dead socket was our connected peer, disconnect
                      * from it and fail; otherwise retry the lookup.
                      */
1777                 err = 0;
1778                 if (unix_peer(sk) == other) {
1779                         unix_peer(sk) = NULL;
1780                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1781
1782                         unix_state_unlock(sk);
1783
1784                         unix_dgram_disconnected(sk, other);
1785                         sock_put(other);
1786                         err = -ECONNREFUSED;
1787                 } else {
1788                         unix_state_unlock(sk);
1789                 }
1790
1791                 other = NULL;
1792                 if (err)
1793                         goto out_free;
1794                 goto restart;
1795         }
1796
1797         err = -EPIPE;
1798         if (other->sk_shutdown & RCV_SHUTDOWN)
1799                 goto out_unlock;
1800
1801         if (sk->sk_type != SOCK_SEQPACKET) {
1802                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1803                 if (err)
1804                         goto out_unlock;
1805         }
1806
1807         /* other == sk && unix_peer(other) != sk if
1808          * - unix_peer(sk) == NULL, destination address bound to sk
1809          * - unix_peer(sk) == sk by time of get but disconnected before lock
1810          */
1811         if (other != sk &&
1812             unlikely(unix_peer(other) != sk &&
1813             unix_recvq_full_lockless(other))) {
1814                 if (timeo) {
                             /*
                              * NOTE(review): unix_wait_for_peer() presumably drops
                              * other's state lock, since the paths below leave
                              * without unlocking it - confirm against its definition.
                              */
1815                         timeo = unix_wait_for_peer(other, timeo);
1816
1817                         err = sock_intr_errno(timeo);
1818                         if (signal_pending(current))
1819                                 goto out_free;
1820
1821                         goto restart;
1822                 }
1823
                     /* No timeout left: take both state locks before the checks below. */
1824                 if (!sk_locked) {
1825                         unix_state_unlock(other);
1826                         unix_state_double_lock(sk, other);
1827                 }
1828
1829                 if (unix_peer(sk) != other ||
1830                     unix_dgram_peer_wake_me(sk, other)) {
1831                         err = -EAGAIN;
1832                         sk_locked = 1;
1833                         goto out_unlock;
1834                 }
1835
                     /* Locks were dropped and retaken above, so redo the checks. */
1836                 if (!sk_locked) {
1837                         sk_locked = 1;
1838                         goto restart_locked;
1839                 }
1840         }
1841
1842         if (unlikely(sk_locked))
1843                 unix_state_unlock(sk);
1844
             /* Deliver: queue on the receiver and notify it after unlocking. */
1845         if (sock_flag(other, SOCK_RCVTSTAMP))
1846                 __net_timestamp(skb);
1847         maybe_add_creds(skb, sock, other);
1848         skb_queue_tail(&other->sk_receive_queue, skb);
1849         unix_state_unlock(other);
1850         other->sk_data_ready(other);
1851         sock_put(other);
1852         scm_destroy(&scm);
1853         return len;
1854
     /* Error exits, ordered so that each label undoes one more acquisition. */
1855 out_unlock:
1856         if (sk_locked)
1857                 unix_state_unlock(sk);
1858         unix_state_unlock(other);
1859 out_free:
1860         kfree_skb(skb);
1861 out:
1862         if (other)
1863                 sock_put(other);
1864         scm_destroy(&scm);
1865         return err;
1866 }
1867
1868 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1869  * bytes, and a minimum of a full page.
1870  */
1871 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1872
/*
 * Send @len bytes from @msg on a connected stream socket.
 *
 * The data is cut into skbs that are queued one by one on the peer's
 * receive queue.  File descriptors carried in the control message are
 * attached to the first skb only.
 *
 * Returns the number of bytes queued if that is non-zero, otherwise a
 * negative errno: -EOPNOTSUPP for MSG_OOB, -EISCONN or -EOPNOTSUPP when a
 * destination address is supplied, -ENOTCONN without a peer, -EPIPE when
 * this side has SEND_SHUTDOWN set or the peer is dead or has RCV_SHUTDOWN
 * set.  On the -EPIPE paths SIGPIPE is sent to the caller if nothing was
 * sent and MSG_NOSIGNAL is not set.
 */
1873 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1874                                size_t len)
1875 {
1876         struct sock *sk = sock->sk;
1877         struct sock *other = NULL;
1878         int err, size;
1879         struct sk_buff *skb;
1880         int sent = 0;
1881         struct scm_cookie scm;
1882         bool fds_sent = false;
1883         int data_len;
1884
1885         wait_for_unix_gc();
1886         err = scm_send(sock, msg, &scm, false);
1887         if (err < 0)
1888                 return err;
1889
1890         err = -EOPNOTSUPP;
1891         if (msg->msg_flags&MSG_OOB)
1892                 goto out_err;
1893
             /* A destination address is always rejected on this path. */
1894         if (msg->msg_namelen) {
1895                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1896                 goto out_err;
1897         } else {
1898                 err = -ENOTCONN;
1899                 other = unix_peer(sk);
1900                 if (!other)
1901                         goto out_err;
1902         }
1903
             /* err is still -ENOTCONN here; pipe_err overwrites it with -EPIPE. */
1904         if (sk->sk_shutdown & SEND_SHUTDOWN)
1905                 goto pipe_err;
1906
1907         while (sent < len) {
1908                 size = len - sent;
1909
1910                 /* Keep two messages in the pipe so it schedules better */
1911                 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1912
1913                 /* allow fallback to order-0 allocations */
1914                 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1915
                     /*
                      * data_len is the part of this chunk beyond SKB_MAX_HEAD(0),
                      * page aligned but never more than the chunk itself.
                      */
1916                 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1917
1918                 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1919
1920                 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1921                                            msg->msg_flags & MSG_DONTWAIT, &err,
1922                                            get_order(UNIX_SKB_FRAGS_SZ));
1923                 if (!skb)
1924                         goto out_err;
1925
1926                 /* Only send the fds in the first buffer */
1927                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
1928                 if (err < 0) {
1929                         kfree_skb(skb);
1930                         goto out_err;
1931                 }
1932                 fds_sent = true;
1933
1934                 skb_put(skb, size - data_len);
1935                 skb->data_len = data_len;
1936                 skb->len = size;
1937                 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1938                 if (err) {
1939                         kfree_skb(skb);
1940                         goto out_err;
1941                 }
1942
                     /* The peer's state is checked and the skb queued under its lock. */
1943                 unix_state_lock(other);
1944
1945                 if (sock_flag(other, SOCK_DEAD) ||
1946                     (other->sk_shutdown & RCV_SHUTDOWN))
1947                         goto pipe_err_free;
1948
1949                 maybe_add_creds(skb, sock, other);
1950                 skb_queue_tail(&other->sk_receive_queue, skb);
1951                 unix_state_unlock(other);
1952                 other->sk_data_ready(other);
1953                 sent += size;
1954         }
1955
1956         scm_destroy(&scm);
1957
1958         return sent;
1959
1960 pipe_err_free:
1961         unix_state_unlock(other);
1962         kfree_skb(skb);
1963 pipe_err:
1964         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1965                 send_sig(SIGPIPE, current, 0);
1966         err = -EPIPE;
1967 out_err:
1968         scm_destroy(&scm);
             /* Bytes already queued take precedence over the error code. */
1969         return sent ? : err;
1970 }
1971
/*
 * Append @size bytes of @page, starting at @offset, to the peer's receive
 * queue of a connected stream socket.
 *
 * The page is added as a fragment to the skb at the tail of the peer's
 * queue when that skb carries the same credentials as this sender
 * (unix_skb_scm_eq()) and skb_append_pagefrags() succeeds.  Otherwise a
 * fresh zero-length skb is allocated, the page is appended to it, and it
 * is queued.
 *
 * Returns @size on success or a negative errno: -EOPNOTSUPP for MSG_OOB,
 * -ENOTCONN without an established peer, -EAGAIN or -ERESTARTSYS when the
 * iolock wait is interrupted, -EPIPE (plus SIGPIPE unless MSG_NOSIGNAL)
 * when either side is shut down or the peer is dead, or an error from
 * sock_alloc_send_pskb() or maybe_init_creds().
 */
1972 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1973                                     int offset, size_t size, int flags)
1974 {
1975         int err;
1976         bool send_sigpipe = false;
1977         bool init_scm = true;
1978         struct scm_cookie scm;
1979         struct sock *other, *sk = socket->sk;
1980         struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1981
1982         if (flags & MSG_OOB)
1983                 return -EOPNOTSUPP;
1984
1985         other = unix_peer(sk);
1986         if (!other || sk->sk_state != TCP_ESTABLISHED)
1987                 return -ENOTCONN;
1988
             /*
              * This block is only entered through 'goto alloc_skb', which is
              * taken with the queue lock, other's state lock and the iolock
              * held.  All three are dropped before allocating, then execution
              * falls through to retake them below.
              */
1989         if (false) {
1990 alloc_skb:
1991                 spin_unlock(&other->sk_receive_queue.lock);
1992                 unix_state_unlock(other);
1993                 mutex_unlock(&unix_sk(other)->iolock);
1994                 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1995                                               &err, 0);
1996                 if (!newskb)
1997                         goto err;
1998         }
1999
2000         /* we must acquire iolock as we modify already present
2001          * skbs in the sk_receive_queue and mess with skb->len
2002          */
2003         err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2004         if (err) {
2005                 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2006                 goto err;
2007         }
2008
2009         if (sk->sk_shutdown & SEND_SHUTDOWN) {
2010                 err = -EPIPE;
2011                 send_sigpipe = true;
2012                 goto err_unlock;
2013         }
2014
2015         unix_state_lock(other);
2016
2017         if (sock_flag(other, SOCK_DEAD) ||
2018             other->sk_shutdown & RCV_SHUTDOWN) {
2019                 err = -EPIPE;
2020                 send_sigpipe = true;
2021                 goto err_state_unlock;
2022         }
2023
             /* scm is set up once, even if this code is re-entered via alloc_skb. */
2024         if (init_scm) {
2025                 err = maybe_init_creds(&scm, socket, other);
2026                 if (err)
2027                         goto err_state_unlock;
2028                 init_scm = false;
2029         }
2030
2031         spin_lock(&other->sk_receive_queue.lock);
2032         skb = skb_peek_tail(&other->sk_receive_queue);
2033         if (tail && tail == skb) {
                     /* Tail is still the skb rejected before allocating: use newskb. */
2034                 skb = newskb;
2035         } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
                     /* Empty queue, or the tail skb has different credentials. */
2036                 if (newskb) {
2037                         skb = newskb;
2038                 } else {
2039                         tail = skb;
2040                         goto alloc_skb;
2041                 }
2042         } else if (newskb) {
2043                 /* this is fast path, we don't necessarily need to
2044                  * call to kfree_skb even though with newskb == NULL
2045                  * this - does no harm
2046                  */
2047                 consume_skb(newskb);
2048                 newskb = NULL;
2049         }
2050
             /* A non-zero return means the page could not be appended to skb. */
2051         if (skb_append_pagefrags(skb, page, offset, size)) {
2052                 tail = skb;
2053                 goto alloc_skb;
2054         }
2055
             /* Account the new bytes on the skb and on the sender's sk_wmem_alloc. */
2056         skb->len += size;
2057         skb->data_len += size;
2058         skb->truesize += size;
2059         refcount_add(size, &sk->sk_wmem_alloc);
2060
2061         if (newskb) {
                     /* NOTE(review): the return value of unix_scm_to_skb() is not checked. */
2062                 unix_scm_to_skb(&scm, skb, false);
2063                 __skb_queue_tail(&other->sk_receive_queue, newskb);
2064         }
2065
2066         spin_unlock(&other->sk_receive_queue.lock);
2067         unix_state_unlock(other);
2068         mutex_unlock(&unix_sk(other)->iolock);
2069
2070         other->sk_data_ready(other);
2071         scm_destroy(&scm);
2072         return size;
2073
2074 err_state_unlock:
2075         unix_state_unlock(other);
2076 err_unlock:
2077         mutex_unlock(&unix_sk(other)->iolock);
2078 err:
2079         kfree_skb(newskb);
2080         if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2081                 send_sig(SIGPIPE, current, 0);
             /* scm only needs destroying once maybe_init_creds() has succeeded. */
2082         if (!init_scm)
2083                 scm_destroy(&scm);
2084         return err;
2085 }
2086
2087 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2088                                   size_t len)
2089 {
2090         int err;
2091         struct sock *sk = sock->sk;
2092
2093         err = sock_error(sk);
2094         if (err)
2095                 return err;
2096
2097         if (sk->sk_state != TCP_ESTABLISHED)
2098                 return -ENOTCONN;
2099
2100         if (msg->msg_namelen)
2101                 msg->msg_namelen = 0;
2102
2103         return unix_dgram_sendmsg(sock, msg, len);
2104 }
2105
2106 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2107                                   size_t size, int flags)
2108 {
2109         struct sock *sk = sock->sk;
2110
2111         if (sk->sk_state != TCP_ESTABLISHED)
2112                 return -ENOTCONN;
2113
2114         return unix_dgram_recvmsg(sock, msg, size, flags);
2115 }
2116
2117 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2118 {
2119         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2120
2121         if (addr) {
2122                 msg->msg_namelen = addr->len;
2123                 memcpy(msg->msg_name, addr->name, addr->len);
2124         }
2125 }
2126
/*
 * Receive one datagram from @sock into @msg, copying at most @size bytes.
 *
 * The queue is polled with __skb_try_recv_datagram() under u->iolock; while
 * that reports -EAGAIN and the receive timeout has not expired, the caller
 * sleeps in __skb_wait_for_more_packets().  When an skb is found the loop
 * is left with the iolock still held.
 *
 * If the datagram is longer than @size, MSG_TRUNC is set in msg->msg_flags.
 * Credentials and any passed file descriptors are delivered through
 * scm_recv(); with MSG_PEEK the descriptors are obtained via
 * unix_peek_fds() instead of being detached from the skb.
 *
 * Returns the number of bytes copied, or the datagram length past the peek
 * offset when MSG_TRUNC is set in @flags.  Returns 0 for a SOCK_SEQPACKET
 * socket with RCV_SHUTDOWN set when no data is available, -EOPNOTSUPP for
 * MSG_OOB, or another negative errno from the helpers.
 */
2127 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2128                               size_t size, int flags)
2129 {
2130         struct scm_cookie scm;
2131         struct sock *sk = sock->sk;
2132         struct unix_sock *u = unix_sk(sk);
2133         struct sk_buff *skb, *last;
2134         long timeo;
2135         int err;
2136         int peeked, skip;
2137
2138         err = -EOPNOTSUPP;
2139         if (flags&MSG_OOB)
2140                 goto out;
2141
2142         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2143
2144         do {
2145                 mutex_lock(&u->iolock);
2146
2147                 skip = sk_peek_offset(sk, flags);
2148                 skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
2149                                               &err, &last);
                     /* Success leaves the loop with iolock held. */
2150                 if (skb)
2151                         break;
2152
2153                 mutex_unlock(&u->iolock);
2154
2155                 if (err != -EAGAIN)
2156                         break;
2157         } while (timeo &&
2158                  !__skb_wait_for_more_packets(sk, &err, &timeo, last));
2159
2160         if (!skb) { /* implies iolock unlocked */
2161                 unix_state_lock(sk);
2162                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2163                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2164                     (sk->sk_shutdown & RCV_SHUTDOWN))
2165                         err = 0;
2166                 unix_state_unlock(sk);
2167                 goto out;
2168         }
2169
             /* A datagram is leaving the queue: wake tasks sleeping on peer_wait. */
2170         if (wq_has_sleeper(&u->peer_wait))
2171                 wake_up_interruptible_sync_poll(&u->peer_wait,
2172                                                 POLLOUT | POLLWRNORM |
2173                                                 POLLWRBAND);
2174
2175         if (msg->msg_name)
2176                 unix_copy_addr(msg, skb->sk);
2177
             /* Clamp to what is left after the peek offset; flag truncation. */
2178         if (size > skb->len - skip)
2179                 size = skb->len - skip;
2180         else if (size < skb->len - skip)
2181                 msg->msg_flags |= MSG_TRUNC;
2182
2183         err = skb_copy_datagram_msg(skb, skip, msg, size);
2184         if (err)
2185                 goto out_free;
2186
2187         if (sock_flag(sk, SOCK_RCVTSTAMP))
2188                 __sock_recv_timestamp(msg, sk, skb);
2189
2190         memset(&scm, 0, sizeof(scm));
2191
2192         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2193         unix_set_secdata(&scm, skb);
2194
2195         if (!(flags & MSG_PEEK)) {
2196                 if (UNIXCB(skb).fp)
2197                         unix_detach_fds(&scm, skb);
2198
2199                 sk_peek_offset_bwd(sk, skb->len);
2200         } else {
2201                 /* It is questionable: on PEEK we could:
2202                    - do not return fds - good, but too simple 8)
2203                    - return fds, and do not return them on read (old strategy,
2204                      apparently wrong)
2205                    - clone fds (I chose it for now, it is the most universal
2206                      solution)
2207
2208                    POSIX 1003.1g does not actually define this clearly
2209                    at all. POSIX 1003.1g doesn't define a lot of things
2210                    clearly however!
2211
2212                 */
2213
2214                 sk_peek_offset_fwd(sk, size);
2215
2216                 if (UNIXCB(skb).fp)
2217                         unix_peek_fds(&scm, skb);
2218         }
             /* With MSG_TRUNC in flags, report the real length, not the copied one. */
2219         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2220
2221         scm_recv(sock, msg, &scm, flags);
2222
2223 out_free:
2224         skb_free_datagram(sk, skb);
2225         mutex_unlock(&u->iolock);
2226 out:
2227         return err;
2228 }
2229
2230 /*
2231  *      Sleep until more data has arrived. But check for races..
      *
      *      @last and @last_len are the tail skb and its length as seen by
      *      the caller.  The wait ends as soon as the current tail of the
      *      receive queue differs from @last, or its length differs from
      *      @last_len, or when sk_err is set, RCV_SHUTDOWN is set, a signal
      *      is pending, @timeo has run out, or the socket is found to be
      *      SOCK_DEAD after waking up.
      *
      *      @freezable selects freezable_schedule_timeout() over
      *      schedule_timeout().  Returns the remaining timeout.
2232  */
2233 static long unix_stream_data_wait(struct sock *sk, long timeo,
2234                                   struct sk_buff *last, unsigned int last_len,
2235                                   bool freezable)
2236 {
2237         struct sk_buff *tail;
2238         DEFINE_WAIT(wait);
2239
2240         unix_state_lock(sk);
2241
2242         for (;;) {
2243                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2244
                     /* Evaluated under the state lock, after joining the wait queue. */
2245                 tail = skb_peek_tail(&sk->sk_receive_queue);
2246                 if (tail != last ||
2247                     (tail && tail->len != last_len) ||
2248                     sk->sk_err ||
2249                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2250                     signal_pending(current) ||
2251                     !timeo)
2252                         break;
2253
                     /* The state lock is released only for the duration of the sleep. */
2254                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2255                 unix_state_unlock(sk);
2256                 if (freezable)
2257                         timeo = freezable_schedule_timeout(timeo);
2258                 else
2259                         timeo = schedule_timeout(timeo);
2260                 unix_state_lock(sk);
2261
                     /* On this exit SOCKWQ_ASYNC_WAITDATA is left set. */
2262                 if (sock_flag(sk, SOCK_DEAD))
2263                         break;
2264
2265                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2266         }
2267
2268         finish_wait(sk_sleep(sk), &wait);
2269         unix_state_unlock(sk);
2270         return timeo;
2271 }
2272
2273 static unsigned int unix_skb_len(const struct sk_buff *skb)
2274 {
2275         return skb->len - UNIXCB(skb).consumed;
2276 }
2277
/*
 * Parameters for unix_stream_read_generic(), shared by the recvmsg and
 * splice read paths.
 */
2278 struct unix_stream_read_state {
             /*
              * Called as recv_actor(skb, skip, chunk, state) to move data out of
              * @skb; returns the number of bytes handled or a negative value.
              */
2279         int (*recv_actor)(struct sk_buff *, int, int,
2280                           struct unix_stream_read_state *);
2281         struct socket *socket;          /* socket being read */
2282         struct msghdr *msg;             /* destination message; not set by the splice path */
2283         struct pipe_inode_info *pipe;   /* destination pipe; set by the splice path only */
2284         size_t size;                    /* maximum number of bytes to read */
2285         int flags;                      /* MSG_* flags */
2286         unsigned int splice_flags;      /* flags handed to skb_splice_bits() */
2287 };
2288
/*
 * Common stream read loop used by unix_stream_recvmsg() and
 * unix_stream_splice_read().
 *
 * Walks the receive queue under u->iolock and hands each skb to
 * state->recv_actor until state->size bytes have been handled, the queue
 * runs dry with at least the sock_rcvlowat() target read, or one of the
 * stop conditions below is hit.  While waiting for data the iolock is
 * dropped and the task sleeps in unix_stream_data_wait(); @freezable is
 * passed through to it.
 *
 * Returns the number of bytes handled when that is non-zero, otherwise
 * err: -EINVAL if the socket is not TCP_ESTABLISHED, -EOPNOTSUPP for
 * MSG_OOB, -ECONNRESET if the socket is SOCK_DEAD, -EAGAIN when no data is
 * available and the timeout is zero, a pending socket error, or the
 * sock_intr_errno() value after a signal.  If the actor fails before
 * anything was handled, -EFAULT is returned.
 */
2289 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2290                                     bool freezable)
2291 {
2292         struct scm_cookie scm;
2293         struct socket *sock = state->socket;
2294         struct sock *sk = sock->sk;
2295         struct unix_sock *u = unix_sk(sk);
2296         int copied = 0;
2297         int flags = state->flags;
2298         int noblock = flags & MSG_DONTWAIT;
2299         bool check_creds = false;
2300         int target;
2301         int err = 0;
2302         long timeo;
2303         int skip;
2304         size_t size = state->size;
2305         unsigned int last_len;
2306
2307         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2308                 err = -EINVAL;
2309                 goto out;
2310         }
2311
2312         if (unlikely(flags & MSG_OOB)) {
2313                 err = -EOPNOTSUPP;
2314                 goto out;
2315         }
2316
2317         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2318         timeo = sock_rcvtimeo(sk, noblock);
2319
2320         memset(&scm, 0, sizeof(scm));
2321
2322         /* Lock the socket to prevent queue disordering
2323          * while sleeps in memcpy_tomsg
2324          */
2325         mutex_lock(&u->iolock);
2326
2327         skip = max(sk_peek_offset(sk, flags), 0);
2328
2329         do {
2330                 int chunk;
2331                 bool drop_skb;
2332                 struct sk_buff *skb, *last;
2333
2334 redo:
2335                 unix_state_lock(sk);
2336                 if (sock_flag(sk, SOCK_DEAD)) {
2337                         err = -ECONNRESET;
2338                         goto unlock;
2339                 }
2340                 last = skb = skb_peek(&sk->sk_receive_queue);
2341                 last_len = last ? last->len : 0;
2342 again:
                     /* Reached with the state lock held. */
2343                 if (skb == NULL) {
2344                         if (copied >= target)
2345                                 goto unlock;
2346
2347                         /*
2348                          *      POSIX 1003.1g mandates this order.
2349                          */
2350
2351                         err = sock_error(sk);
2352                         if (err)
2353                                 goto unlock;
2354                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2355                                 goto unlock;
2356
2357                         unix_state_unlock(sk);
2358                         if (!timeo) {
2359                                 err = -EAGAIN;
2360                                 break;
2361                         }
2362
                             /* Sleep without the iolock. */
2363                         mutex_unlock(&u->iolock);
2364
2365                         timeo = unix_stream_data_wait(sk, timeo, last,
2366                                                       last_len, freezable);
2367
                             /* iolock is not held here, so leave through 'out'. */
2368                         if (signal_pending(current)) {
2369                                 err = sock_intr_errno(timeo);
2370                                 scm_destroy(&scm);
2371                                 goto out;
2372                         }
2373
2374                         mutex_lock(&u->iolock);
2375                         goto redo;
2376 unlock:
2377                         unix_state_unlock(sk);
2378                         break;
2379                 }
2380
                     /* Step over skbs that lie entirely before the peek offset. */
2381                 while (skip >= unix_skb_len(skb)) {
2382                         skip -= unix_skb_len(skb);
2383                         last = skb;
2384                         last_len = skb->len;
2385                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2386                         if (!skb)
2387                                 goto again;
2388                 }
2389
2390                 unix_state_unlock(sk);
2391
2392                 if (check_creds) {
2393                         /* Never glue messages from different writers */
2394                         if (!unix_skb_scm_eq(skb, &scm))
2395                                 break;
2396                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2397                         /* Copy credentials */
2398                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2399                         unix_set_secdata(&scm, skb);
2400                         check_creds = true;
2401                 }
2402
2403                 /* Copy address just once */
2404                 if (state->msg && state->msg->msg_name) {
2405                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2406                                          state->msg->msg_name);
2407                         unix_copy_addr(state->msg, skb->sk);
                             /*
                              * NOTE(review): sunaddr is local to this block, so this
                              * assignment does not stop the copy from being repeated
                              * for later skbs.
                              */
2408                         sunaddr = NULL;
2409                 }
2410
2411                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
                     /* Hold an extra reference on skb while the actor runs. */
2412                 skb_get(skb);
2413                 chunk = state->recv_actor(skb, skip, chunk, state);
2414                 drop_skb = !unix_skb_len(skb);
2415                 /* skb is only safe to use if !drop_skb */
2416                 consume_skb(skb);
2417                 if (chunk < 0) {
2418                         if (copied == 0)
2419                                 copied = -EFAULT;
2420                         break;
2421                 }
2422                 copied += chunk;
2423                 size -= chunk;
2424
2425                 if (drop_skb) {
2426                         /* the skb was touched by a concurrent reader;
2427                          * we should not expect anything from this skb
2428                          * anymore and assume it invalid - we can be
2429                          * sure it was dropped from the socket queue
2430                          *
2431                          * let's report a short read
2432                          */
2433                         err = 0;
2434                         break;
2435                 }
2436
2437                 /* Mark read part of skb as used */
2438                 if (!(flags & MSG_PEEK)) {
2439                         UNIXCB(skb).consumed += chunk;
2440
2441                         sk_peek_offset_bwd(sk, chunk);
2442
2443                         if (UNIXCB(skb).fp)
2444                                 unix_detach_fds(&scm, skb);
2445
                             /* skb still holds unread data: the read is finished. */
2446                         if (unix_skb_len(skb))
2447                                 break;
2448
2449                         skb_unlink(skb, &sk->sk_receive_queue);
2450                         consume_skb(skb);
2451
                             /* Stop once file descriptors have been picked up. */
2452                         if (scm.fp)
2453                                 break;
2454                 } else {
2455                         /* It is questionable, see note in unix_dgram_recvmsg.
2456                          */
2457                         if (UNIXCB(skb).fp)
2458                                 unix_peek_fds(&scm, skb);
2459
2460                         sk_peek_offset_fwd(sk, chunk);
2461
2462                         if (UNIXCB(skb).fp)
2463                                 break;
2464
                             /* Peeking: move on to the next skb without dequeuing. */
2465                         skip = 0;
2466                         last = skb;
2467                         last_len = skb->len;
2468                         unix_state_lock(sk);
2469                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2470                         if (skb)
2471                                 goto again;
2472                         unix_state_unlock(sk);
2473                         break;
2474                 }
2475         } while (size);
2476
2477         mutex_unlock(&u->iolock);
             /* Without a msghdr (splice path) the collected scm data is just released. */
2478         if (state->msg)
2479                 scm_recv(sock, state->msg, &scm, flags);
2480         else
2481                 scm_destroy(&scm);
2482 out:
2483         return copied ? : err;
2484 }
2485
2486 static int unix_stream_read_actor(struct sk_buff *skb,
2487                                   int skip, int chunk,
2488                                   struct unix_stream_read_state *state)
2489 {
2490         int ret;
2491
2492         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2493                                     state->msg, chunk);
2494         return ret ?: chunk;
2495 }
2496
2497 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2498                                size_t size, int flags)
2499 {
2500         struct unix_stream_read_state state = {
2501                 .recv_actor = unix_stream_read_actor,
2502                 .socket = sock,
2503                 .msg = msg,
2504                 .size = size,
2505                 .flags = flags
2506         };
2507
2508         return unix_stream_read_generic(&state, true);
2509 }
2510
2511 static int unix_stream_splice_actor(struct sk_buff *skb,
2512                                     int skip, int chunk,
2513                                     struct unix_stream_read_state *state)
2514 {
2515         return skb_splice_bits(skb, state->socket->sk,
2516                                UNIXCB(skb).consumed + skip,
2517                                state->pipe, chunk, state->splice_flags);
2518 }
2519
2520 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2521                                        struct pipe_inode_info *pipe,
2522                                        size_t size, unsigned int flags)
2523 {
2524         struct unix_stream_read_state state = {
2525                 .recv_actor = unix_stream_splice_actor,
2526                 .socket = sock,
2527                 .pipe = pipe,
2528                 .size = size,
2529                 .splice_flags = flags,
2530         };
2531
2532         if (unlikely(*ppos))
2533                 return -ESPIPE;
2534
2535         if (sock->file->f_flags & O_NONBLOCK ||
2536             flags & SPLICE_F_NONBLOCK)
2537                 state.flags = MSG_DONTWAIT;
2538
2539         return unix_stream_read_generic(&state, false);
2540 }
2541
2542 static int unix_shutdown(struct socket *sock, int mode)
2543 {
2544         struct sock *sk = sock->sk;
2545         struct sock *other;
2546
2547         if (mode < SHUT_RD || mode > SHUT_RDWR)
2548                 return -EINVAL;
2549         /* This maps:
2550          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2551          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2552          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2553          */
2554         ++mode;
2555
2556         unix_state_lock(sk);
2557         sk->sk_shutdown |= mode;
2558         other = unix_peer(sk);
2559         if (other)
2560                 sock_hold(other);
2561         unix_state_unlock(sk);
2562         sk->sk_state_change(sk);
2563
2564         if (other &&
2565                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2566
2567                 int peer_mode = 0;
2568
2569                 if (mode&RCV_SHUTDOWN)
2570                         peer_mode |= SEND_SHUTDOWN;
2571                 if (mode&SEND_SHUTDOWN)
2572                         peer_mode |= RCV_SHUTDOWN;
2573                 unix_state_lock(other);
2574                 other->sk_shutdown |= peer_mode;
2575                 unix_state_unlock(other);
2576                 other->sk_state_change(other);
2577                 if (peer_mode == SHUTDOWN_MASK)
2578                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2579                 else if (peer_mode & RCV_SHUTDOWN)
2580                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2581         }
2582         if (other)
2583                 sock_put(other);
2584
2585         return 0;
2586 }
2587
2588 long unix_inq_len(struct sock *sk)
2589 {
2590         struct sk_buff *skb;
2591         long amount = 0;
2592
2593         if (sk->sk_state == TCP_LISTEN)
2594                 return -EINVAL;
2595
2596         spin_lock(&sk->sk_receive_queue.lock);
2597         if (sk->sk_type == SOCK_STREAM ||
2598             sk->sk_type == SOCK_SEQPACKET) {
2599                 skb_queue_walk(&sk->sk_receive_queue, skb)
2600                         amount += unix_skb_len(skb);
2601         } else {
2602                 skb = skb_peek(&sk->sk_receive_queue);
2603                 if (skb)
2604                         amount = skb->len;
2605         }
2606         spin_unlock(&sk->sk_receive_queue.lock);
2607
2608         return amount;
2609 }
2610 EXPORT_SYMBOL_GPL(unix_inq_len);
2611
/* Report the value of sk_wmem_alloc_get() for @sk as the output queue length. */
long unix_outq_len(struct sock *sk)
{
        long queued = sk_wmem_alloc_get(sk);

        return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2617
2618 static int unix_open_file(struct sock *sk)
2619 {
2620         struct path path;
2621         struct file *f;
2622         int fd;
2623
2624         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2625                 return -EPERM;
2626
2627         if (!smp_load_acquire(&unix_sk(sk)->addr))
2628                 return -ENOENT;
2629
2630         path = unix_sk(sk)->path;
2631         if (!path.dentry)
2632                 return -ENOENT;
2633
2634         path_get(&path);
2635
2636         fd = get_unused_fd_flags(O_CLOEXEC);
2637         if (fd < 0)
2638                 goto out;
2639
2640         f = dentry_open(&path, O_PATH, current_cred());
2641         if (IS_ERR(f)) {
2642                 put_unused_fd(fd);
2643                 fd = PTR_ERR(f);
2644                 goto out;
2645         }
2646
2647         fd_install(fd, f);
2648 out:
2649         path_put(&path);
2650
2651         return fd;
2652 }
2653
2654 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2655 {
2656         struct sock *sk = sock->sk;
2657         long amount = 0;
2658         int err;
2659
2660         switch (cmd) {
2661         case SIOCOUTQ:
2662                 amount = unix_outq_len(sk);
2663                 err = put_user(amount, (int __user *)arg);
2664                 break;
2665         case SIOCINQ:
2666                 amount = unix_inq_len(sk);
2667                 if (amount < 0)
2668                         err = amount;
2669                 else
2670                         err = put_user(amount, (int __user *)arg);
2671                 break;
2672         case SIOCUNIXFILE:
2673                 err = unix_open_file(sk);
2674                 break;
2675         default:
2676                 err = -ENOIOCTLCMD;
2677                 break;
2678         }
2679         return err;
2680 }
2681
2682 #ifdef CONFIG_COMPAT
/* Compat ioctl entry: convert @arg with compat_ptr() and defer to unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        unsigned long native_arg = (unsigned long)compat_ptr(arg);

        return unix_ioctl(sock, cmd, native_arg);
}
2687 #endif
2688
2689 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2690 {
2691         struct sock *sk = sock->sk;
2692         unsigned int mask;
2693
2694         sock_poll_wait(file, sk_sleep(sk), wait);
2695         mask = 0;
2696
2697         /* exceptional events? */
2698         if (sk->sk_err)
2699                 mask |= POLLERR;
2700         if (sk->sk_shutdown == SHUTDOWN_MASK)
2701                 mask |= POLLHUP;
2702         if (sk->sk_shutdown & RCV_SHUTDOWN)
2703                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2704
2705         /* readable? */
2706         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2707                 mask |= POLLIN | POLLRDNORM;
2708
2709         /* Connection-based need to check for termination and startup */
2710         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2711             sk->sk_state == TCP_CLOSE)
2712                 mask |= POLLHUP;
2713
2714         /*
2715          * we set writable also when the other side has shut down the
2716          * connection. This prevents stuck sockets.
2717          */
2718         if (unix_writable(sk))
2719                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2720
2721         return mask;
2722 }
2723
/*
 * poll handler variant that, besides the usual error / shutdown /
 * readable checks, refuses to report the socket as writable while its
 * peer's receive queue is full.
 *
 * Returns the mask of pending events.  When the socket turns out not to
 * be writable, SOCKWQ_ASYNC_NOSPACE is set on it before returning.
 */
static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
                                    poll_table *wait)
{
        struct sock *sk = sock->sk, *other;
        unsigned int mask, writable;

        sock_poll_wait(file, sk_sleep(sk), wait);
        mask = 0;

        /* exceptional events? */
        if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
                mask |= POLLERR |
                        (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);

        if (sk->sk_shutdown & RCV_SHUTDOWN)
                mask |= POLLRDHUP | POLLIN | POLLRDNORM;
        if (sk->sk_shutdown == SHUTDOWN_MASK)
                mask |= POLLHUP;

        /* readable? */
        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                mask |= POLLIN | POLLRDNORM;

        /* Connection-based need to check for termination and startup */
        if (sk->sk_type == SOCK_SEQPACKET) {
                if (sk->sk_state == TCP_CLOSE)
                        mask |= POLLHUP;
                /* connection hasn't started yet? */
                if (sk->sk_state == TCP_SYN_SENT)
                        return mask;
        }

        /* No write status requested, avoid expensive OUT tests. */
        if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
                return mask;

        writable = unix_writable(sk);
        if (writable) {
                unix_state_lock(sk);

                /*
                 * Only a peer that is not connected back to us can block
                 * our writes.  The && chain short-circuits, so
                 * unix_dgram_peer_wake_me() is called only when such a
                 * peer's receive queue looks full.
                 * NOTE(review): presumably it also arranges a wakeup for
                 * when the peer drains its queue -- confirm at its
                 * definition.
                 */
                other = unix_peer(sk);
                if (other && unix_peer(other) != sk &&
                    unix_recvq_full_lockless(other) &&
                    unix_dgram_peer_wake_me(sk, other))
                        writable = 0;

                unix_state_unlock(sk);
        }

        if (writable)
                mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
        else
                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

        return mask;
}
2780
2781 #ifdef CONFIG_PROC_FS
2782
/*
 * The seq_file position packs two fields into a single value: the low
 * BUCKET_SPACE bits hold an offset within a hash bucket and the bits
 * above them hold the bucket index into unix_socket_table.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index / the in-bucket offset from a position. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
/* Build a position from bucket index b and in-bucket offset o. */
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2788
2789 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2790 {
2791         unsigned long offset = get_offset(*pos);
2792         unsigned long bucket = get_bucket(*pos);
2793         struct sock *sk;
2794         unsigned long count = 0;
2795
2796         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2797                 if (sock_net(sk) != seq_file_net(seq))
2798                         continue;
2799                 if (++count == offset)
2800                         break;
2801         }
2802
2803         return sk;
2804 }
2805
/*
 * Advance the iteration to the next socket that belongs to the
 * seq_file's network namespace.
 *
 * @sk is the socket returned by the previous step, or NULL /
 * SEQ_START_TOKEN when there is none.  *pos holds the encoded
 * (bucket, offset) position and is rewritten whenever the walk moves on
 * to another bucket.  Returns NULL once every bucket of
 * unix_socket_table has been tried.
 */
static struct sock *unix_next_socket(struct seq_file *seq,
                                     struct sock *sk,
                                     loff_t *pos)
{
        unsigned long bucket;

        /*
         * With a real socket in hand (neither NULL nor SEQ_START_TOKEN),
         * keep following its bucket chain until a socket of our
         * namespace turns up or the chain ends.
         */
        while (sk > (struct sock *)SEQ_START_TOKEN) {
                sk = sk_next(sk);
                if (!sk)
                        goto next_bucket;
                if (sock_net(sk) == seq_file_net(seq))
                        return sk;
        }

        do {
                /* Resolve the socket at the current (bucket, offset). */
                sk = unix_from_bucket(seq, pos);
                if (sk)
                        return sk;

next_bucket:
                /* Nothing left here: restart at offset 1 of the next bucket. */
                bucket = get_bucket(*pos) + 1;
                *pos = set_bucket_offset(bucket, 1);
        } while (bucket < ARRAY_SIZE(unix_socket_table));

        return NULL;
}
2832
2833 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2834         __acquires(unix_table_lock)
2835 {
2836         spin_lock(&unix_table_lock);
2837
2838         if (!*pos)
2839                 return SEQ_START_TOKEN;
2840
2841         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2842                 return NULL;
2843
2844         return unix_next_socket(seq, NULL, pos);
2845 }
2846
2847 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2848 {
2849         ++*pos;
2850         return unix_next_socket(seq, v, pos);
2851 }
2852
/* seq_file ->stop: drop unix_table_lock, which unix_seq_start() acquired. */
static void unix_seq_stop(struct seq_file *seq, void *v)
        __releases(unix_table_lock)
{
        spin_unlock(&unix_table_lock);
}
2858
2859 static int unix_seq_show(struct seq_file *seq, void *v)
2860 {
2861
2862         if (v == SEQ_START_TOKEN)
2863                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2864                          "Inode Path\n");
2865         else {
2866                 struct sock *s = v;
2867                 struct unix_sock *u = unix_sk(s);
2868                 unix_state_lock(s);
2869
2870                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2871                         s,
2872                         refcount_read(&s->sk_refcnt),
2873                         0,
2874                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2875                         s->sk_type,
2876                         s->sk_socket ?
2877                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2878                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2879                         sock_i_ino(s));
2880
2881                 if (u->addr) {  // under unix_table_lock here
2882                         int i, len;
2883                         seq_putc(seq, ' ');
2884
2885                         i = 0;
2886                         len = u->addr->len - sizeof(short);
2887                         if (!UNIX_ABSTRACT(s))
2888                                 len--;
2889                         else {
2890                                 seq_putc(seq, '@');
2891                                 i++;
2892                         }
2893                         for ( ; i < len; i++)
2894                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
2895                                          '@');
2896                 }
2897                 unix_state_unlock(s);
2898                 seq_putc(seq, '\n');
2899         }
2900
2901         return 0;
2902 }
2903
/* seq_file iterator callbacks used by unix_seq_open(). */
static const struct seq_operations unix_seq_ops = {
        .start  = unix_seq_start,
        .next   = unix_seq_next,
        .stop   = unix_seq_stop,
        .show   = unix_seq_show,
};
2910
2911 static int unix_seq_open(struct inode *inode, struct file *file)
2912 {
2913         return seq_open_net(inode, file, &unix_seq_ops,
2914                             sizeof(struct seq_net_private));
2915 }
2916
/* File operations for the "unix" proc entry created in unix_net_init(). */
static const struct file_operations unix_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = unix_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};
2924
2925 #endif
2926
/* PF_UNIX protocol family descriptor, registered in af_unix_init(). */
static const struct net_proto_family unix_family_ops = {
        .family = PF_UNIX,
        .create = unix_create,
        .owner  = THIS_MODULE,
};
2932
2933
2934 static int __net_init unix_net_init(struct net *net)
2935 {
2936         int error = -ENOMEM;
2937
2938         net->unx.sysctl_max_dgram_qlen = 10;
2939         if (unix_sysctl_register(net))
2940                 goto out;
2941
2942 #ifdef CONFIG_PROC_FS
2943         if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2944                 unix_sysctl_unregister(net);
2945                 goto out;
2946         }
2947 #endif
2948         error = 0;
2949 out:
2950         return error;
2951 }
2952
/*
 * Per-network-namespace teardown: unregisters the sysctl table and
 * removes the "unix" proc entry set up by unix_net_init().
 */
static void __net_exit unix_net_exit(struct net *net)
{
        unix_sysctl_unregister(net);
        remove_proc_entry("unix", net->proc_net);
}
2958
/* Per-network-namespace init/exit hooks, registered in af_unix_init(). */
static struct pernet_operations unix_net_ops = {
        .init = unix_net_init,
        .exit = unix_net_exit,
};
2963
2964 static int __init af_unix_init(void)
2965 {
2966         int rc = -1;
2967
2968         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2969
2970         rc = proto_register(&unix_proto, 1);
2971         if (rc != 0) {
2972                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2973                 goto out;
2974         }
2975
2976         sock_register(&unix_family_ops);
2977         register_pernet_subsys(&unix_net_ops);
2978 out:
2979         return rc;
2980 }
2981
/*
 * Module teardown: unregisters the PF_UNIX socket family, the
 * unix_proto protocol and the per-network-namespace operations.
 */
static void __exit af_unix_exit(void)
{
        sock_unregister(PF_UNIX);
        proto_unregister(&unix_proto);
        unregister_pernet_subsys(&unix_net_ops);
}
2988
/*
 * Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket. But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);