2 * NET4: Implementation of BSD Unix domain sockets.
4 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
12 * Linus Torvalds : Assorted bug cures.
13 * Niibe Yutaka : async I/O support.
14 * Carsten Paeth : PF_UNIX check, address fixes.
15 * Alan Cox : Limit size of allocated blocks.
16 * Alan Cox : Fixed the stupid socketpair bug.
17 * Alan Cox : BSD compatibility fine tuning.
18 * Alan Cox : Fixed a bug in connect when interrupted.
19 * Alan Cox : Sorted out a proper draft version of
20 * file descriptor passing hacked up from
22 * Marty Leisner : Fixes to fd passing
23 * Nick Nevin : recvmsg bugfix.
24 * Alan Cox : Started proper garbage collector
25 * Heiko EiBfeldt : Missing verify_area check
26 * Alan Cox : Started POSIXisms
27 * Andreas Schwab : Replace inode by dentry for proper
29 * Kirk Petersen : Made this a module
30 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
32 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
33 * by the above two patches.
34 * Andrea Arcangeli : If possible we block in connect(2)
35 * if the max backlog of the listen socket
36 * has been reached. This won't break
37 * old apps and it will avoid huge amount
38 * of socks hashed (this for unix_gc()
39 * performance reasons).
40 * Security fix that limits the max
41 * number of socks to 2*max_files and
42 * the number of skb queueable in the
44 * Artur Skawina : Hash function optimizations
45 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
46 * Malcolm Beattie : Set peercred for socketpair
47 * Michal Ostrowski : Module initialization cleanup.
48 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
49 * the core infrastructure is doing that
50 * for all net proto families now (2.5.69+)
53 * Known differences from reference BSD that was tested:
56 * ECONNREFUSED is not returned from one end of a connected() socket to the
57 * other the moment one end closes.
58 * fstat() doesn't return st_dev=0, and give the blksize as high water mark
59 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
61 * accept() returns a path name even if the connecting socket has closed
62 * in the meantime (BSD loses the path and gives up).
63 * accept() returns 0 length path for an unbound connector. BSD returns 16
64 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
66 * BSD af_unix apparently has connect forgetting to block properly.
67 * (need to check this with the POSIX spec in detail)
69 * Differences from 2.0.0-11-... (ANK)
70 * Bug fixes and improvements.
71 * - client shutdown killed server socket.
72 * - removed all useless cli/sti pairs.
74 * Semantic changes/extensions.
75 * - generic control message passing.
76 * - SCM_CREDENTIALS control message.
77 * - "Abstract" (not FS based) socket bindings.
78 * Abstract names are sequences of bytes (not zero terminated)
79 * started by 0, so that this name space does not intersect
83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
85 #include <linux/module.h>
86 #include <linux/kernel.h>
87 #include <linux/signal.h>
88 #include <linux/sched.h>
89 #include <linux/errno.h>
90 #include <linux/string.h>
91 #include <linux/stat.h>
92 #include <linux/dcache.h>
93 #include <linux/namei.h>
94 #include <linux/socket.h>
96 #include <linux/fcntl.h>
97 #include <linux/termios.h>
98 #include <linux/sockios.h>
99 #include <linux/net.h>
100 #include <linux/in.h>
101 #include <linux/fs.h>
102 #include <linux/slab.h>
103 #include <asm/uaccess.h>
104 #include <linux/skbuff.h>
105 #include <linux/netdevice.h>
106 #include <net/net_namespace.h>
107 #include <net/sock.h>
108 #include <net/tcp_states.h>
109 #include <net/af_unix.h>
110 #include <linux/proc_fs.h>
111 #include <linux/seq_file.h>
113 #include <linux/init.h>
114 #include <linux/poll.h>
115 #include <linux/rtnetlink.h>
116 #include <linux/mount.h>
117 #include <net/checksum.h>
118 #include <linux/security.h>
119 #include <linux/freezer.h>
123 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
124 EXPORT_SYMBOL_GPL(unix_socket_table);
125 DEFINE_SPINLOCK(unix_table_lock);
126 EXPORT_SYMBOL_GPL(unix_table_lock);
127 static atomic_long_t unix_nr_socks;
130 static struct hlist_head *unix_sockets_unbound(void *addr)
132 unsigned long hash = (unsigned long)addr;
136 hash %= UNIX_HASH_SIZE;
137 return &unix_socket_table[UNIX_HASH_SIZE + hash];
140 #define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
142 #ifdef CONFIG_SECURITY_NETWORK
143 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
145 UNIXCB(skb).secid = scm->secid;
148 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
150 scm->secid = UNIXCB(skb).secid;
153 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
155 return (scm->secid == UNIXCB(skb).secid);
158 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
161 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
164 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
168 #endif /* CONFIG_SECURITY_NETWORK */
171 * SMP locking strategy:
172 * hash table is protected with spinlock unix_table_lock
173 * each socket state is protected by separate spin lock.
176 static inline unsigned int unix_hash_fold(__wsum n)
178 unsigned int hash = (__force unsigned int)csum_fold(n);
181 return hash&(UNIX_HASH_SIZE-1);
184 #define unix_peer(sk) (unix_sk(sk)->peer)
186 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
188 return unix_peer(osk) == sk;
191 static inline int unix_may_send(struct sock *sk, struct sock *osk)
193 return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
196 static inline int unix_recvq_full(const struct sock *sk)
198 return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
201 static inline int unix_recvq_full_lockless(const struct sock *sk)
203 return skb_queue_len_lockless(&sk->sk_receive_queue) >
204 READ_ONCE(sk->sk_max_ack_backlog);
/* Return the peer of @s with a reference held, or NULL if unconnected.
 * Caller must sock_put() the result.
 */
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);
220 static inline void unix_release_addr(struct unix_address *addr)
222 if (atomic_dec_and_test(&addr->refcnt))
227 * Check unix socket name:
228 * - should be not zero length.
229 * - if started by not zero, should be NULL terminated (FS object)
230 * - if started by zero, it is abstract name.
233 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
237 if (len <= sizeof(short) || len > sizeof(*sunaddr))
239 if (!sunaddr || sunaddr->sun_family != AF_UNIX)
241 if (sunaddr->sun_path[0]) {
243 * This may look like an off by one error but it is a bit more
244 * subtle. 108 is the longest valid AF_UNIX path for a binding.
245 * sun_path[108] doesn't as such exist. However in kernel space
246 * we are guaranteed that it is a valid memory location in our
247 * kernel address buffer.
249 ((char *)sunaddr)[len] = 0;
250 len = strlen(sunaddr->sun_path)+1+sizeof(short);
254 *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
258 static void __unix_remove_socket(struct sock *sk)
260 sk_del_node_init(sk);
263 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
265 WARN_ON(!sk_unhashed(sk));
266 sk_add_node(sk, list);
269 static inline void unix_remove_socket(struct sock *sk)
271 spin_lock(&unix_table_lock);
272 __unix_remove_socket(sk);
273 spin_unlock(&unix_table_lock);
276 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
278 spin_lock(&unix_table_lock);
279 __unix_insert_socket(list, sk);
280 spin_unlock(&unix_table_lock);
283 static struct sock *__unix_find_socket_byname(struct net *net,
284 struct sockaddr_un *sunname,
285 int len, int type, unsigned int hash)
289 sk_for_each(s, &unix_socket_table[hash ^ type]) {
290 struct unix_sock *u = unix_sk(s);
292 if (!net_eq(sock_net(s), net))
295 if (u->addr->len == len &&
296 !memcmp(u->addr->name, sunname, len))
304 static inline struct sock *unix_find_socket_byname(struct net *net,
305 struct sockaddr_un *sunname,
311 spin_lock(&unix_table_lock);
312 s = __unix_find_socket_byname(net, sunname, len, type, hash);
315 spin_unlock(&unix_table_lock);
319 static struct sock *unix_find_socket_byinode(struct inode *i)
323 spin_lock(&unix_table_lock);
325 &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
326 struct dentry *dentry = unix_sk(s)->path.dentry;
328 if (dentry && d_real_inode(dentry) == i) {
335 spin_unlock(&unix_table_lock);
339 /* Support code for asymmetrically connected dgram sockets
341 * If a datagram socket is connected to a socket not itself connected
342 * to the first socket (eg, /dev/log), clients may only enqueue more
343 * messages if the present receive queue of the server socket is not
344 * "too large". This means there's a second writeability condition
345 * poll and sendmsg need to test. The dgram recv code will do a wake
346 * up on the peer_wait wait queue of a socket upon reception of a
347 * datagram which needs to be propagated to sleeping would-be writers
348 * since these might not have sent anything so far. This can't be
349 * accomplished via poll_wait because the lifetime of the server
350 * socket might be less than that of its clients if these break their
351 * association with it or if the server socket is closed while clients
352 * are still connected to it and there's no way to inform "a polling
353 * implementation" that it should let go of a certain wait queue
355 * In order to propagate a wake up, a wait_queue_t of the client
356 * socket is enqueued on the peer_wait queue of the server socket
357 * whose wake function does a wake_up on the ordinary client socket
358 * wait queue. This connection is established whenever a write (or
359 * poll for write) hit the flow control condition and broken when the
360 * association to the server socket is dissolved or after a wake up
364 static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
368 wait_queue_head_t *u_sleep;
370 u = container_of(q, struct unix_sock, peer_wake);
372 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
374 u->peer_wake.private = NULL;
376 /* relaying can only happen while the wq still exists */
377 u_sleep = sk_sleep(&u->sk);
379 wake_up_interruptible_poll(u_sleep, key);
384 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
386 struct unix_sock *u, *u_other;
390 u_other = unix_sk(other);
392 spin_lock(&u_other->peer_wait.lock);
394 if (!u->peer_wake.private) {
395 u->peer_wake.private = other;
396 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
401 spin_unlock(&u_other->peer_wait.lock);
405 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
408 struct unix_sock *u, *u_other;
411 u_other = unix_sk(other);
412 spin_lock(&u_other->peer_wait.lock);
414 if (u->peer_wake.private == other) {
415 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
416 u->peer_wake.private = NULL;
419 spin_unlock(&u_other->peer_wait.lock);
422 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
425 unix_dgram_peer_wake_disconnect(sk, other);
426 wake_up_interruptible_poll(sk_sleep(sk),
/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 *
 * Returns 1 (and leaves the wake association armed) if @other's queue is
 * still full and the caller should sleep; 0 otherwise.
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	if (unix_recvq_full(other))
		return 1;

	/* Queue drained between connect and check: undo our enqueue. */
	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}
451 static int unix_writable(const struct sock *sk)
453 return sk->sk_state != TCP_LISTEN &&
454 (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
457 static void unix_write_space(struct sock *sk)
459 struct socket_wq *wq;
462 if (unix_writable(sk)) {
463 wq = rcu_dereference(sk->sk_wq);
464 if (skwq_has_sleeper(wq))
465 wake_up_interruptible_sync_poll(&wq->wait,
466 POLLOUT | POLLWRNORM | POLLWRBAND);
467 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
472 /* When dgram socket disconnects (or changes its peer), we clear its receive
473 * queue of packets arrived from previous peer. First, it allows to do
474 * flow control based only on wmem_alloc; second, sk connected to peer
475 * may receive messages only from that peer. */
476 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
478 if (!skb_queue_empty(&sk->sk_receive_queue)) {
479 skb_queue_purge(&sk->sk_receive_queue);
480 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
482 /* If one link of bidirectional dgram pipe is disconnected,
483 * we signal error. Messages are lost. Do not make this,
484 * when peer was not connected to us.
486 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
487 other->sk_err = ECONNRESET;
488 other->sk_error_report(other);
493 static void unix_sock_destructor(struct sock *sk)
495 struct unix_sock *u = unix_sk(sk);
497 skb_queue_purge(&sk->sk_receive_queue);
499 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
500 WARN_ON(!sk_unhashed(sk));
501 WARN_ON(sk->sk_socket);
502 if (!sock_flag(sk, SOCK_DEAD)) {
503 pr_info("Attempt to release alive unix socket: %p\n", sk);
508 unix_release_addr(u->addr);
510 atomic_long_dec(&unix_nr_socks);
512 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
514 #ifdef UNIX_REFCNT_DEBUG
515 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
516 atomic_long_read(&unix_nr_socks));
520 static void unix_release_sock(struct sock *sk, int embrion)
522 struct unix_sock *u = unix_sk(sk);
528 unix_remove_socket(sk);
533 sk->sk_shutdown = SHUTDOWN_MASK;
535 u->path.dentry = NULL;
537 state = sk->sk_state;
538 sk->sk_state = TCP_CLOSE;
540 skpair = unix_peer(sk);
541 unix_peer(sk) = NULL;
543 unix_state_unlock(sk);
545 wake_up_interruptible_all(&u->peer_wait);
547 if (skpair != NULL) {
548 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
549 unix_state_lock(skpair);
551 skpair->sk_shutdown = SHUTDOWN_MASK;
552 if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
553 skpair->sk_err = ECONNRESET;
554 unix_state_unlock(skpair);
555 skpair->sk_state_change(skpair);
556 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
559 unix_dgram_peer_wake_disconnect(sk, skpair);
560 sock_put(skpair); /* It may now die */
563 /* Try to flush out this socket. Throw out buffers at least */
565 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
566 if (state == TCP_LISTEN)
567 unix_release_sock(skb->sk, 1);
568 /* passed fds are erased in the kfree_skb hook */
569 UNIXCB(skb).consumed = skb->len;
578 /* ---- Socket is dead now and most probably destroyed ---- */
581 * Fixme: BSD difference: In BSD all sockets connected to us get
582 * ECONNRESET and we die on the spot. In Linux we behave
583 * like files and pipes do and wait for the last
586 * Can't we simply set sock->err?
588 * What the above comment does talk about? --ANK(980817)
591 if (unix_tot_inflight)
592 unix_gc(); /* Garbage collect fds */
595 static void init_peercred(struct sock *sk)
597 const struct cred *old_cred;
600 spin_lock(&sk->sk_peer_lock);
601 old_pid = sk->sk_peer_pid;
602 old_cred = sk->sk_peer_cred;
603 sk->sk_peer_pid = get_pid(task_tgid(current));
604 sk->sk_peer_cred = get_current_cred();
605 spin_unlock(&sk->sk_peer_lock);
611 static void copy_peercred(struct sock *sk, struct sock *peersk)
613 const struct cred *old_cred;
617 spin_lock(&sk->sk_peer_lock);
618 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
620 spin_lock(&peersk->sk_peer_lock);
621 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
623 old_pid = sk->sk_peer_pid;
624 old_cred = sk->sk_peer_cred;
625 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
626 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
628 spin_unlock(&sk->sk_peer_lock);
629 spin_unlock(&peersk->sk_peer_lock);
635 static int unix_listen(struct socket *sock, int backlog)
638 struct sock *sk = sock->sk;
639 struct unix_sock *u = unix_sk(sk);
640 struct pid *old_pid = NULL;
643 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
644 goto out; /* Only stream/seqpacket sockets accept */
647 goto out; /* No listens on an unbound socket */
649 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
651 if (backlog > sk->sk_max_ack_backlog)
652 wake_up_interruptible_all(&u->peer_wait);
653 sk->sk_max_ack_backlog = backlog;
654 sk->sk_state = TCP_LISTEN;
655 /* set credentials so connect can copy them */
660 unix_state_unlock(sk);
666 static int unix_release(struct socket *);
667 static int unix_bind(struct socket *, struct sockaddr *, int);
668 static int unix_stream_connect(struct socket *, struct sockaddr *,
669 int addr_len, int flags);
670 static int unix_socketpair(struct socket *, struct socket *);
671 static int unix_accept(struct socket *, struct socket *, int);
672 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
673 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
674 static unsigned int unix_dgram_poll(struct file *, struct socket *,
676 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
677 static int unix_shutdown(struct socket *, int);
678 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
679 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
680 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
681 size_t size, int flags);
682 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
683 struct pipe_inode_info *, size_t size,
685 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
686 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
687 static int unix_dgram_connect(struct socket *, struct sockaddr *,
689 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
690 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
693 static int unix_set_peek_off(struct sock *sk, int val)
695 struct unix_sock *u = unix_sk(sk);
697 if (mutex_lock_interruptible(&u->iolock))
700 sk->sk_peek_off = val;
701 mutex_unlock(&u->iolock);
707 static const struct proto_ops unix_stream_ops = {
709 .owner = THIS_MODULE,
710 .release = unix_release,
712 .connect = unix_stream_connect,
713 .socketpair = unix_socketpair,
714 .accept = unix_accept,
715 .getname = unix_getname,
718 .listen = unix_listen,
719 .shutdown = unix_shutdown,
720 .setsockopt = sock_no_setsockopt,
721 .getsockopt = sock_no_getsockopt,
722 .sendmsg = unix_stream_sendmsg,
723 .recvmsg = unix_stream_recvmsg,
724 .mmap = sock_no_mmap,
725 .sendpage = unix_stream_sendpage,
726 .splice_read = unix_stream_splice_read,
727 .set_peek_off = unix_set_peek_off,
730 static const struct proto_ops unix_dgram_ops = {
732 .owner = THIS_MODULE,
733 .release = unix_release,
735 .connect = unix_dgram_connect,
736 .socketpair = unix_socketpair,
737 .accept = sock_no_accept,
738 .getname = unix_getname,
739 .poll = unix_dgram_poll,
741 .listen = sock_no_listen,
742 .shutdown = unix_shutdown,
743 .setsockopt = sock_no_setsockopt,
744 .getsockopt = sock_no_getsockopt,
745 .sendmsg = unix_dgram_sendmsg,
746 .recvmsg = unix_dgram_recvmsg,
747 .mmap = sock_no_mmap,
748 .sendpage = sock_no_sendpage,
749 .set_peek_off = unix_set_peek_off,
752 static const struct proto_ops unix_seqpacket_ops = {
754 .owner = THIS_MODULE,
755 .release = unix_release,
757 .connect = unix_stream_connect,
758 .socketpair = unix_socketpair,
759 .accept = unix_accept,
760 .getname = unix_getname,
761 .poll = unix_dgram_poll,
763 .listen = unix_listen,
764 .shutdown = unix_shutdown,
765 .setsockopt = sock_no_setsockopt,
766 .getsockopt = sock_no_getsockopt,
767 .sendmsg = unix_seqpacket_sendmsg,
768 .recvmsg = unix_seqpacket_recvmsg,
769 .mmap = sock_no_mmap,
770 .sendpage = sock_no_sendpage,
771 .set_peek_off = unix_set_peek_off,
774 static struct proto unix_proto = {
776 .owner = THIS_MODULE,
777 .obj_size = sizeof(struct unix_sock),
781 * AF_UNIX sockets do not interact with hardware, hence they
782 * dont trigger interrupts - so it's safe for them to have
783 * bh-unsafe locking for their sk_receive_queue.lock. Split off
784 * this special lock-class by reinitializing the spinlock key:
786 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
788 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
790 struct sock *sk = NULL;
793 atomic_long_inc(&unix_nr_socks);
794 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
797 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
801 sock_init_data(sock, sk);
802 lockdep_set_class(&sk->sk_receive_queue.lock,
803 &af_unix_sk_receive_queue_lock_key);
805 sk->sk_allocation = GFP_KERNEL_ACCOUNT;
806 sk->sk_write_space = unix_write_space;
807 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
808 sk->sk_destruct = unix_sock_destructor;
810 u->path.dentry = NULL;
812 spin_lock_init(&u->lock);
813 atomic_long_set(&u->inflight, 0);
814 INIT_LIST_HEAD(&u->link);
815 mutex_init(&u->iolock); /* single task reading lock */
816 mutex_init(&u->bindlock); /* single task binding lock */
817 init_waitqueue_head(&u->peer_wait);
818 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
819 unix_insert_socket(unix_sockets_unbound(sk), sk);
822 atomic_long_dec(&unix_nr_socks);
825 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
831 static int unix_create(struct net *net, struct socket *sock, int protocol,
834 if (protocol && protocol != PF_UNIX)
835 return -EPROTONOSUPPORT;
837 sock->state = SS_UNCONNECTED;
839 switch (sock->type) {
841 sock->ops = &unix_stream_ops;
844 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
848 sock->type = SOCK_DGRAM;
850 sock->ops = &unix_dgram_ops;
853 sock->ops = &unix_seqpacket_ops;
856 return -ESOCKTNOSUPPORT;
859 return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
862 static int unix_release(struct socket *sock)
864 struct sock *sk = sock->sk;
869 unix_release_sock(sk, 0);
875 static int unix_autobind(struct socket *sock)
877 struct sock *sk = sock->sk;
878 struct net *net = sock_net(sk);
879 struct unix_sock *u = unix_sk(sk);
880 static u32 ordernum = 1;
881 struct unix_address *addr;
883 unsigned int retries = 0;
885 err = mutex_lock_interruptible(&u->bindlock);
894 addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
898 addr->name->sun_family = AF_UNIX;
899 atomic_set(&addr->refcnt, 1);
902 addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
903 addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
905 spin_lock(&unix_table_lock);
906 ordernum = (ordernum+1)&0xFFFFF;
908 if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
910 spin_unlock(&unix_table_lock);
912 * __unix_find_socket_byname() may take long time if many names
913 * are already in use.
916 /* Give up if all names seems to be in use. */
917 if (retries++ == 0xFFFFF) {
924 addr->hash ^= sk->sk_type;
926 __unix_remove_socket(sk);
927 smp_store_release(&u->addr, addr);
928 __unix_insert_socket(&unix_socket_table[addr->hash], sk);
929 spin_unlock(&unix_table_lock);
932 out: mutex_unlock(&u->bindlock);
/* Resolve a sockaddr_un to the target socket: filesystem lookup for path
 * names (with write-permission check), hash lookup for abstract names.
 * Returns the socket with a reference held, or NULL with *error set.
 */
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = d_real_inode(path.dentry);
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}
991 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
993 struct dentry *dentry;
997 * Get the parent directory, calculate the hash for last
1000 dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
1001 err = PTR_ERR(dentry);
1006 * All right, let's create it.
1008 err = security_path_mknod(&path, dentry, mode, 0);
1010 err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
1012 res->mnt = mntget(path.mnt);
1013 res->dentry = dget(dentry);
1016 done_path_create(&path, dentry);
1020 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1022 struct sock *sk = sock->sk;
1023 struct net *net = sock_net(sk);
1024 struct unix_sock *u = unix_sk(sk);
1025 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1026 char *sun_path = sunaddr->sun_path;
1029 struct unix_address *addr;
1030 struct hlist_head *list;
1031 struct path path = { NULL, NULL };
1034 if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
1035 sunaddr->sun_family != AF_UNIX)
1038 if (addr_len == sizeof(short)) {
1039 err = unix_autobind(sock);
1043 err = unix_mkname(sunaddr, addr_len, &hash);
1049 umode_t mode = S_IFSOCK |
1050 (SOCK_INODE(sock)->i_mode & ~current_umask());
1051 err = unix_mknod(sun_path, mode, &path);
1059 err = mutex_lock_interruptible(&u->bindlock);
1068 addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1072 memcpy(addr->name, sunaddr, addr_len);
1073 addr->len = addr_len;
1074 addr->hash = hash ^ sk->sk_type;
1075 atomic_set(&addr->refcnt, 1);
1078 addr->hash = UNIX_HASH_SIZE;
1079 hash = d_real_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1080 spin_lock(&unix_table_lock);
1082 list = &unix_socket_table[hash];
1084 spin_lock(&unix_table_lock);
1086 if (__unix_find_socket_byname(net, sunaddr, addr_len,
1087 sk->sk_type, hash)) {
1088 unix_release_addr(addr);
1092 list = &unix_socket_table[addr->hash];
1096 __unix_remove_socket(sk);
1097 smp_store_release(&u->addr, addr);
1098 __unix_insert_socket(list, sk);
1101 spin_unlock(&unix_table_lock);
1103 mutex_unlock(&u->bindlock);
/* Lock two socket state locks in address order to avoid ABBA deadlock;
 * handles @sk2 == NULL or @sk1 == @sk2 by locking only @sk1.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}
/* Counterpart of unix_state_double_lock(); unlock order is irrelevant. */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
1136 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1137 int alen, int flags)
1139 struct sock *sk = sock->sk;
1140 struct net *net = sock_net(sk);
1141 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1147 if (alen < offsetofend(struct sockaddr, sa_family))
1150 if (addr->sa_family != AF_UNSPEC) {
1151 err = unix_mkname(sunaddr, alen, &hash);
1156 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1157 !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1161 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1165 unix_state_double_lock(sk, other);
1167 /* Apparently VFS overslept socket death. Retry. */
1168 if (sock_flag(other, SOCK_DEAD)) {
1169 unix_state_double_unlock(sk, other);
1175 if (!unix_may_send(sk, other))
1178 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1184 * 1003.1g breaking connected state with AF_UNSPEC
1187 unix_state_double_lock(sk, other);
1191 * If it was connected, reconnect.
1193 if (unix_peer(sk)) {
1194 struct sock *old_peer = unix_peer(sk);
1195 unix_peer(sk) = other;
1196 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1198 unix_state_double_unlock(sk, other);
1200 if (other != old_peer)
1201 unix_dgram_disconnected(sk, old_peer);
1204 unix_peer(sk) = other;
1205 unix_state_double_unlock(sk, other);
1210 unix_state_double_unlock(sk, other);
1216 static long unix_wait_for_peer(struct sock *other, long timeo)
1218 struct unix_sock *u = unix_sk(other);
1222 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1224 sched = !sock_flag(other, SOCK_DEAD) &&
1225 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1226 unix_recvq_full(other);
1228 unix_state_unlock(other);
1231 timeo = schedule_timeout(timeo);
1233 finish_wait(&u->peer_wait, &wait);
1237 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1238 int addr_len, int flags)
1240 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1241 struct sock *sk = sock->sk;
1242 struct net *net = sock_net(sk);
1243 struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1244 struct sock *newsk = NULL;
1245 struct sock *other = NULL;
1246 struct sk_buff *skb = NULL;
1252 err = unix_mkname(sunaddr, addr_len, &hash);
1257 if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1258 (err = unix_autobind(sock)) != 0)
1261 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1263 /* First of all allocate resources.
1264 If we will make it after state is locked,
1265 we will have to recheck all again in any case.
1270 /* create new sock for complete connection */
1271 newsk = unix_create1(sock_net(sk), NULL, 0);
1275 /* Allocate skb for sending to listening sock */
1276 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1281 /* Find listening sock. */
1282 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1286 /* Latch state of peer */
1287 unix_state_lock(other);
1289 /* Apparently VFS overslept socket death. Retry. */
1290 if (sock_flag(other, SOCK_DEAD)) {
1291 unix_state_unlock(other);
1296 err = -ECONNREFUSED;
1297 if (other->sk_state != TCP_LISTEN)
1299 if (other->sk_shutdown & RCV_SHUTDOWN)
1302 if (unix_recvq_full(other)) {
1307 timeo = unix_wait_for_peer(other, timeo);
1309 err = sock_intr_errno(timeo);
1310 if (signal_pending(current))
1318 It is tricky place. We need to grab our state lock and cannot
1319 drop lock on peer. It is dangerous because deadlock is
1320 possible. Connect to self case and simultaneous
1321 attempt to connect are eliminated by checking socket
1322 state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1323 check this before attempt to grab lock.
1325 Well, and we have to recheck the state after socket locked.
1331 /* This is ok... continue with connect */
1333 case TCP_ESTABLISHED:
1334 /* Socket is already connected */
1342 unix_state_lock_nested(sk);
1344 if (sk->sk_state != st) {
1345 unix_state_unlock(sk);
1346 unix_state_unlock(other);
1351 err = security_unix_stream_connect(sk, other, newsk);
1353 unix_state_unlock(sk);
1357 /* The way is open! Fastly set all the necessary fields... */
1360 unix_peer(newsk) = sk;
1361 newsk->sk_state = TCP_ESTABLISHED;
1362 newsk->sk_type = sk->sk_type;
1363 init_peercred(newsk);
1364 newu = unix_sk(newsk);
1365 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1366 otheru = unix_sk(other);
1368 /* copy address information from listening to new sock
1370 * The contents of *(otheru->addr) and otheru->path
1371 * are seen fully set up here, since we have found
1372 * otheru in hash under unix_table_lock. Insertion
1373 * into the hash chain we'd found it in had been done
1374 * in an earlier critical area protected by unix_table_lock,
1375 * the same one where we'd set *(otheru->addr) contents,
1376 * as well as otheru->path and otheru->addr itself.
1378 * Using smp_store_release() here to set newu->addr
1379 * is enough to make those stores, as well as stores
1380 * to newu->path visible to anyone who gets newu->addr
1381 * by smp_load_acquire(). IOW, the same warranties
1382 * as for unix_sock instances bound in unix_bind() or
1383 * in unix_autobind().
1385 if (otheru->path.dentry) {
1386 path_get(&otheru->path);
1387 newu->path = otheru->path;
1389 atomic_inc(&otheru->addr->refcnt);
1390 smp_store_release(&newu->addr, otheru->addr);
1392 /* Set credentials */
1393 copy_peercred(sk, other);
1395 sock->state = SS_CONNECTED;
1396 sk->sk_state = TCP_ESTABLISHED;
1399 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1400 unix_peer(sk) = newsk;
1402 unix_state_unlock(sk);
1404 /* take ten and send info to listening sock */
1405 spin_lock(&other->sk_receive_queue.lock);
1406 __skb_queue_tail(&other->sk_receive_queue, skb);
1407 spin_unlock(&other->sk_receive_queue.lock);
1408 unix_state_unlock(other);
1409 other->sk_data_ready(other);
1415 unix_state_unlock(other);
1420 unix_release_sock(newsk, 0);
1426 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1428 struct sock *ska = socka->sk, *skb = sockb->sk;
1430 /* Join our sockets back to back */
1433 unix_peer(ska) = skb;
1434 unix_peer(skb) = ska;
1438 if (ska->sk_type != SOCK_DGRAM) {
1439 ska->sk_state = TCP_ESTABLISHED;
1440 skb->sk_state = TCP_ESTABLISHED;
1441 socka->state = SS_CONNECTED;
1442 sockb->state = SS_CONNECTED;
1447 static void unix_sock_inherit_flags(const struct socket *old,
1450 if (test_bit(SOCK_PASSCRED, &old->flags))
1451 set_bit(SOCK_PASSCRED, &new->flags);
1452 if (test_bit(SOCK_PASSSEC, &old->flags))
1453 set_bit(SOCK_PASSSEC, &new->flags);
1456 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1458 struct sock *sk = sock->sk;
1460 struct sk_buff *skb;
1464 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1468 if (sk->sk_state != TCP_LISTEN)
1471 /* If socket state is TCP_LISTEN it cannot change (for now...),
1472 * so that no locks are necessary.
1475 skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1477 /* This means receive shutdown. */
1484 skb_free_datagram(sk, skb);
1485 wake_up_interruptible(&unix_sk(sk)->peer_wait);
1487 /* attach accepted sock to socket */
1488 unix_state_lock(tsk);
1489 newsock->state = SS_CONNECTED;
1490 unix_sock_inherit_flags(sock, newsock);
1491 sock_graft(tsk, newsock);
1492 unix_state_unlock(tsk);
1500 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1502 struct sock *sk = sock->sk;
1503 struct unix_address *addr;
1504 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1508 sk = unix_peer_get(sk);
1518 addr = smp_load_acquire(&unix_sk(sk)->addr);
1520 sunaddr->sun_family = AF_UNIX;
1521 sunaddr->sun_path[0] = 0;
1522 *uaddr_len = sizeof(short);
1524 *uaddr_len = addr->len;
1525 memcpy(sunaddr, addr->name, *uaddr_len);
1532 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1534 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1537 * Garbage collection of unix sockets starts by selecting a set of
1538 * candidate sockets which have reference only from being in flight
1539 * (total_refs == inflight_refs). This condition is checked once during
1540 * the candidate collection phase, and candidates are marked as such, so
1541 * that non-candidates can later be ignored. While inflight_refs is
1542 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1543 * is an instantaneous decision.
1545 * Once a candidate, however, the socket must not be reinstalled into a
1546 * file descriptor while the garbage collection is in progress.
1548 * If the above conditions are met, then the directed graph of
1549 * candidates (*) does not change while unix_gc_lock is held.
1551 * Any operations that changes the file count through file descriptors
1552 * (dup, close, sendmsg) does not change the graph since candidates are
1553 * not installed in fds.
1555 * Dequeing a candidate via recvmsg would install it into an fd, but
1556 * that takes unix_gc_lock to decrement the inflight count, so it's
1557 * serialized with garbage collection.
1559 * MSG_PEEK is special in that it does not change the inflight count,
1560 * yet does install the socket into an fd. The following lock/unlock
1561 * pair is to ensure serialization with garbage collection. It must be
1562 * done between incrementing the file count and installing the file into
1565 * If garbage collection starts after the barrier provided by the
1566 * lock/unlock, then it will see the elevated refcount and not mark this
1567 * as a candidate. If a garbage collection is already in progress
1568 * before the file count was incremented, then the lock/unlock pair will
1569 * ensure that garbage collection is finished before progressing to
1570 * installing the fd.
1572 * (*) A -> B where B is on the queue of A or B is on the queue of C
1573 * which is on the queue of listening socket A.
1575 spin_lock(&unix_gc_lock);
1576 spin_unlock(&unix_gc_lock);
1579 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1583 UNIXCB(skb).pid = get_pid(scm->pid);
1584 UNIXCB(skb).uid = scm->creds.uid;
1585 UNIXCB(skb).gid = scm->creds.gid;
1586 UNIXCB(skb).fp = NULL;
1587 unix_get_secdata(scm, skb);
1588 if (scm->fp && send_fds)
1589 err = unix_attach_fds(scm, skb);
1591 skb->destructor = unix_destruct_scm;
1595 static bool unix_passcred_enabled(const struct socket *sock,
1596 const struct sock *other)
1598 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1599 !other->sk_socket ||
1600 test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1604 * Some apps rely on write() giving SCM_CREDENTIALS
1605 * We include credentials if source or destination socket
1606 * asserted SOCK_PASSCRED.
1608 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1609 const struct sock *other)
1611 if (UNIXCB(skb).pid)
1613 if (unix_passcred_enabled(sock, other)) {
1614 UNIXCB(skb).pid = get_pid(task_tgid(current));
1615 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1619 static int maybe_init_creds(struct scm_cookie *scm,
1620 struct socket *socket,
1621 const struct sock *other)
1624 struct msghdr msg = { .msg_controllen = 0 };
1626 err = scm_send(socket, &msg, scm, false);
1630 if (unix_passcred_enabled(socket, other)) {
1631 scm->pid = get_pid(task_tgid(current));
1632 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1637 static bool unix_skb_scm_eq(struct sk_buff *skb,
1638 struct scm_cookie *scm)
1640 const struct unix_skb_parms *u = &UNIXCB(skb);
1642 return u->pid == scm->pid &&
1643 uid_eq(u->uid, scm->creds.uid) &&
1644 gid_eq(u->gid, scm->creds.gid) &&
1645 unix_secdata_eq(scm, skb);
1649 * Send AF_UNIX data.
1652 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1655 struct sock *sk = sock->sk;
1656 struct net *net = sock_net(sk);
1657 struct unix_sock *u = unix_sk(sk);
1658 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1659 struct sock *other = NULL;
1660 int namelen = 0; /* fake GCC */
1663 struct sk_buff *skb;
1665 struct scm_cookie scm;
1671 err = scm_send(sock, msg, &scm, false);
1676 if (msg->msg_flags&MSG_OOB)
1679 if (msg->msg_namelen) {
1680 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1687 other = unix_peer_get(sk);
1692 if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1693 && (err = unix_autobind(sock)) != 0)
1697 if (len > sk->sk_sndbuf - 32)
1700 if (len > SKB_MAX_ALLOC) {
1701 data_len = min_t(size_t,
1702 len - SKB_MAX_ALLOC,
1703 MAX_SKB_FRAGS * PAGE_SIZE);
1704 data_len = PAGE_ALIGN(data_len);
1706 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1709 skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1710 msg->msg_flags & MSG_DONTWAIT, &err,
1711 PAGE_ALLOC_COSTLY_ORDER);
1715 err = unix_scm_to_skb(&scm, skb, true);
1718 max_level = err + 1;
1720 skb_put(skb, len - data_len);
1721 skb->data_len = data_len;
1723 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1727 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1732 if (sunaddr == NULL)
1735 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1741 if (sk_filter(other, skb) < 0) {
1742 /* Toss the packet but do not return any error to the sender */
1748 unix_state_lock(other);
1751 if (!unix_may_send(sk, other))
1754 if (unlikely(sock_flag(other, SOCK_DEAD))) {
1756 * Check with 1003.1g - what should
1759 unix_state_unlock(other);
1763 unix_state_lock(sk);
1766 if (unix_peer(sk) == other) {
1767 unix_peer(sk) = NULL;
1768 unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1770 unix_state_unlock(sk);
1772 unix_dgram_disconnected(sk, other);
1774 err = -ECONNREFUSED;
1776 unix_state_unlock(sk);
1786 if (other->sk_shutdown & RCV_SHUTDOWN)
1789 if (sk->sk_type != SOCK_SEQPACKET) {
1790 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1795 /* other == sk && unix_peer(other) != sk if
1796 * - unix_peer(sk) == NULL, destination address bound to sk
1797 * - unix_peer(sk) == sk by time of get but disconnected before lock
1800 unlikely(unix_peer(other) != sk &&
1801 unix_recvq_full_lockless(other))) {
1803 timeo = unix_wait_for_peer(other, timeo);
1805 err = sock_intr_errno(timeo);
1806 if (signal_pending(current))
1813 unix_state_unlock(other);
1814 unix_state_double_lock(sk, other);
1817 if (unix_peer(sk) != other ||
1818 unix_dgram_peer_wake_me(sk, other)) {
1826 goto restart_locked;
1830 if (unlikely(sk_locked))
1831 unix_state_unlock(sk);
1833 if (sock_flag(other, SOCK_RCVTSTAMP))
1834 __net_timestamp(skb);
1835 maybe_add_creds(skb, sock, other);
1836 skb_queue_tail(&other->sk_receive_queue, skb);
1837 if (max_level > unix_sk(other)->recursion_level)
1838 unix_sk(other)->recursion_level = max_level;
1839 unix_state_unlock(other);
1840 other->sk_data_ready(other);
1847 unix_state_unlock(sk);
1848 unix_state_unlock(other);
1858 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1859 * bytes, and a minimun of a full page.
1861 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1863 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1866 struct sock *sk = sock->sk;
1867 struct sock *other = NULL;
1869 struct sk_buff *skb;
1871 struct scm_cookie scm;
1872 bool fds_sent = false;
1877 err = scm_send(sock, msg, &scm, false);
1882 if (msg->msg_flags&MSG_OOB)
1885 if (msg->msg_namelen) {
1886 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1890 other = unix_peer(sk);
1895 if (sk->sk_shutdown & SEND_SHUTDOWN)
1898 while (sent < len) {
1901 /* Keep two messages in the pipe so it schedules better */
1902 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1904 /* allow fallback to order-0 allocations */
1905 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1907 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1909 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1911 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1912 msg->msg_flags & MSG_DONTWAIT, &err,
1913 get_order(UNIX_SKB_FRAGS_SZ));
1917 /* Only send the fds in the first buffer */
1918 err = unix_scm_to_skb(&scm, skb, !fds_sent);
1923 max_level = err + 1;
1926 skb_put(skb, size - data_len);
1927 skb->data_len = data_len;
1929 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1935 unix_state_lock(other);
1937 if (sock_flag(other, SOCK_DEAD) ||
1938 (other->sk_shutdown & RCV_SHUTDOWN))
1941 maybe_add_creds(skb, sock, other);
1942 skb_queue_tail(&other->sk_receive_queue, skb);
1943 if (max_level > unix_sk(other)->recursion_level)
1944 unix_sk(other)->recursion_level = max_level;
1945 unix_state_unlock(other);
1946 other->sk_data_ready(other);
1955 unix_state_unlock(other);
1958 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1959 send_sig(SIGPIPE, current, 0);
1963 return sent ? : err;
1966 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1967 int offset, size_t size, int flags)
1970 bool send_sigpipe = false;
1971 bool init_scm = true;
1972 struct scm_cookie scm;
1973 struct sock *other, *sk = socket->sk;
1974 struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1976 if (flags & MSG_OOB)
1979 other = unix_peer(sk);
1980 if (!other || sk->sk_state != TCP_ESTABLISHED)
1985 unix_state_unlock(other);
1986 mutex_unlock(&unix_sk(other)->iolock);
1987 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1993 /* we must acquire iolock as we modify already present
1994 * skbs in the sk_receive_queue and mess with skb->len
1996 err = mutex_lock_interruptible(&unix_sk(other)->iolock);
1998 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2002 if (sk->sk_shutdown & SEND_SHUTDOWN) {
2004 send_sigpipe = true;
2008 unix_state_lock(other);
2010 if (sock_flag(other, SOCK_DEAD) ||
2011 other->sk_shutdown & RCV_SHUTDOWN) {
2013 send_sigpipe = true;
2014 goto err_state_unlock;
2018 err = maybe_init_creds(&scm, socket, other);
2020 goto err_state_unlock;
2024 skb = skb_peek_tail(&other->sk_receive_queue);
2025 if (tail && tail == skb) {
2027 } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2034 } else if (newskb) {
2035 /* this is fast path, we don't necessarily need to
2036 * call to kfree_skb even though with newskb == NULL
2037 * this - does no harm
2039 consume_skb(newskb);
2043 if (skb_append_pagefrags(skb, page, offset, size)) {
2049 skb->data_len += size;
2050 skb->truesize += size;
2051 atomic_add(size, &sk->sk_wmem_alloc);
2054 err = unix_scm_to_skb(&scm, skb, false);
2056 goto err_state_unlock;
2057 spin_lock(&other->sk_receive_queue.lock);
2058 __skb_queue_tail(&other->sk_receive_queue, newskb);
2059 spin_unlock(&other->sk_receive_queue.lock);
2062 unix_state_unlock(other);
2063 mutex_unlock(&unix_sk(other)->iolock);
2065 other->sk_data_ready(other);
2070 unix_state_unlock(other);
2072 mutex_unlock(&unix_sk(other)->iolock);
2075 if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2076 send_sig(SIGPIPE, current, 0);
2082 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2086 struct sock *sk = sock->sk;
2088 err = sock_error(sk);
2092 if (sk->sk_state != TCP_ESTABLISHED)
2095 if (msg->msg_namelen)
2096 msg->msg_namelen = 0;
2098 return unix_dgram_sendmsg(sock, msg, len);
2101 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2102 size_t size, int flags)
2104 struct sock *sk = sock->sk;
2106 if (sk->sk_state != TCP_ESTABLISHED)
2109 return unix_dgram_recvmsg(sock, msg, size, flags);
2112 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2114 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2117 msg->msg_namelen = addr->len;
2118 memcpy(msg->msg_name, addr->name, addr->len);
2122 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2123 size_t size, int flags)
2125 struct scm_cookie scm;
2126 struct sock *sk = sock->sk;
2127 struct unix_sock *u = unix_sk(sk);
2128 struct sk_buff *skb, *last;
2137 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2140 mutex_lock(&u->iolock);
2142 skip = sk_peek_offset(sk, flags);
2143 skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
2148 mutex_unlock(&u->iolock);
2153 !__skb_wait_for_more_packets(sk, &err, &timeo, last));
2155 if (!skb) { /* implies iolock unlocked */
2156 unix_state_lock(sk);
2157 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2158 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2159 (sk->sk_shutdown & RCV_SHUTDOWN))
2161 unix_state_unlock(sk);
2165 if (wq_has_sleeper(&u->peer_wait))
2166 wake_up_interruptible_sync_poll(&u->peer_wait,
2167 POLLOUT | POLLWRNORM |
2171 unix_copy_addr(msg, skb->sk);
2173 if (size > skb->len - skip)
2174 size = skb->len - skip;
2175 else if (size < skb->len - skip)
2176 msg->msg_flags |= MSG_TRUNC;
2178 err = skb_copy_datagram_msg(skb, skip, msg, size);
2182 if (sock_flag(sk, SOCK_RCVTSTAMP))
2183 __sock_recv_timestamp(msg, sk, skb);
2185 memset(&scm, 0, sizeof(scm));
2187 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2188 unix_set_secdata(&scm, skb);
2190 if (!(flags & MSG_PEEK)) {
2192 unix_detach_fds(&scm, skb);
2194 sk_peek_offset_bwd(sk, skb->len);
2196 /* It is questionable: on PEEK we could:
2197 - do not return fds - good, but too simple 8)
2198 - return fds, and do not return them on read (old strategy,
2200 - clone fds (I chose it for now, it is the most universal
2203 POSIX 1003.1g does not actually define this clearly
2204 at all. POSIX 1003.1g doesn't define a lot of things
2209 sk_peek_offset_fwd(sk, size);
2212 unix_peek_fds(&scm, skb);
2214 err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2216 scm_recv(sock, msg, &scm, flags);
2219 skb_free_datagram(sk, skb);
2220 mutex_unlock(&u->iolock);
2226 * Sleep until more data has arrived. But check for races..
2228 static long unix_stream_data_wait(struct sock *sk, long timeo,
2229 struct sk_buff *last, unsigned int last_len,
2232 struct sk_buff *tail;
2235 unix_state_lock(sk);
2238 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2240 tail = skb_peek_tail(&sk->sk_receive_queue);
2242 (tail && tail->len != last_len) ||
2244 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2245 signal_pending(current) ||
2249 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2250 unix_state_unlock(sk);
2252 timeo = freezable_schedule_timeout(timeo);
2254 timeo = schedule_timeout(timeo);
2255 unix_state_lock(sk);
2257 if (sock_flag(sk, SOCK_DEAD))
2260 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2263 finish_wait(sk_sleep(sk), &wait);
2264 unix_state_unlock(sk);
2268 static unsigned int unix_skb_len(const struct sk_buff *skb)
2270 return skb->len - UNIXCB(skb).consumed;
2273 struct unix_stream_read_state {
2274 int (*recv_actor)(struct sk_buff *, int, int,
2275 struct unix_stream_read_state *);
2276 struct socket *socket;
2278 struct pipe_inode_info *pipe;
2281 unsigned int splice_flags;
2284 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2287 struct scm_cookie scm;
2288 struct socket *sock = state->socket;
2289 struct sock *sk = sock->sk;
2290 struct unix_sock *u = unix_sk(sk);
2292 int flags = state->flags;
2293 int noblock = flags & MSG_DONTWAIT;
2294 bool check_creds = false;
2299 size_t size = state->size;
2300 unsigned int last_len;
2302 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2307 if (unlikely(flags & MSG_OOB)) {
2312 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2313 timeo = sock_rcvtimeo(sk, noblock);
2315 memset(&scm, 0, sizeof(scm));
2317 /* Lock the socket to prevent queue disordering
2318 * while sleeps in memcpy_tomsg
2320 mutex_lock(&u->iolock);
2322 if (flags & MSG_PEEK)
2323 skip = sk_peek_offset(sk, flags);
2330 struct sk_buff *skb, *last;
2333 unix_state_lock(sk);
2334 if (sock_flag(sk, SOCK_DEAD)) {
2338 last = skb = skb_peek(&sk->sk_receive_queue);
2339 last_len = last ? last->len : 0;
2342 unix_sk(sk)->recursion_level = 0;
2343 if (copied >= target)
2347 * POSIX 1003.1g mandates this order.
2350 err = sock_error(sk);
2353 if (sk->sk_shutdown & RCV_SHUTDOWN)
2356 unix_state_unlock(sk);
2362 mutex_unlock(&u->iolock);
2364 timeo = unix_stream_data_wait(sk, timeo, last,
2365 last_len, freezable);
2367 if (signal_pending(current)) {
2368 err = sock_intr_errno(timeo);
2373 mutex_lock(&u->iolock);
2376 unix_state_unlock(sk);
2380 while (skip >= unix_skb_len(skb)) {
2381 skip -= unix_skb_len(skb);
2383 last_len = skb->len;
2384 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2389 unix_state_unlock(sk);
2392 /* Never glue messages from different writers */
2393 if (!unix_skb_scm_eq(skb, &scm))
2395 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2396 /* Copy credentials */
2397 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2398 unix_set_secdata(&scm, skb);
2402 /* Copy address just once */
2403 if (state->msg && state->msg->msg_name) {
2404 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2405 state->msg->msg_name);
2406 unix_copy_addr(state->msg, skb->sk);
2410 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2412 chunk = state->recv_actor(skb, skip, chunk, state);
2413 drop_skb = !unix_skb_len(skb);
2414 /* skb is only safe to use if !drop_skb */
2425 /* the skb was touched by a concurrent reader;
2426 * we should not expect anything from this skb
2427 * anymore and assume it invalid - we can be
2428 * sure it was dropped from the socket queue
2430 * let's report a short read
2436 /* Mark read part of skb as used */
2437 if (!(flags & MSG_PEEK)) {
2438 UNIXCB(skb).consumed += chunk;
2440 sk_peek_offset_bwd(sk, chunk);
2443 unix_detach_fds(&scm, skb);
2445 if (unix_skb_len(skb))
2448 skb_unlink(skb, &sk->sk_receive_queue);
2454 /* It is questionable, see note in unix_dgram_recvmsg.
2457 unix_peek_fds(&scm, skb);
2459 sk_peek_offset_fwd(sk, chunk);
2466 last_len = skb->len;
2467 unix_state_lock(sk);
2468 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2471 unix_state_unlock(sk);
2476 mutex_unlock(&u->iolock);
2478 scm_recv(sock, state->msg, &scm, flags);
2482 return copied ? : err;
2485 static int unix_stream_read_actor(struct sk_buff *skb,
2486 int skip, int chunk,
2487 struct unix_stream_read_state *state)
2491 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2493 return ret ?: chunk;
2496 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2497 size_t size, int flags)
2499 struct unix_stream_read_state state = {
2500 .recv_actor = unix_stream_read_actor,
2507 return unix_stream_read_generic(&state, true);
2510 static int unix_stream_splice_actor(struct sk_buff *skb,
2511 int skip, int chunk,
2512 struct unix_stream_read_state *state)
2514 return skb_splice_bits(skb, state->socket->sk,
2515 UNIXCB(skb).consumed + skip,
2516 state->pipe, chunk, state->splice_flags);
2519 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2520 struct pipe_inode_info *pipe,
2521 size_t size, unsigned int flags)
2523 struct unix_stream_read_state state = {
2524 .recv_actor = unix_stream_splice_actor,
2528 .splice_flags = flags,
2531 if (unlikely(*ppos))
2534 if (sock->file->f_flags & O_NONBLOCK ||
2535 flags & SPLICE_F_NONBLOCK)
2536 state.flags = MSG_DONTWAIT;
2538 return unix_stream_read_generic(&state, false);
2541 static int unix_shutdown(struct socket *sock, int mode)
2543 struct sock *sk = sock->sk;
2546 if (mode < SHUT_RD || mode > SHUT_RDWR)
2549 * SHUT_RD (0) -> RCV_SHUTDOWN (1)
2550 * SHUT_WR (1) -> SEND_SHUTDOWN (2)
2551 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2555 unix_state_lock(sk);
2556 sk->sk_shutdown |= mode;
2557 other = unix_peer(sk);
2560 unix_state_unlock(sk);
2561 sk->sk_state_change(sk);
2564 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2568 if (mode&RCV_SHUTDOWN)
2569 peer_mode |= SEND_SHUTDOWN;
2570 if (mode&SEND_SHUTDOWN)
2571 peer_mode |= RCV_SHUTDOWN;
2572 unix_state_lock(other);
2573 other->sk_shutdown |= peer_mode;
2574 unix_state_unlock(other);
2575 other->sk_state_change(other);
2576 if (peer_mode == SHUTDOWN_MASK)
2577 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2578 else if (peer_mode & RCV_SHUTDOWN)
2579 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2587 long unix_inq_len(struct sock *sk)
2589 struct sk_buff *skb;
2592 if (sk->sk_state == TCP_LISTEN)
2595 spin_lock(&sk->sk_receive_queue.lock);
2596 if (sk->sk_type == SOCK_STREAM ||
2597 sk->sk_type == SOCK_SEQPACKET) {
2598 skb_queue_walk(&sk->sk_receive_queue, skb)
2599 amount += unix_skb_len(skb);
2601 skb = skb_peek(&sk->sk_receive_queue);
2605 spin_unlock(&sk->sk_receive_queue.lock);
2609 EXPORT_SYMBOL_GPL(unix_inq_len);
2611 long unix_outq_len(struct sock *sk)
2613 return sk_wmem_alloc_get(sk);
2615 EXPORT_SYMBOL_GPL(unix_outq_len);
2617 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2619 struct sock *sk = sock->sk;
2625 amount = unix_outq_len(sk);
2626 err = put_user(amount, (int __user *)arg);
2629 amount = unix_inq_len(sk);
2633 err = put_user(amount, (int __user *)arg);
2642 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2644 struct sock *sk = sock->sk;
2647 sock_poll_wait(file, sk_sleep(sk), wait);
2650 /* exceptional events? */
2653 if (sk->sk_shutdown == SHUTDOWN_MASK)
2655 if (sk->sk_shutdown & RCV_SHUTDOWN)
2656 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2659 if (!skb_queue_empty(&sk->sk_receive_queue))
2660 mask |= POLLIN | POLLRDNORM;
2662 /* Connection-based need to check for termination and startup */
2663 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2664 sk->sk_state == TCP_CLOSE)
2668 * we set writable also when the other side has shut down the
2669 * connection. This prevents stuck sockets.
2671 if (unix_writable(sk))
2672 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2677 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2680 struct sock *sk = sock->sk, *other;
2681 unsigned int mask, writable;
2683 sock_poll_wait(file, sk_sleep(sk), wait);
2686 /* exceptional events? */
2687 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2689 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2691 if (sk->sk_shutdown & RCV_SHUTDOWN)
2692 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2693 if (sk->sk_shutdown == SHUTDOWN_MASK)
2697 if (!skb_queue_empty(&sk->sk_receive_queue))
2698 mask |= POLLIN | POLLRDNORM;
2700 /* Connection-based need to check for termination and startup */
2701 if (sk->sk_type == SOCK_SEQPACKET) {
2702 if (sk->sk_state == TCP_CLOSE)
2704 /* connection hasn't started yet? */
2705 if (sk->sk_state == TCP_SYN_SENT)
2709 /* No write status requested, avoid expensive OUT tests. */
2710 if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2713 writable = unix_writable(sk);
2715 unix_state_lock(sk);
2717 other = unix_peer(sk);
2718 if (other && unix_peer(other) != sk &&
2719 unix_recvq_full_lockless(other) &&
2720 unix_dgram_peer_wake_me(sk, other))
2723 unix_state_unlock(sk);
2727 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2729 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2734 #ifdef CONFIG_PROC_FS
2736 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2738 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2739 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2740 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2742 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2744 unsigned long offset = get_offset(*pos);
2745 unsigned long bucket = get_bucket(*pos);
2747 unsigned long count = 0;
2749 for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2750 if (sock_net(sk) != seq_file_net(seq))
2752 if (++count == offset)
2759 static struct sock *unix_next_socket(struct seq_file *seq,
2763 unsigned long bucket;
2765 while (sk > (struct sock *)SEQ_START_TOKEN) {
2769 if (sock_net(sk) == seq_file_net(seq))
2774 sk = unix_from_bucket(seq, pos);
2779 bucket = get_bucket(*pos) + 1;
2780 *pos = set_bucket_offset(bucket, 1);
2781 } while (bucket < ARRAY_SIZE(unix_socket_table));
2786 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2787 __acquires(unix_table_lock)
2789 spin_lock(&unix_table_lock);
2792 return SEQ_START_TOKEN;
2794 if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2797 return unix_next_socket(seq, NULL, pos);
2800 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2803 return unix_next_socket(seq, v, pos);
2806 static void unix_seq_stop(struct seq_file *seq, void *v)
2807 __releases(unix_table_lock)
2809 spin_unlock(&unix_table_lock);
2812 static int unix_seq_show(struct seq_file *seq, void *v)
2815 if (v == SEQ_START_TOKEN)
2816 seq_puts(seq, "Num RefCount Protocol Flags Type St "
2820 struct unix_sock *u = unix_sk(s);
2823 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2825 atomic_read(&s->sk_refcnt),
2827 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2830 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2831 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2834 if (u->addr) { // under unix_table_lock here
2839 len = u->addr->len - sizeof(short);
2840 if (!UNIX_ABSTRACT(s))
2846 for ( ; i < len; i++)
2847 seq_putc(seq, u->addr->name->sun_path[i] ?:
2850 unix_state_unlock(s);
2851 seq_putc(seq, '\n');
2857 static const struct seq_operations unix_seq_ops = {
2858 .start = unix_seq_start,
2859 .next = unix_seq_next,
2860 .stop = unix_seq_stop,
2861 .show = unix_seq_show,
2864 static int unix_seq_open(struct inode *inode, struct file *file)
2866 return seq_open_net(inode, file, &unix_seq_ops,
2867 sizeof(struct seq_net_private));
2870 static const struct file_operations unix_seq_fops = {
2871 .owner = THIS_MODULE,
2872 .open = unix_seq_open,
2874 .llseek = seq_lseek,
2875 .release = seq_release_net,
2880 static const struct net_proto_family unix_family_ops = {
2882 .create = unix_create,
2883 .owner = THIS_MODULE,
2887 static int __net_init unix_net_init(struct net *net)
2889 int error = -ENOMEM;
2891 net->unx.sysctl_max_dgram_qlen = 10;
2892 if (unix_sysctl_register(net))
2895 #ifdef CONFIG_PROC_FS
2896 if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2897 unix_sysctl_unregister(net);
2906 static void __net_exit unix_net_exit(struct net *net)
2908 unix_sysctl_unregister(net);
2909 remove_proc_entry("unix", net->proc_net);
2912 static struct pernet_operations unix_net_ops = {
2913 .init = unix_net_init,
2914 .exit = unix_net_exit,
2917 static int __init af_unix_init(void)
2921 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2923 rc = proto_register(&unix_proto, 1);
2925 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2929 sock_register(&unix_family_ops);
2930 register_pernet_subsys(&unix_net_ops);
2935 static void __exit af_unix_exit(void)
2937 sock_unregister(PF_UNIX);
2938 proto_unregister(&unix_proto);
2939 unregister_pernet_subsys(&unix_net_ops);
2942 /* Earlier than device_initcall() so that other drivers invoking
2943 request_module() don't end up in a loop when modprobe tries
2944 to use a UNIX socket. But later than subsys_initcall() because
2945 we depend on stuff initialised there */
2946 fs_initcall(af_unix_init);
2947 module_exit(af_unix_exit);
2949 MODULE_LICENSE("GPL");
2950 MODULE_ALIAS_NETPROTO(PF_UNIX);