net/unix/af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
  28  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  29  *                                      by the above two patches.
  30  *           Andrea Arcangeli   :       If possible we block in connect(2)
  31  *                                      if the max backlog of the listen socket
  32  *                                      has been reached. This won't break
  33  *                                      old apps and it will avoid a huge number
  34  *                                      of socks being hashed (this is for unix_gc()
  35  *                                      performance reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/termios.h>
  93 #include <linux/sockios.h>
  94 #include <linux/net.h>
  95 #include <linux/in.h>
  96 #include <linux/fs.h>
  97 #include <linux/slab.h>
  98 #include <linux/uaccess.h>
  99 #include <linux/skbuff.h>
 100 #include <linux/netdevice.h>
 101 #include <net/net_namespace.h>
 102 #include <net/sock.h>
 103 #include <net/tcp_states.h>
 104 #include <net/af_unix.h>
 105 #include <linux/proc_fs.h>
 106 #include <linux/seq_file.h>
 107 #include <net/scm.h>
 108 #include <linux/init.h>
 109 #include <linux/poll.h>
 110 #include <linux/rtnetlink.h>
 111 #include <linux/mount.h>
 112 #include <net/checksum.h>
 113 #include <linux/security.h>
 114 #include <linux/freezer.h>
 115 #include <linux/file.h>
 116
 117 #include "scm.h"
 118
/*
 * Global table of AF_UNIX sockets; the SMP locking note in this file says
 * it is protected by unix_table_lock.  unix_sockets_unbound() hands out
 * buckets from the upper half (index UNIX_HASH_SIZE and above).
 */
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
/* Number of unix sockets: raised in unix_create1(), lowered in unix_sock_destructor(). */
static atomic_long_t unix_nr_socks;
 124
 125
 126 static struct hlist_head *unix_sockets_unbound(void *addr)
 127 {
 128         unsigned long hash = (unsigned long)addr;
 129
 130         hash ^= hash >> 16;
 131         hash ^= hash >> 8;
 132         hash %= UNIX_HASH_SIZE;
 133         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 134 }
 135
/*
 * True when the hash stored in sk's bound address is below UNIX_HASH_SIZE.
 * Dereferences ->addr unconditionally, so sk must already have an address.
 * NOTE(review): going by the name this identifies abstract-namespace
 * sockets - confirm against where addr->hash is assigned.
 */
#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 137
 138 #ifdef CONFIG_SECURITY_NETWORK
 139 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 140 {
 141         UNIXCB(skb).secid = scm->secid;
 142 }
 143
 144 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 145 {
 146         scm->secid = UNIXCB(skb).secid;
 147 }
 148
 149 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 150 {
 151         return (scm->secid == UNIXCB(skb).secid);
 152 }
 153 #else
 154 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 155 { }
 156
 157 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 158 { }
 159
 160 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 161 {
 162         return true;
 163 }
 164 #endif /* CONFIG_SECURITY_NETWORK */
 165
 166 /*
 167  *  SMP locking strategy:
 168  *    hash table is protected with spinlock unix_table_lock
 169  *    each socket state is protected by separate spin lock.
 170  */
 171
 172 static inline unsigned int unix_hash_fold(__wsum n)
 173 {
 174         unsigned int hash = (__force unsigned int)csum_fold(n);
 175
 176         hash ^= hash>>8;
 177         return hash&(UNIX_HASH_SIZE-1);
 178 }
 179
/* Accessor for the peer pointer of a unix socket; expands to an lvalue, so it is also assigned through. */
#define unix_peer(sk) (unix_sk(sk)->peer)
 181
 182 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 183 {
 184         return unix_peer(osk) == sk;
 185 }
 186
 187 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 188 {
 189         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 190 }
 191
 192 static inline int unix_recvq_full(const struct sock *sk)
 193 {
 194         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 195 }
 196
 197 static inline int unix_recvq_full_lockless(const struct sock *sk)
 198 {
 199         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 200                 READ_ONCE(sk->sk_max_ack_backlog);
 201 }
 202
 203 struct sock *unix_peer_get(struct sock *s)
 204 {
 205         struct sock *peer;
 206
 207         unix_state_lock(s);
 208         peer = unix_peer(s);
 209         if (peer)
 210                 sock_hold(peer);
 211         unix_state_unlock(s);
 212         return peer;
 213 }
 214 EXPORT_SYMBOL_GPL(unix_peer_get);
 215
 216 static inline void unix_release_addr(struct unix_address *addr)
 217 {
 218         if (refcount_dec_and_test(&addr->refcnt))
 219                 kfree(addr);
 220 }
 221
 222 /*
 223  *      Check unix socket name:
 224  *              - should be not zero length.
 225  *              - if started by not zero, should be NULL terminated (FS object)
 226  *              - if started by zero, it is abstract name.
 227  */
 228
 229 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 230 {
 231         *hashp = 0;
 232
 233         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 234                 return -EINVAL;
 235         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 236                 return -EINVAL;
 237         if (sunaddr->sun_path[0]) {
 238                 /*
 239                  * This may look like an off by one error but it is a bit more
 240                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 241                  * sun_path[108] doesn't as such exist.  However in kernel space
 242                  * we are guaranteed that it is a valid memory location in our
 243                  * kernel address buffer.
 244                  */
 245                 ((char *)sunaddr)[len] = 0;
 246                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 247                 return len;
 248         }
 249
 250         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 251         return len;
 252 }
 253
 254 static void __unix_remove_socket(struct sock *sk)
 255 {
 256         sk_del_node_init(sk);
 257 }
 258
 259 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 260 {
 261         WARN_ON(!sk_unhashed(sk));
 262         sk_add_node(sk, list);
 263 }
 264
 265 static inline void unix_remove_socket(struct sock *sk)
 266 {
 267         spin_lock(&unix_table_lock);
 268         __unix_remove_socket(sk);
 269         spin_unlock(&unix_table_lock);
 270 }
 271
 272 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 273 {
 274         spin_lock(&unix_table_lock);
 275         __unix_insert_socket(list, sk);
 276         spin_unlock(&unix_table_lock);
 277 }
 278
 279 static struct sock *__unix_find_socket_byname(struct net *net,
 280                                               struct sockaddr_un *sunname,
 281                                               int len, int type, unsigned int hash)
 282 {
 283         struct sock *s;
 284
 285         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 286                 struct unix_sock *u = unix_sk(s);
 287
 288                 if (!net_eq(sock_net(s), net))
 289                         continue;
 290
 291                 if (u->addr->len == len &&
 292                     !memcmp(u->addr->name, sunname, len))
 293                         return s;
 294         }
 295         return NULL;
 296 }
 297
 298 static inline struct sock *unix_find_socket_byname(struct net *net,
 299                                                    struct sockaddr_un *sunname,
 300                                                    int len, int type,
 301                                                    unsigned int hash)
 302 {
 303         struct sock *s;
 304
 305         spin_lock(&unix_table_lock);
 306         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 307         if (s)
 308                 sock_hold(s);
 309         spin_unlock(&unix_table_lock);
 310         return s;
 311 }
 312
 313 static struct sock *unix_find_socket_byinode(struct inode *i)
 314 {
 315         struct sock *s;
 316
 317         spin_lock(&unix_table_lock);
 318         sk_for_each(s,
 319                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 320                 struct dentry *dentry = unix_sk(s)->path.dentry;
 321
 322                 if (dentry && d_backing_inode(dentry) == i) {
 323                         sock_hold(s);
 324                         goto found;
 325                 }
 326         }
 327         s = NULL;
 328 found:
 329         spin_unlock(&unix_table_lock);
 330         return s;
 331 }
 332
 333 /* Support code for asymmetrically connected dgram sockets
 334  *
 335  * If a datagram socket is connected to a socket not itself connected
 336  * to the first socket (eg, /dev/log), clients may only enqueue more
 337  * messages if the present receive queue of the server socket is not
 338  * "too large". This means there's a second writeability condition
 339  * poll and sendmsg need to test. The dgram recv code will do a wake
 340  * up on the peer_wait wait queue of a socket upon reception of a
 341  * datagram which needs to be propagated to sleeping would-be writers
 342  * since these might not have sent anything so far. This can't be
 343  * accomplished via poll_wait because the lifetime of the server
 344  * socket might be less than that of its clients if these break their
 345  * association with it or if the server socket is closed while clients
 346  * are still connected to it and there's no way to inform "a polling
 347  * implementation" that it should let go of a certain wait queue
 348  *
 349  * In order to propagate a wake up, a wait_queue_entry_t of the client
 350  * socket is enqueued on the peer_wait queue of the server socket
 351  * whose wake function does a wake_up on the ordinary client socket
 352  * wait queue. This connection is established whenever a write (or
 353  * poll for write) hit the flow control condition and broken when the
 354  * association to the server socket is dissolved or after a wake up
 355  * was relayed.
 356  */
 357
 358 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 359                                       void *key)
 360 {
 361         struct unix_sock *u;
 362         wait_queue_head_t *u_sleep;
 363
 364         u = container_of(q, struct unix_sock, peer_wake);
 365
 366         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 367                             q);
 368         u->peer_wake.private = NULL;
 369
 370         /* relaying can only happen while the wq still exists */
 371         u_sleep = sk_sleep(&u->sk);
 372         if (u_sleep)
 373                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 374
 375         return 0;
 376 }
 377
 378 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 379 {
 380         struct unix_sock *u, *u_other;
 381         int rc;
 382
 383         u = unix_sk(sk);
 384         u_other = unix_sk(other);
 385         rc = 0;
 386         spin_lock(&u_other->peer_wait.lock);
 387
 388         if (!u->peer_wake.private) {
 389                 u->peer_wake.private = other;
 390                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 391
 392                 rc = 1;
 393         }
 394
 395         spin_unlock(&u_other->peer_wait.lock);
 396         return rc;
 397 }
 398
 399 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 400                                             struct sock *other)
 401 {
 402         struct unix_sock *u, *u_other;
 403
 404         u = unix_sk(sk);
 405         u_other = unix_sk(other);
 406         spin_lock(&u_other->peer_wait.lock);
 407
 408         if (u->peer_wake.private == other) {
 409                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 410                 u->peer_wake.private = NULL;
 411         }
 412
 413         spin_unlock(&u_other->peer_wait.lock);
 414 }
 415
 416 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 417                                                    struct sock *other)
 418 {
 419         unix_dgram_peer_wake_disconnect(sk, other);
 420         wake_up_interruptible_poll(sk_sleep(sk),
 421                                    EPOLLOUT |
 422                                    EPOLLWRNORM |
 423                                    EPOLLWRBAND);
 424 }
 425
 426 /* preconditions:
 427  *      - unix_peer(sk) == other
 428  *      - association is stable
 429  */
 430 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 431 {
 432         int connected;
 433
 434         connected = unix_dgram_peer_wake_connect(sk, other);
 435
 436         /* If other is SOCK_DEAD, we want to make sure we signal
 437          * POLLOUT, such that a subsequent write() can get a
 438          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 439          * to other and its full, we will hang waiting for POLLOUT.
 440          */
 441         if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
 442                 return 1;
 443
 444         if (connected)
 445                 unix_dgram_peer_wake_disconnect(sk, other);
 446
 447         return 0;
 448 }
 449
 450 static int unix_writable(const struct sock *sk)
 451 {
 452         return sk->sk_state != TCP_LISTEN &&
 453                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 454 }
 455
 456 static void unix_write_space(struct sock *sk)
 457 {
 458         struct socket_wq *wq;
 459
 460         rcu_read_lock();
 461         if (unix_writable(sk)) {
 462                 wq = rcu_dereference(sk->sk_wq);
 463                 if (skwq_has_sleeper(wq))
 464                         wake_up_interruptible_sync_poll(&wq->wait,
 465                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 466                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 467         }
 468         rcu_read_unlock();
 469 }
 470
 471 /* When dgram socket disconnects (or changes its peer), we clear its receive
 472  * queue of packets arrived from previous peer. First, it allows to do
 473  * flow control based only on wmem_alloc; second, sk connected to peer
 474  * may receive messages only from that peer. */
 475 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 476 {
 477         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 478                 skb_queue_purge(&sk->sk_receive_queue);
 479                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 480
 481                 /* If one link of bidirectional dgram pipe is disconnected,
 482                  * we signal error. Messages are lost. Do not make this,
 483                  * when peer was not connected to us.
 484                  */
 485                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 486                         other->sk_err = ECONNRESET;
 487                         other->sk_error_report(other);
 488                 }
 489         }
 490 }
 491
 492 static void unix_sock_destructor(struct sock *sk)
 493 {
 494         struct unix_sock *u = unix_sk(sk);
 495
 496         skb_queue_purge(&sk->sk_receive_queue);
 497
 498         WARN_ON(refcount_read(&sk->sk_wmem_alloc));
 499         WARN_ON(!sk_unhashed(sk));
 500         WARN_ON(sk->sk_socket);
 501         if (!sock_flag(sk, SOCK_DEAD)) {
 502                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 503                 return;
 504         }
 505
 506         if (u->addr)
 507                 unix_release_addr(u->addr);
 508
 509         atomic_long_dec(&unix_nr_socks);
 510         local_bh_disable();
 511         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 512         local_bh_enable();
 513 #ifdef UNIX_REFCNT_DEBUG
 514         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 515                 atomic_long_read(&unix_nr_socks));
 516 #endif
 517 }
 518
 519 static void unix_release_sock(struct sock *sk, int embrion)
 520 {
 521         struct unix_sock *u = unix_sk(sk);
 522         struct path path;
 523         struct sock *skpair;
 524         struct sk_buff *skb;
 525         int state;
 526
 527         unix_remove_socket(sk);
 528
 529         /* Clear state */
 530         unix_state_lock(sk);
 531         sock_orphan(sk);
 532         sk->sk_shutdown = SHUTDOWN_MASK;
 533         path         = u->path;
 534         u->path.dentry = NULL;
 535         u->path.mnt = NULL;
 536         state = sk->sk_state;
 537         sk->sk_state = TCP_CLOSE;
 538
 539         skpair = unix_peer(sk);
 540         unix_peer(sk) = NULL;
 541
 542         unix_state_unlock(sk);
 543
 544         wake_up_interruptible_all(&u->peer_wait);
 545
 546         if (skpair != NULL) {
 547                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 548                         unix_state_lock(skpair);
 549                         /* No more writes */
 550                         skpair->sk_shutdown = SHUTDOWN_MASK;
 551                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 552                                 skpair->sk_err = ECONNRESET;
 553                         unix_state_unlock(skpair);
 554                         skpair->sk_state_change(skpair);
 555                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 556                 }
 557
 558                 unix_dgram_peer_wake_disconnect(sk, skpair);
 559                 sock_put(skpair); /* It may now die */
 560         }
 561
 562         /* Try to flush out this socket. Throw out buffers at least */
 563
 564         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 565                 if (state == TCP_LISTEN)
 566                         unix_release_sock(skb->sk, 1);
 567                 /* passed fds are erased in the kfree_skb hook        */
 568                 UNIXCB(skb).consumed = skb->len;
 569                 kfree_skb(skb);
 570         }
 571
 572         if (path.dentry)
 573                 path_put(&path);
 574
 575         sock_put(sk);
 576
 577         /* ---- Socket is dead now and most probably destroyed ---- */
 578
 579         /*
 580          * Fixme: BSD difference: In BSD all sockets connected to us get
 581          *        ECONNRESET and we die on the spot. In Linux we behave
 582          *        like files and pipes do and wait for the last
 583          *        dereference.
 584          *
 585          * Can't we simply set sock->err?
 586          *
 587          *        What the above comment does talk about? --ANK(980817)
 588          */
 589
 590         if (unix_tot_inflight)
 591                 unix_gc();              /* Garbage collect fds */
 592 }
 593
 594 static void init_peercred(struct sock *sk)
 595 {
 596         put_pid(sk->sk_peer_pid);
 597         if (sk->sk_peer_cred)
 598                 put_cred(sk->sk_peer_cred);
 599         sk->sk_peer_pid  = get_pid(task_tgid(current));
 600         sk->sk_peer_cred = get_current_cred();
 601 }
 602
 603 static void copy_peercred(struct sock *sk, struct sock *peersk)
 604 {
 605         put_pid(sk->sk_peer_pid);
 606         if (sk->sk_peer_cred)
 607                 put_cred(sk->sk_peer_cred);
 608         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 609         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 610 }
 611
 612 static int unix_listen(struct socket *sock, int backlog)
 613 {
 614         int err;
 615         struct sock *sk = sock->sk;
 616         struct unix_sock *u = unix_sk(sk);
 617
 618         err = -EOPNOTSUPP;
 619         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 620                 goto out;       /* Only stream/seqpacket sockets accept */
 621         err = -EINVAL;
 622         if (!u->addr)
 623                 goto out;       /* No listens on an unbound socket */
 624         unix_state_lock(sk);
 625         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 626                 goto out_unlock;
 627         if (backlog > sk->sk_max_ack_backlog)
 628                 wake_up_interruptible_all(&u->peer_wait);
 629         sk->sk_max_ack_backlog  = backlog;
 630         sk->sk_state            = TCP_LISTEN;
 631         /* set credentials so connect can copy them */
 632         init_peercred(sk);
 633         err = 0;
 634
 635 out_unlock:
 636         unix_state_unlock(sk);
 637 out:
 638         return err;
 639 }
 640
/*
 * Forward declarations for the socket operations that the proto_ops
 * tables below refer to.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);
 670
 671 static int unix_set_peek_off(struct sock *sk, int val)
 672 {
 673         struct unix_sock *u = unix_sk(sk);
 674
 675         if (mutex_lock_interruptible(&u->iolock))
 676                 return -EINTR;
 677
 678         sk->sk_peek_off = val;
 679         mutex_unlock(&u->iolock);
 680
 681         return 0;
 682 }
 683
 684 #ifdef CONFIG_PROC_FS
 685 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 686 {
 687         struct sock *sk = sock->sk;
 688         struct unix_sock *u;
 689
 690         if (sk) {
 691                 u = unix_sk(sock->sk);
 692                 seq_printf(m, "scm_fds: %u\n",
 693                            atomic_read(&u->scm_stat.nr_fds));
 694         }
 695 }
 696 #else
 697 #define unix_show_fdinfo NULL
 698 #endif
 699
/*
 * Operations for SOCK_STREAM unix sockets (selected in unix_create()).
 * The only table here with sendpage and splice_read handlers of its own.
 */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
 724
/*
 * Operations for SOCK_DGRAM unix sockets (selected in unix_create(), also
 * for SOCK_RAW).  accept and listen are wired to the sock_no_* stubs.
 */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
 748
/*
 * Operations for SOCK_SEQPACKET unix sockets (selected in unix_create()).
 * Shares connect/accept/listen with the stream table and poll with the
 * dgram table; sendmsg/recvmsg are seqpacket-specific.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
 772
/* Protocol descriptor passed to sk_alloc() in unix_create1(); obj_size makes each sock a struct unix_sock. */
static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};
 778
 779 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 780 {
 781         struct sock *sk = NULL;
 782         struct unix_sock *u;
 783
 784         atomic_long_inc(&unix_nr_socks);
 785         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 786                 goto out;
 787
 788         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
 789         if (!sk)
 790                 goto out;
 791
 792         sock_init_data(sock, sk);
 793
 794         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 795         sk->sk_write_space      = unix_write_space;
 796         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 797         sk->sk_destruct         = unix_sock_destructor;
 798         u         = unix_sk(sk);
 799         u->path.dentry = NULL;
 800         u->path.mnt = NULL;
 801         spin_lock_init(&u->lock);
 802         atomic_long_set(&u->inflight, 0);
 803         INIT_LIST_HEAD(&u->link);
 804         mutex_init(&u->iolock); /* single task reading lock */
 805         mutex_init(&u->bindlock); /* single task binding lock */
 806         init_waitqueue_head(&u->peer_wait);
 807         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 808         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
 809         unix_insert_socket(unix_sockets_unbound(sk), sk);
 810 out:
 811         if (sk == NULL)
 812                 atomic_long_dec(&unix_nr_socks);
 813         else {
 814                 local_bh_disable();
 815                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 816                 local_bh_enable();
 817         }
 818         return sk;
 819 }
 820
 821 static int unix_create(struct net *net, struct socket *sock, int protocol,
 822                        int kern)
 823 {
 824         if (protocol && protocol != PF_UNIX)
 825                 return -EPROTONOSUPPORT;
 826
 827         sock->state = SS_UNCONNECTED;
 828
 829         switch (sock->type) {
 830         case SOCK_STREAM:
 831                 sock->ops = &unix_stream_ops;
 832                 break;
 833                 /*
 834                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 835                  *      nothing uses it.
 836                  */
 837         case SOCK_RAW:
 838                 sock->type = SOCK_DGRAM;
 839                 fallthrough;
 840         case SOCK_DGRAM:
 841                 sock->ops = &unix_dgram_ops;
 842                 break;
 843         case SOCK_SEQPACKET:
 844                 sock->ops = &unix_seqpacket_ops;
 845                 break;
 846         default:
 847                 return -ESOCKTNOSUPPORT;
 848         }
 849
 850         return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
 851 }
 852
 853 static int unix_release(struct socket *sock)
 854 {
 855         struct sock *sk = sock->sk;
 856
 857         if (!sk)
 858                 return 0;
 859
 860         unix_release_sock(sk, 0);
 861         sock->sk = NULL;
 862
 863         return 0;
 864 }
 865
 866 static int unix_autobind(struct socket *sock)
 867 {
 868         struct sock *sk = sock->sk;
 869         struct net *net = sock_net(sk);
 870         struct unix_sock *u = unix_sk(sk);
 871         static u32 ordernum = 1;
 872         struct unix_address *addr;
 873         int err;
 874         unsigned int retries = 0;
 875
 876         err = mutex_lock_interruptible(&u->bindlock);
 877         if (err)
 878                 return err;
 879
 880         if (u->addr)
 881                 goto out;
 882
 883         err = -ENOMEM;
 884         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 885         if (!addr)
 886                 goto out;
 887
 888         addr->name->sun_family = AF_UNIX;
 889         refcount_set(&addr->refcnt, 1);
 890
 891 retry:
 892         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 893         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 894
 895         spin_lock(&unix_table_lock);
 896         ordernum = (ordernum+1)&0xFFFFF;
 897
 898         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 899                                       addr->hash)) {
 900                 spin_unlock(&unix_table_lock);
 901                 /*
 902                  * __unix_find_socket_byname() may take long time if many names
 903                  * are already in use.
 904                  */
 905                 cond_resched();
 906                 /* Give up if all names seems to be in use. */
 907                 if (retries++ == 0xFFFFF) {
 908                         err = -ENOSPC;
 909                         kfree(addr);
 910                         goto out;
 911                 }
 912                 goto retry;
 913         }
 914         addr->hash ^= sk->sk_type;
 915
 916         __unix_remove_socket(sk);
 917         smp_store_release(&u->addr, addr);
 918         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 919         spin_unlock(&unix_table_lock);
 920         err = 0;
 921
 922 out:    mutex_unlock(&u->bindlock);
 923         return err;
 924 }
 925
 926 static struct sock *unix_find_other(struct net *net,
 927                                     struct sockaddr_un *sunname, int len,
 928                                     int type, unsigned int hash, int *error)
 929 {
 930         struct sock *u;
 931         struct path path;
 932         int err = 0;
 933
 934         if (sunname->sun_path[0]) {
 935                 struct inode *inode;
 936                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 937                 if (err)
 938                         goto fail;
 939                 inode = d_backing_inode(path.dentry);
 940                 err = path_permission(&path, MAY_WRITE);
 941                 if (err)
 942                         goto put_fail;
 943
 944                 err = -ECONNREFUSED;
 945                 if (!S_ISSOCK(inode->i_mode))
 946                         goto put_fail;
 947                 u = unix_find_socket_byinode(inode);
 948                 if (!u)
 949                         goto put_fail;
 950
 951                 if (u->sk_type == type)
 952                         touch_atime(&path);
 953
 954                 path_put(&path);
 955
 956                 err = -EPROTOTYPE;
 957                 if (u->sk_type != type) {
 958                         sock_put(u);
 959                         goto fail;
 960                 }
 961         } else {
 962                 err = -ECONNREFUSED;
 963                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 964                 if (u) {
 965                         struct dentry *dentry;
 966                         dentry = unix_sk(u)->path.dentry;
 967                         if (dentry)
 968                                 touch_atime(&unix_sk(u)->path);
 969                 } else
 970                         goto fail;
 971         }
 972         return u;
 973
 974 put_fail:
 975         path_put(&path);
 976 fail:
 977         *error = err;
 978         return NULL;
 979 }
 980
 981 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 982 {
 983         struct dentry *dentry;
 984         struct path path;
 985         int err = 0;
 986         /*
 987          * Get the parent directory, calculate the hash for last
 988          * component.
 989          */
 990         dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 991         err = PTR_ERR(dentry);
 992         if (IS_ERR(dentry))
 993                 return err;
 994
 995         /*
 996          * All right, let's create it.
 997          */
 998         err = security_path_mknod(&path, dentry, mode, 0);
 999         if (!err) {
1000                 err = vfs_mknod(mnt_user_ns(path.mnt), d_inode(path.dentry),
1001                                 dentry, mode, 0);
1002                 if (!err) {
1003                         res->mnt = mntget(path.mnt);
1004                         res->dentry = dget(dentry);
1005                 }
1006         }
1007         done_path_create(&path, dentry);
1008         return err;
1009 }
1010
1011 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1012 {
1013         struct sock *sk = sock->sk;
1014         struct net *net = sock_net(sk);
1015         struct unix_sock *u = unix_sk(sk);
1016         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1017         char *sun_path = sunaddr->sun_path;
1018         int err;
1019         unsigned int hash;
1020         struct unix_address *addr;
1021         struct hlist_head *list;
1022         struct path path = { };
1023
1024         err = -EINVAL;
1025         if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
1026             sunaddr->sun_family != AF_UNIX)
1027                 goto out;
1028
1029         if (addr_len == sizeof(short)) {
1030                 err = unix_autobind(sock);
1031                 goto out;
1032         }
1033
1034         err = unix_mkname(sunaddr, addr_len, &hash);
1035         if (err < 0)
1036                 goto out;
1037         addr_len = err;
1038
1039         if (sun_path[0]) {
1040                 umode_t mode = S_IFSOCK |
1041                        (SOCK_INODE(sock)->i_mode & ~current_umask());
1042                 err = unix_mknod(sun_path, mode, &path);
1043                 if (err) {
1044                         if (err == -EEXIST)
1045                                 err = -EADDRINUSE;
1046                         goto out;
1047                 }
1048         }
1049
1050         err = mutex_lock_interruptible(&u->bindlock);
1051         if (err)
1052                 goto out_put;
1053
1054         err = -EINVAL;
1055         if (u->addr)
1056                 goto out_up;
1057
1058         err = -ENOMEM;
1059         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1060         if (!addr)
1061                 goto out_up;
1062
1063         memcpy(addr->name, sunaddr, addr_len);
1064         addr->len = addr_len;
1065         addr->hash = hash ^ sk->sk_type;
1066         refcount_set(&addr->refcnt, 1);
1067
1068         if (sun_path[0]) {
1069                 addr->hash = UNIX_HASH_SIZE;
1070                 hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1071                 spin_lock(&unix_table_lock);
1072                 u->path = path;
1073                 list = &unix_socket_table[hash];
1074         } else {
1075                 spin_lock(&unix_table_lock);
1076                 err = -EADDRINUSE;
1077                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
1078                                               sk->sk_type, hash)) {
1079                         unix_release_addr(addr);
1080                         goto out_unlock;
1081                 }
1082
1083                 list = &unix_socket_table[addr->hash];
1084         }
1085
1086         err = 0;
1087         __unix_remove_socket(sk);
1088         smp_store_release(&u->addr, addr);
1089         __unix_insert_socket(list, sk);
1090
1091 out_unlock:
1092         spin_unlock(&unix_table_lock);
1093 out_up:
1094         mutex_unlock(&u->bindlock);
1095 out_put:
1096         if (err)
1097                 path_put(&path);
1098 out:
1099         return err;
1100 }
1101
/*
 * Take the state locks of @sk1 and @sk2.  The lock of the lower-addressed
 * sock is always taken first, the other one with the nested variant.  If
 * @sk2 is NULL or the same sock as @sk1, only @sk1 is locked.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first = sk1;
	struct sock *second = sk2;

	if (!sk2 || unlikely(sk1 == sk2)) {
		unix_state_lock(sk1);
		return;
	}

	if (sk2 < sk1) {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1116
/*
 * Counterpart of unix_state_double_lock(): release the state lock of @sk1
 * and, when @sk2 is a different non-NULL sock, the state lock of @sk2.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (sk2 && likely(sk1 != sk2))
		unix_state_unlock(sk2);
}
1126
1127 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1128                               int alen, int flags)
1129 {
1130         struct sock *sk = sock->sk;
1131         struct net *net = sock_net(sk);
1132         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1133         struct sock *other;
1134         unsigned int hash;
1135         int err;
1136
1137         err = -EINVAL;
1138         if (alen < offsetofend(struct sockaddr, sa_family))
1139                 goto out;
1140
1141         if (addr->sa_family != AF_UNSPEC) {
1142                 err = unix_mkname(sunaddr, alen, &hash);
1143                 if (err < 0)
1144                         goto out;
1145                 alen = err;
1146
1147                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1148                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1149                         goto out;
1150
1151 restart:
1152                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1153                 if (!other)
1154                         goto out;
1155
1156                 unix_state_double_lock(sk, other);
1157
1158                 /* Apparently VFS overslept socket death. Retry. */
1159                 if (sock_flag(other, SOCK_DEAD)) {
1160                         unix_state_double_unlock(sk, other);
1161                         sock_put(other);
1162                         goto restart;
1163                 }
1164
1165                 err = -EPERM;
1166                 if (!unix_may_send(sk, other))
1167                         goto out_unlock;
1168
1169                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1170                 if (err)
1171                         goto out_unlock;
1172
1173         } else {
1174                 /*
1175                  *      1003.1g breaking connected state with AF_UNSPEC
1176                  */
1177                 other = NULL;
1178                 unix_state_double_lock(sk, other);
1179         }
1180
1181         /*
1182          * If it was connected, reconnect.
1183          */
1184         if (unix_peer(sk)) {
1185                 struct sock *old_peer = unix_peer(sk);
1186                 unix_peer(sk) = other;
1187                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1188
1189                 unix_state_double_unlock(sk, other);
1190
1191                 if (other != old_peer)
1192                         unix_dgram_disconnected(sk, old_peer);
1193                 sock_put(old_peer);
1194         } else {
1195                 unix_peer(sk) = other;
1196                 unix_state_double_unlock(sk, other);
1197         }
1198         return 0;
1199
1200 out_unlock:
1201         unix_state_double_unlock(sk, other);
1202         sock_put(other);
1203 out:
1204         return err;
1205 }
1206
1207 static long unix_wait_for_peer(struct sock *other, long timeo)
1208         __releases(&unix_sk(other)->lock)
1209 {
1210         struct unix_sock *u = unix_sk(other);
1211         int sched;
1212         DEFINE_WAIT(wait);
1213
1214         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1215
1216         sched = !sock_flag(other, SOCK_DEAD) &&
1217                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1218                 unix_recvq_full(other);
1219
1220         unix_state_unlock(other);
1221
1222         if (sched)
1223                 timeo = schedule_timeout(timeo);
1224
1225         finish_wait(&u->peer_wait, &wait);
1226         return timeo;
1227 }
1228
/*
 * connect() hook for connection-oriented sockets.
 *
 * A new sock (the future server-side endpoint) and a one-byte skb charged
 * to it are allocated before any lock is taken.  The listener named by
 * @uaddr is then looked up and locked; if its receive queue is full the
 * caller waits, subject to the send timeout, and starts over.  Once both
 * state locks are held the two endpoints are linked and the skb is queued
 * on the listener, where unix_accept() picks it up.
 *
 * Returns 0 on success or a negative errno: -ECONNREFUSED if the target is
 * not listening, -EAGAIN if the backlog is full and waiting is not allowed,
 * -EISCONN / -EINVAL if @sock is not in the closed state, -ENOMEM on
 * allocation failure, or an error from the helpers called here.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sock);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL, 0);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Drops the state lock of other. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.

	   Our own state lock is not held yet, so the state is sampled with
	   a single marked load.
	 */
	switch (READ_ONCE(sk->sk_state)) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	/* Only TCP_CLOSE got past the switch above; recheck under the lock. */
	if (sk->sk_state != TCP_CLOSE) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under unix_table_lock.  Insertion
	 * into the hash chain we'd found it in had been done
	 * in an earlier critical area protected by unix_table_lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1417
1418 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1419 {
1420         struct sock *ska = socka->sk, *skb = sockb->sk;
1421
1422         /* Join our sockets back to back */
1423         sock_hold(ska);
1424         sock_hold(skb);
1425         unix_peer(ska) = skb;
1426         unix_peer(skb) = ska;
1427         init_peercred(ska);
1428         init_peercred(skb);
1429
1430         if (ska->sk_type != SOCK_DGRAM) {
1431                 ska->sk_state = TCP_ESTABLISHED;
1432                 skb->sk_state = TCP_ESTABLISHED;
1433                 socka->state  = SS_CONNECTED;
1434                 sockb->state  = SS_CONNECTED;
1435         }
1436         return 0;
1437 }
1438
1439 static void unix_sock_inherit_flags(const struct socket *old,
1440                                     struct socket *new)
1441 {
1442         if (test_bit(SOCK_PASSCRED, &old->flags))
1443                 set_bit(SOCK_PASSCRED, &new->flags);
1444         if (test_bit(SOCK_PASSSEC, &old->flags))
1445                 set_bit(SOCK_PASSSEC, &new->flags);
1446 }
1447
1448 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1449                        bool kern)
1450 {
1451         struct sock *sk = sock->sk;
1452         struct sock *tsk;
1453         struct sk_buff *skb;
1454         int err;
1455
1456         err = -EOPNOTSUPP;
1457         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1458                 goto out;
1459
1460         err = -EINVAL;
1461         if (sk->sk_state != TCP_LISTEN)
1462                 goto out;
1463
1464         /* If socket state is TCP_LISTEN it cannot change (for now...),
1465          * so that no locks are necessary.
1466          */
1467
1468         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1469         if (!skb) {
1470                 /* This means receive shutdown. */
1471                 if (err == 0)
1472                         err = -EINVAL;
1473                 goto out;
1474         }
1475
1476         tsk = skb->sk;
1477         skb_free_datagram(sk, skb);
1478         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1479
1480         /* attach accepted sock to socket */
1481         unix_state_lock(tsk);
1482         newsock->state = SS_CONNECTED;
1483         unix_sock_inherit_flags(sock, newsock);
1484         sock_graft(tsk, newsock);
1485         unix_state_unlock(tsk);
1486         return 0;
1487
1488 out:
1489         return err;
1490 }
1491
1492
1493 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1494 {
1495         struct sock *sk = sock->sk;
1496         struct unix_address *addr;
1497         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1498         int err = 0;
1499
1500         if (peer) {
1501                 sk = unix_peer_get(sk);
1502
1503                 err = -ENOTCONN;
1504                 if (!sk)
1505                         goto out;
1506                 err = 0;
1507         } else {
1508                 sock_hold(sk);
1509         }
1510
1511         addr = smp_load_acquire(&unix_sk(sk)->addr);
1512         if (!addr) {
1513                 sunaddr->sun_family = AF_UNIX;
1514                 sunaddr->sun_path[0] = 0;
1515                 err = sizeof(short);
1516         } else {
1517                 err = addr->len;
1518                 memcpy(sunaddr, addr->name, addr->len);
1519         }
1520         sock_put(sk);
1521 out:
1522         return err;
1523 }
1524
1525 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1526 {
1527         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1528
1529         /*
1530          * Garbage collection of unix sockets starts by selecting a set of
1531          * candidate sockets which have reference only from being in flight
1532          * (total_refs == inflight_refs).  This condition is checked once during
1533          * the candidate collection phase, and candidates are marked as such, so
1534          * that non-candidates can later be ignored.  While inflight_refs is
1535          * protected by unix_gc_lock, total_refs (file count) is not, hence this
1536          * is an instantaneous decision.
1537          *
1538          * Once a candidate, however, the socket must not be reinstalled into a
1539          * file descriptor while the garbage collection is in progress.
1540          *
1541          * If the above conditions are met, then the directed graph of
1542          * candidates (*) does not change while unix_gc_lock is held.
1543          *
1544          * Any operations that changes the file count through file descriptors
1545          * (dup, close, sendmsg) does not change the graph since candidates are
1546          * not installed in fds.
1547          *
1548          * Dequeing a candidate via recvmsg would install it into an fd, but
1549          * that takes unix_gc_lock to decrement the inflight count, so it's
1550          * serialized with garbage collection.
1551          *
1552          * MSG_PEEK is special in that it does not change the inflight count,
1553          * yet does install the socket into an fd.  The following lock/unlock
1554          * pair is to ensure serialization with garbage collection.  It must be
1555          * done between incrementing the file count and installing the file into
1556          * an fd.
1557          *
1558          * If garbage collection starts after the barrier provided by the
1559          * lock/unlock, then it will see the elevated refcount and not mark this
1560          * as a candidate.  If a garbage collection is already in progress
1561          * before the file count was incremented, then the lock/unlock pair will
1562          * ensure that garbage collection is finished before progressing to
1563          * installing the fd.
1564          *
1565          * (*) A -> B where B is on the queue of A or B is on the queue of C
1566          * which is on the queue of listening socket A.
1567          */
1568         spin_lock(&unix_gc_lock);
1569         spin_unlock(&unix_gc_lock);
1570 }
1571
1572 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1573 {
1574         int err = 0;
1575
1576         UNIXCB(skb).pid  = get_pid(scm->pid);
1577         UNIXCB(skb).uid = scm->creds.uid;
1578         UNIXCB(skb).gid = scm->creds.gid;
1579         UNIXCB(skb).fp = NULL;
1580         unix_get_secdata(scm, skb);
1581         if (scm->fp && send_fds)
1582                 err = unix_attach_fds(scm, skb);
1583
1584         skb->destructor = unix_destruct_scm;
1585         return err;
1586 }
1587
1588 static bool unix_passcred_enabled(const struct socket *sock,
1589                                   const struct sock *other)
1590 {
1591         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1592                !other->sk_socket ||
1593                test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1594 }
1595
1596 /*
1597  * Some apps rely on write() giving SCM_CREDENTIALS
1598  * We include credentials if source or destination socket
1599  * asserted SOCK_PASSCRED.
1600  */
1601 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1602                             const struct sock *other)
1603 {
1604         if (UNIXCB(skb).pid)
1605                 return;
1606         if (unix_passcred_enabled(sock, other)) {
1607                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1608                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1609         }
1610 }
1611
1612 static int maybe_init_creds(struct scm_cookie *scm,
1613                             struct socket *socket,
1614                             const struct sock *other)
1615 {
1616         int err;
1617         struct msghdr msg = { .msg_controllen = 0 };
1618
1619         err = scm_send(socket, &msg, scm, false);
1620         if (err)
1621                 return err;
1622
1623         if (unix_passcred_enabled(socket, other)) {
1624                 scm->pid = get_pid(task_tgid(current));
1625                 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1626         }
1627         return err;
1628 }
1629
1630 static bool unix_skb_scm_eq(struct sk_buff *skb,
1631                             struct scm_cookie *scm)
1632 {
1633         const struct unix_skb_parms *u = &UNIXCB(skb);
1634
1635         return u->pid == scm->pid &&
1636                uid_eq(u->uid, scm->creds.uid) &&
1637                gid_eq(u->gid, scm->creds.gid) &&
1638                unix_secdata_eq(scm, skb);
1639 }
1640
1641 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1642 {
1643         struct scm_fp_list *fp = UNIXCB(skb).fp;
1644         struct unix_sock *u = unix_sk(sk);
1645
1646         if (unlikely(fp && fp->count))
1647                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1648 }
1649
1650 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1651 {
1652         struct scm_fp_list *fp = UNIXCB(skb).fp;
1653         struct unix_sock *u = unix_sk(sk);
1654
1655         if (unlikely(fp && fp->count))
1656                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1657 }
1658
1659 /*
1660  *      Send AF_UNIX data.
1661  */
1662
/*
 * unix_dgram_sendmsg - queue one datagram on the receiving AF_UNIX socket.
 *
 * The receiver is either looked up from the address in msg->msg_name
 * (unix_find_other()) or, when no address was supplied, is the connected
 * peer (unix_peer_get()).  The skb is allocated and filled from
 * msg->msg_iter before any socket state lock is taken.  This function is
 * also the backend of unix_seqpacket_sendmsg().
 *
 * Returns @len on success (and also when the receiver's socket filter
 * drops the packet), otherwise a negative errno.  Every exit path destroys
 * the scm cookie; the error paths drop the reference on "other" if one is
 * still held.
 */
1663 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1664                               size_t len)
1665 {
1666         struct sock *sk = sock->sk;
1667         struct net *net = sock_net(sk);
1668         struct unix_sock *u = unix_sk(sk);
1669         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1670         struct sock *other = NULL;
1671         int namelen = 0; /* fake GCC */
1672         int err;
1673         unsigned int hash;
1674         struct sk_buff *skb;
1675         long timeo;
1676         struct scm_cookie scm;
1677         int data_len = 0;
1678         int sk_locked;
1679
1680         wait_for_unix_gc();
1681         err = scm_send(sock, msg, &scm, false);
1682         if (err < 0)
1683                 return err;
1684
1685         err = -EOPNOTSUPP;
1686         if (msg->msg_flags&MSG_OOB)
1687                 goto out;
1688
             /* With an explicit destination only validate the address here;
              * the socket lookup happens at "restart".  Without one, the
              * connected peer is the destination and we take a reference on it.
              */
1689         if (msg->msg_namelen) {
1690                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1691                 if (err < 0)
1692                         goto out;
1693                 namelen = err;
1694         } else {
1695                 sunaddr = NULL;
1696                 err = -ENOTCONN;
1697                 other = unix_peer_get(sk);
1698                 if (!other)
1699                         goto out;
1700         }
1701
             /* A SOCK_PASSCRED sender without an address is autobound first. */
1702         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1703             && (err = unix_autobind(sock)) != 0)
1704                 goto out;
1705
1706         err = -EMSGSIZE;
1707         if (len > sk->sk_sndbuf - 32)
1708                 goto out;
1709
             /* For messages larger than SKB_MAX_ALLOC the excess, capped at
              * MAX_SKB_FRAGS pages and rounded up to whole pages, is requested
              * as paged data; the remainder goes into the linear area.
              */
1710         if (len > SKB_MAX_ALLOC) {
1711                 data_len = min_t(size_t,
1712                                  len - SKB_MAX_ALLOC,
1713                                  MAX_SKB_FRAGS * PAGE_SIZE);
1714                 data_len = PAGE_ALIGN(data_len);
1715
1716                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1717         }
1718
1719         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1720                                    msg->msg_flags & MSG_DONTWAIT, &err,
1721                                    PAGE_ALLOC_COSTLY_ORDER);
1722         if (skb == NULL)
1723                 goto out;
1724
1725         err = unix_scm_to_skb(&scm, skb, true);
1726         if (err < 0)
1727                 goto out_free;
1728
1729         skb_put(skb, len - data_len);
1730         skb->data_len = data_len;
1731         skb->len = len;
1732         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1733         if (err)
1734                 goto out_free;
1735
1736         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1737
             /* Entered with no lock held.  "other" is NULL when an address was
              * supplied and not yet resolved, or after the dead-receiver path
              * below dropped it.
              */
1738 restart:
1739         if (!other) {
1740                 err = -ECONNRESET;
1741                 if (sunaddr == NULL)
1742                         goto out_free;
1743
1744                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1745                                         hash, &err);
1746                 if (other == NULL)
1747                         goto out_free;
1748         }
1749
1750         if (sk_filter(other, skb) < 0) {
1751                 /* Toss the packet but do not return any error to the sender */
1752                 err = len;
1753                 goto out_free;
1754         }
1755
             /* sk_locked records whether sk's state lock is held in addition
              * to other's; out_unlock and the delivery path release it.
              */
1756         sk_locked = 0;
1757         unix_state_lock(other);
1758 restart_locked:
1759         err = -EPERM;
1760         if (!unix_may_send(sk, other))
1761                 goto out_unlock;
1762
             /* Receiver is dead: drop our reference.  If it was the connected
              * peer, disconnect and fail with -ECONNREFUSED; otherwise go back
              * to "restart" with other == NULL.  Both branches leave sk's state
              * lock released.
              */
1763         if (unlikely(sock_flag(other, SOCK_DEAD))) {
1764                 /*
1765                  *      Check with 1003.1g - what should
1766                  *      datagram error
1767                  */
1768                 unix_state_unlock(other);
1769                 sock_put(other);
1770
1771                 if (!sk_locked)
1772                         unix_state_lock(sk);
1773
1774                 err = 0;
1775                 if (unix_peer(sk) == other) {
1776                         unix_peer(sk) = NULL;
1777                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1778
1779                         unix_state_unlock(sk);
1780
1781                         unix_dgram_disconnected(sk, other);
1782                         sock_put(other);
1783                         err = -ECONNREFUSED;
1784                 } else {
1785                         unix_state_unlock(sk);
1786                 }
1787
1788                 other = NULL;
1789                 if (err)
1790                         goto out_free;
1791                 goto restart;
1792         }
1793
1794         err = -EPIPE;
1795         if (other->sk_shutdown & RCV_SHUTDOWN)
1796                 goto out_unlock;
1797
1798         if (sk->sk_type != SOCK_SEQPACKET) {
1799                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1800                 if (err)
1801                         goto out_unlock;
1802         }
1803
1804         /* other == sk && unix_peer(other) != sk if
1805          * - unix_peer(sk) == NULL, destination address bound to sk
1806          * - unix_peer(sk) == sk by time of get but disconnected before lock
1807          */
1808         if (other != sk &&
1809             unlikely(unix_peer(other) != sk &&
1810             unix_recvq_full_lockless(other))) {
                     /* Blocking send: wait on the receiver and start over.
                      * NOTE(review): unix_wait_for_peer() is defined outside
                      * this chunk; the paths after it (out_free / restart)
                      * never unlock other, so it is expected to return with
                      * other's state lock released - confirm.
                      */
1811                 if (timeo) {
1812                         timeo = unix_wait_for_peer(other, timeo);
1813
1814                         err = sock_intr_errno(timeo);
1815                         if (signal_pending(current))
1816                                 goto out_free;
1817
1818                         goto restart;
1819                 }
1820
                     /* Non-blocking send: the checks below need sk's state
                      * lock too, so swap the single lock for the double lock.
                      */
1821                 if (!sk_locked) {
1822                         unix_state_unlock(other);
1823                         unix_state_double_lock(sk, other);
1824                 }
1825
                     /* sk_locked is set before the jump so that out_unlock
                      * also releases sk's state lock.
                      */
1826                 if (unix_peer(sk) != other ||
1827                     unix_dgram_peer_wake_me(sk, other)) {
1828                         err = -EAGAIN;
1829                         sk_locked = 1;
1830                         goto out_unlock;
1831                 }
1832
                     /* other's lock was dropped while taking the double lock,
                      * so every check since restart_locked is repeated once
                      * with both locks held.
                      */
1833                 if (!sk_locked) {
1834                         sk_locked = 1;
1835                         goto restart_locked;
1836                 }
1837         }
1838
1839         if (unlikely(sk_locked))
1840                 unix_state_unlock(sk);
1841
             /* Deliver: queue the skb while other's state lock is still held,
              * then notify the receiver and drop our reference.
              */
1842         if (sock_flag(other, SOCK_RCVTSTAMP))
1843                 __net_timestamp(skb);
1844         maybe_add_creds(skb, sock, other);
1845         scm_stat_add(other, skb);
1846         skb_queue_tail(&other->sk_receive_queue, skb);
1847         unix_state_unlock(other);
1848         other->sk_data_ready(other);
1849         sock_put(other);
1850         scm_destroy(&scm);
1851         return len;
1852
1853 out_unlock:
1854         if (sk_locked)
1855                 unix_state_unlock(sk);
1856         unix_state_unlock(other);
1857 out_free:
1858         kfree_skb(skb);
1859 out:
1860         if (other)
1861                 sock_put(other);
1862         scm_destroy(&scm);
1863         return err;
1864 }
1865
1866 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1867  * bytes, and a minimum of a full page.
1868  */
/* The value is PAGE_SIZE shifted left by get_order(32768).
 * unix_stream_sendmsg() uses it as the byte cap on the paged part of each
 * skb and, through get_order(), as the order argument it passes to
 * sock_alloc_send_pskb().
 */
1869 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1870
1871 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1872                                size_t len)
1873 {
1874         struct sock *sk = sock->sk;
1875         struct sock *other = NULL;
1876         int err, size;
1877         struct sk_buff *skb;
1878         int sent = 0;
1879         struct scm_cookie scm;
1880         bool fds_sent = false;
1881         int data_len;
1882
1883         wait_for_unix_gc();
1884         err = scm_send(sock, msg, &scm, false);
1885         if (err < 0)
1886                 return err;
1887
1888         err = -EOPNOTSUPP;
1889         if (msg->msg_flags&MSG_OOB)
1890                 goto out_err;
1891
1892         if (msg->msg_namelen) {
1893                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1894                 goto out_err;
1895         } else {
1896                 err = -ENOTCONN;
1897                 other = unix_peer(sk);
1898                 if (!other)
1899                         goto out_err;
1900         }
1901
1902         if (sk->sk_shutdown & SEND_SHUTDOWN)
1903                 goto pipe_err;
1904
1905         while (sent < len) {
1906                 size = len - sent;
1907
1908                 /* Keep two messages in the pipe so it schedules better */
1909                 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1910
1911                 /* allow fallback to order-0 allocations */
1912                 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1913
1914                 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1915
1916                 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1917
1918                 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1919                                            msg->msg_flags & MSG_DONTWAIT, &err,
1920                                            get_order(UNIX_SKB_FRAGS_SZ));
1921                 if (!skb)
1922                         goto out_err;
1923
1924                 /* Only send the fds in the first buffer */
1925                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
1926                 if (err < 0) {
1927                         kfree_skb(skb);
1928                         goto out_err;
1929                 }
1930                 fds_sent = true;
1931
1932                 skb_put(skb, size - data_len);
1933                 skb->data_len = data_len;
1934                 skb->len = size;
1935                 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1936                 if (err) {
1937                         kfree_skb(skb);
1938                         goto out_err;
1939                 }
1940
1941                 unix_state_lock(other);
1942
1943                 if (sock_flag(other, SOCK_DEAD) ||
1944                     (other->sk_shutdown & RCV_SHUTDOWN))
1945                         goto pipe_err_free;
1946
1947                 maybe_add_creds(skb, sock, other);
1948                 scm_stat_add(other, skb);
1949                 skb_queue_tail(&other->sk_receive_queue, skb);
1950                 unix_state_unlock(other);
1951                 other->sk_data_ready(other);
1952                 sent += size;
1953         }
1954
1955         scm_destroy(&scm);
1956
1957         return sent;
1958
1959 pipe_err_free:
1960         unix_state_unlock(other);
1961         kfree_skb(skb);
1962 pipe_err:
1963         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1964                 send_sig(SIGPIPE, current, 0);
1965         err = -EPIPE;
1966 out_err:
1967         scm_destroy(&scm);
1968         return sent ? : err;
1969 }
1970
1971 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1972                                     int offset, size_t size, int flags)
1973 {
1974         int err;
1975         bool send_sigpipe = false;
1976         bool init_scm = true;
1977         struct scm_cookie scm;
1978         struct sock *other, *sk = socket->sk;
1979         struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1980
1981         if (flags & MSG_OOB)
1982                 return -EOPNOTSUPP;
1983
1984         other = unix_peer(sk);
1985         if (!other || sk->sk_state != TCP_ESTABLISHED)
1986                 return -ENOTCONN;
1987
1988         if (false) {
1989 alloc_skb:
1990                 unix_state_unlock(other);
1991                 mutex_unlock(&unix_sk(other)->iolock);
1992                 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1993                                               &err, 0);
1994                 if (!newskb)
1995                         goto err;
1996         }
1997
1998         /* we must acquire iolock as we modify already present
1999          * skbs in the sk_receive_queue and mess with skb->len
2000          */
2001         err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2002         if (err) {
2003                 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2004                 goto err;
2005         }
2006
2007         if (sk->sk_shutdown & SEND_SHUTDOWN) {
2008                 err = -EPIPE;
2009                 send_sigpipe = true;
2010                 goto err_unlock;
2011         }
2012
2013         unix_state_lock(other);
2014
2015         if (sock_flag(other, SOCK_DEAD) ||
2016             other->sk_shutdown & RCV_SHUTDOWN) {
2017                 err = -EPIPE;
2018                 send_sigpipe = true;
2019                 goto err_state_unlock;
2020         }
2021
2022         if (init_scm) {
2023                 err = maybe_init_creds(&scm, socket, other);
2024                 if (err)
2025                         goto err_state_unlock;
2026                 init_scm = false;
2027         }
2028
2029         skb = skb_peek_tail(&other->sk_receive_queue);
2030         if (tail && tail == skb) {
2031                 skb = newskb;
2032         } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2033                 if (newskb) {
2034                         skb = newskb;
2035                 } else {
2036                         tail = skb;
2037                         goto alloc_skb;
2038                 }
2039         } else if (newskb) {
2040                 /* this is fast path, we don't necessarily need to
2041                  * call to kfree_skb even though with newskb == NULL
2042                  * this - does no harm
2043                  */
2044                 consume_skb(newskb);
2045                 newskb = NULL;
2046         }
2047
2048         if (skb_append_pagefrags(skb, page, offset, size)) {
2049                 tail = skb;
2050                 goto alloc_skb;
2051         }
2052
2053         skb->len += size;
2054         skb->data_len += size;
2055         skb->truesize += size;
2056         refcount_add(size, &sk->sk_wmem_alloc);
2057
2058         if (newskb) {
2059                 err = unix_scm_to_skb(&scm, skb, false);
2060                 if (err)
2061                         goto err_state_unlock;
2062                 spin_lock(&other->sk_receive_queue.lock);
2063                 __skb_queue_tail(&other->sk_receive_queue, newskb);
2064                 spin_unlock(&other->sk_receive_queue.lock);
2065         }
2066
2067         unix_state_unlock(other);
2068         mutex_unlock(&unix_sk(other)->iolock);
2069
2070         other->sk_data_ready(other);
2071         scm_destroy(&scm);
2072         return size;
2073
2074 err_state_unlock:
2075         unix_state_unlock(other);
2076 err_unlock:
2077         mutex_unlock(&unix_sk(other)->iolock);
2078 err:
2079         kfree_skb(newskb);
2080         if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2081                 send_sig(SIGPIPE, current, 0);
2082         if (!init_scm)
2083                 scm_destroy(&scm);
2084         return err;
2085 }
2086
2087 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2088                                   size_t len)
2089 {
2090         int err;
2091         struct sock *sk = sock->sk;
2092
2093         err = sock_error(sk);
2094         if (err)
2095                 return err;
2096
2097         if (sk->sk_state != TCP_ESTABLISHED)
2098                 return -ENOTCONN;
2099
2100         if (msg->msg_namelen)
2101                 msg->msg_namelen = 0;
2102
2103         return unix_dgram_sendmsg(sock, msg, len);
2104 }
2105
2106 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2107                                   size_t size, int flags)
2108 {
2109         struct sock *sk = sock->sk;
2110
2111         if (sk->sk_state != TCP_ESTABLISHED)
2112                 return -ENOTCONN;
2113
2114         return unix_dgram_recvmsg(sock, msg, size, flags);
2115 }
2116
2117 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2118 {
2119         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2120
2121         if (addr) {
2122                 msg->msg_namelen = addr->len;
2123                 memcpy(msg->msg_name, addr->name, addr->len);
2124         }
2125 }
2126
2127 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2128                               size_t size, int flags)
2129 {
2130         struct scm_cookie scm;
2131         struct sock *sk = sock->sk;
2132         struct unix_sock *u = unix_sk(sk);
2133         struct sk_buff *skb, *last;
2134         long timeo;
2135         int skip;
2136         int err;
2137
2138         err = -EOPNOTSUPP;
2139         if (flags&MSG_OOB)
2140                 goto out;
2141
2142         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2143
2144         do {
2145                 mutex_lock(&u->iolock);
2146
2147                 skip = sk_peek_offset(sk, flags);
2148                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2149                                               &skip, &err, &last);
2150                 if (skb) {
2151                         if (!(flags & MSG_PEEK))
2152                                 scm_stat_del(sk, skb);
2153                         break;
2154                 }
2155
2156                 mutex_unlock(&u->iolock);
2157
2158                 if (err != -EAGAIN)
2159                         break;
2160         } while (timeo &&
2161                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2162                                               &err, &timeo, last));
2163
2164         if (!skb) { /* implies iolock unlocked */
2165                 unix_state_lock(sk);
2166                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2167                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2168                     (sk->sk_shutdown & RCV_SHUTDOWN))
2169                         err = 0;
2170                 unix_state_unlock(sk);
2171                 goto out;
2172         }
2173
2174         if (wq_has_sleeper(&u->peer_wait))
2175                 wake_up_interruptible_sync_poll(&u->peer_wait,
2176                                                 EPOLLOUT | EPOLLWRNORM |
2177                                                 EPOLLWRBAND);
2178
2179         if (msg->msg_name)
2180                 unix_copy_addr(msg, skb->sk);
2181
2182         if (size > skb->len - skip)
2183                 size = skb->len - skip;
2184         else if (size < skb->len - skip)
2185                 msg->msg_flags |= MSG_TRUNC;
2186
2187         err = skb_copy_datagram_msg(skb, skip, msg, size);
2188         if (err)
2189                 goto out_free;
2190
2191         if (sock_flag(sk, SOCK_RCVTSTAMP))
2192                 __sock_recv_timestamp(msg, sk, skb);
2193
2194         memset(&scm, 0, sizeof(scm));
2195
2196         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2197         unix_set_secdata(&scm, skb);
2198
2199         if (!(flags & MSG_PEEK)) {
2200                 if (UNIXCB(skb).fp)
2201                         unix_detach_fds(&scm, skb);
2202
2203                 sk_peek_offset_bwd(sk, skb->len);
2204         } else {
2205                 /* It is questionable: on PEEK we could:
2206                    - do not return fds - good, but too simple 8)
2207                    - return fds, and do not return them on read (old strategy,
2208                      apparently wrong)
2209                    - clone fds (I chose it for now, it is the most universal
2210                      solution)
2211
2212                    POSIX 1003.1g does not actually define this clearly
2213                    at all. POSIX 1003.1g doesn't define a lot of things
2214                    clearly however!
2215
2216                 */
2217
2218                 sk_peek_offset_fwd(sk, size);
2219
2220                 if (UNIXCB(skb).fp)
2221                         unix_peek_fds(&scm, skb);
2222         }
2223         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2224
2225         scm_recv(sock, msg, &scm, flags);
2226
2227 out_free:
2228         skb_free_datagram(sk, skb);
2229         mutex_unlock(&u->iolock);
2230 out:
2231         return err;
2232 }
2233
2234 /*
2235  *      Sleep until more data has arrived. But check for races..
2236  */
2237 static long unix_stream_data_wait(struct sock *sk, long timeo,
2238                                   struct sk_buff *last, unsigned int last_len,
2239                                   bool freezable)
2240 {
2241         struct sk_buff *tail;
2242         DEFINE_WAIT(wait);
2243
2244         unix_state_lock(sk);
2245
2246         for (;;) {
2247                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2248
2249                 tail = skb_peek_tail(&sk->sk_receive_queue);
2250                 if (tail != last ||
2251                     (tail && tail->len != last_len) ||
2252                     sk->sk_err ||
2253                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2254                     signal_pending(current) ||
2255                     !timeo)
2256                         break;
2257
2258                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2259                 unix_state_unlock(sk);
2260                 if (freezable)
2261                         timeo = freezable_schedule_timeout(timeo);
2262                 else
2263                         timeo = schedule_timeout(timeo);
2264                 unix_state_lock(sk);
2265
2266                 if (sock_flag(sk, SOCK_DEAD))
2267                         break;
2268
2269                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2270         }
2271
2272         finish_wait(sk_sleep(sk), &wait);
2273         unix_state_unlock(sk);
2274         return timeo;
2275 }
2276
2277 static unsigned int unix_skb_len(const struct sk_buff *skb)
2278 {
2279         return skb->len - UNIXCB(skb).consumed;
2280 }
2281
/*
 * Description of one stream read request, shared by the recvmsg and
 * splice entry points and consumed by unix_stream_read_generic().
 */
2282 struct unix_stream_read_state {
             /* Called as recv_actor(skb, skip, chunk, state) to move up to
              * "chunk" bytes out of skb; returns the number of bytes moved
              * or a negative value on failure.
              */
2283         int (*recv_actor)(struct sk_buff *, int, int,
2284                           struct unix_stream_read_state *);
             /* Socket being read. */
2285         struct socket *socket;
             /* Destination message; left NULL by the splice entry point. */
2286         struct msghdr *msg;
             /* Destination pipe; only set by the splice entry point. */
2287         struct pipe_inode_info *pipe;
             /* Number of bytes requested. */
2288         size_t size;
             /* MSG_* receive flags. */
2289         int flags;
             /* Flags handed to skb_splice_bits() by the splice actor. */
2290         unsigned int splice_flags;
2291 };
2292
/*
 * unix_stream_read_generic - common receive loop for stream sockets.
 * @state:     request description (socket, flags, byte count, and the
 *             recv_actor that actually moves the data)
 * @freezable: passed to unix_stream_data_wait() to select the freezable
 *             sleep variant
 *
 * Walks sk_receive_queue under u->iolock and hands each skb, past the
 * current peek offset, to state->recv_actor.  The loop ends when
 * state->size bytes were moved, the queue runs dry with the low-water
 * target met, an error/shutdown/timeout occurs, or a message boundary that
 * must not be merged is hit (different sender credentials, passed file
 * descriptors).
 *
 * Returns "copied" when it is non-zero (a byte count, or -EFAULT if the
 * actor failed before anything was moved); otherwise err, which is 0 or a
 * negative errno.
 */
2293 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2294                                     bool freezable)
2295 {
2296         struct scm_cookie scm;
2297         struct socket *sock = state->socket;
2298         struct sock *sk = sock->sk;
2299         struct unix_sock *u = unix_sk(sk);
2300         int copied = 0;
2301         int flags = state->flags;
2302         int noblock = flags & MSG_DONTWAIT;
2303         bool check_creds = false;
2304         int target;
2305         int err = 0;
2306         long timeo;
2307         int skip;
2308         size_t size = state->size;
2309         unsigned int last_len;
2310
2311         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2312                 err = -EINVAL;
2313                 goto out;
2314         }
2315
2316         if (unlikely(flags & MSG_OOB)) {
2317                 err = -EOPNOTSUPP;
2318                 goto out;
2319         }
2320
             /* target: bytes that must be collected before an empty queue
              * is allowed to end the read.
              */
2321         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2322         timeo = sock_rcvtimeo(sk, noblock);
2323
2324         memset(&scm, 0, sizeof(scm));
2325
2326         /* Lock the socket to prevent queue disordering
2327          * while sleeps in memcpy_tomsg
2328          */
2329         mutex_lock(&u->iolock);
2330
             /* Bytes at the head of the queue to step over (peek offset). */
2331         skip = max(sk_peek_offset(sk, flags), 0);
2332
2333         do {
2334                 int chunk;
2335                 bool drop_skb;
2336                 struct sk_buff *skb, *last;
2337
                     /* redo: start from the queue head with the state lock
                      * taken; last/last_len snapshot the queue for
                      * unix_stream_data_wait().
                      */
2338 redo:
2339                 unix_state_lock(sk);
2340                 if (sock_flag(sk, SOCK_DEAD)) {
2341                         err = -ECONNRESET;
2342                         goto unlock;
2343                 }
2344                 last = skb = skb_peek(&sk->sk_receive_queue);
2345                 last_len = last ? last->len : 0;
                     /* again: entered with the state lock held; skb is the
                      * next candidate, or NULL once the queue is exhausted.
                      */
2346 again:
2347                 if (skb == NULL) {
2348                         if (copied >= target)
2349                                 goto unlock;
2350
2351                         /*
2352                          *      POSIX 1003.1g mandates this order.
2353                          */
2354
2355                         err = sock_error(sk);
2356                         if (err)
2357                                 goto unlock;
2358                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2359                                 goto unlock;
2360
2361                         unix_state_unlock(sk);
                             /* Non-blocking or timed out: leave the loop;
                              * iolock is released after the loop.
                              */
2362                         if (!timeo) {
2363                                 err = -EAGAIN;
2364                                 break;
2365                         }
2366
                             /* Sleep without iolock so other readers and
                              * sendpage can make progress.
                              */
2367                         mutex_unlock(&u->iolock);
2368
2369                         timeo = unix_stream_data_wait(sk, timeo, last,
2370                                                       last_len, freezable);
2371
                             /* iolock is not held here, so exit via "out";
                              * that skips the scm handling after the loop,
                              * so the cookie is destroyed here.
                              */
2372                         if (signal_pending(current)) {
2373                                 err = sock_intr_errno(timeo);
2374                                 scm_destroy(&scm);
2375                                 goto out;
2376                         }
2377
2378                         mutex_lock(&u->iolock);
2379                         goto redo;
                             /* Common exit for the paths above that still
                              * hold the state lock.
                              */
2380 unlock:
2381                         unix_state_unlock(sk);
2382                         break;
2383                 }
2384
                     /* Step over skbs that lie entirely before the peek
                      * offset.
                      */
2385                 while (skip >= unix_skb_len(skb)) {
2386                         skip -= unix_skb_len(skb);
2387                         last = skb;
2388                         last_len = skb->len;
2389                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2390                         if (!skb)
2391                                 goto again;
2392                 }
2393
2394                 unix_state_unlock(sk);
2395
2396                 if (check_creds) {
2397                         /* Never glue messages from different writers */
2398                         if (!unix_skb_scm_eq(skb, &scm))
2399                                 break;
2400                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2401                         /* Copy credentials */
2402                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2403                         unix_set_secdata(&scm, skb);
2404                         check_creds = true;
2405                 }
2406
2407                 /* Copy address just once */
                     /* NOTE(review): sunaddr is local to this block, so
                      * setting it to NULL does not prevent unix_copy_addr()
                      * from running again for the next skb handled in this
                      * call.
                      */
2408                 if (state->msg && state->msg->msg_name) {
2409                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2410                                          state->msg->msg_name);
2411                         unix_copy_addr(state->msg, skb->sk);
2412                         sunaddr = NULL;
2413                 }
2414
                     /* Hold an extra reference across the actor call; the
                      * state lock is not held while it runs.
                      */
2415                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2416                 skb_get(skb);
2417                 chunk = state->recv_actor(skb, skip, chunk, state);
2418                 drop_skb = !unix_skb_len(skb);
2419                 /* skb is only safe to use if !drop_skb */
2420                 consume_skb(skb);
2421                 if (chunk < 0) {
2422                         if (copied == 0)
2423                                 copied = -EFAULT;
2424                         break;
2425                 }
2426                 copied += chunk;
2427                 size -= chunk;
2428
2429                 if (drop_skb) {
2430                         /* the skb was touched by a concurrent reader;
2431                          * we should not expect anything from this skb
2432                          * anymore and assume it invalid - we can be
2433                          * sure it was dropped from the socket queue
2434                          *
2435                          * let's report a short read
2436                          */
2437                         err = 0;
2438                         break;
2439                 }
2440
2441                 /* Mark read part of skb as used */
2442                 if (!(flags & MSG_PEEK)) {
2443                         UNIXCB(skb).consumed += chunk;
2444
2445                         sk_peek_offset_bwd(sk, chunk);
2446
2447                         if (UNIXCB(skb).fp) {
2448                                 scm_stat_del(sk, skb);
2449                                 unix_detach_fds(&scm, skb);
2450                         }
2451
                             /* Data left in this skb: stop here, the skb
                              * stays queued.
                              */
2452                         if (unix_skb_len(skb))
2453                                 break;
2454
2455                         skb_unlink(skb, &sk->sk_receive_queue);
2456                         consume_skb(skb);
2457
                             /* Stop after an skb that carried descriptors. */
2458                         if (scm.fp)
2459                                 break;
2460                 } else {
2461                         /* It is questionable, see note in unix_dgram_recvmsg.
2462                          */
2463                         if (UNIXCB(skb).fp)
2464                                 unix_peek_fds(&scm, skb);
2465
2466                         sk_peek_offset_fwd(sk, chunk);
2467
2468                         if (UNIXCB(skb).fp)
2469                                 break;
2470
                             /* Peeking: move on to the following skb
                              * without consuming this one.
                              */
2471                         skip = 0;
2472                         last = skb;
2473                         last_len = skb->len;
2474                         unix_state_lock(sk);
2475                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2476                         if (skb)
2477                                 goto again;
2478                         unix_state_unlock(sk);
2479                         break;
2480                 }
2481         } while (size);
2482
2483         mutex_unlock(&u->iolock);
             /* Deliver the collected control data to the message, or drop
              * it when there is no message to deliver to.
              */
2484         if (state->msg)
2485                 scm_recv(sock, state->msg, &scm, flags);
2486         else
2487                 scm_destroy(&scm);
2488 out:
2489         return copied ? : err;
2490 }
2491
2492 static int unix_stream_read_actor(struct sk_buff *skb,
2493                                   int skip, int chunk,
2494                                   struct unix_stream_read_state *state)
2495 {
2496         int ret;
2497
2498         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2499                                     state->msg, chunk);
2500         return ret ?: chunk;
2501 }
2502
2503 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2504                                size_t size, int flags)
2505 {
2506         struct unix_stream_read_state state = {
2507                 .recv_actor = unix_stream_read_actor,
2508                 .socket = sock,
2509                 .msg = msg,
2510                 .size = size,
2511                 .flags = flags
2512         };
2513
2514         return unix_stream_read_generic(&state, true);
2515 }
2516
2517 static int unix_stream_splice_actor(struct sk_buff *skb,
2518                                     int skip, int chunk,
2519                                     struct unix_stream_read_state *state)
2520 {
2521         return skb_splice_bits(skb, state->socket->sk,
2522                                UNIXCB(skb).consumed + skip,
2523                                state->pipe, chunk, state->splice_flags);
2524 }
2525
2526 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2527                                        struct pipe_inode_info *pipe,
2528                                        size_t size, unsigned int flags)
2529 {
2530         struct unix_stream_read_state state = {
2531                 .recv_actor = unix_stream_splice_actor,
2532                 .socket = sock,
2533                 .pipe = pipe,
2534                 .size = size,
2535                 .splice_flags = flags,
2536         };
2537
2538         if (unlikely(*ppos))
2539                 return -ESPIPE;
2540
2541         if (sock->file->f_flags & O_NONBLOCK ||
2542             flags & SPLICE_F_NONBLOCK)
2543                 state.flags = MSG_DONTWAIT;
2544
2545         return unix_stream_read_generic(&state, false);
2546 }
2547
2548 static int unix_shutdown(struct socket *sock, int mode)
2549 {
2550         struct sock *sk = sock->sk;
2551         struct sock *other;
2552
2553         if (mode < SHUT_RD || mode > SHUT_RDWR)
2554                 return -EINVAL;
2555         /* This maps:
2556          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2557          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2558          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2559          */
2560         ++mode;
2561
2562         unix_state_lock(sk);
2563         sk->sk_shutdown |= mode;
2564         other = unix_peer(sk);
2565         if (other)
2566                 sock_hold(other);
2567         unix_state_unlock(sk);
2568         sk->sk_state_change(sk);
2569
2570         if (other &&
2571                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2572
2573                 int peer_mode = 0;
2574
2575                 if (mode&RCV_SHUTDOWN)
2576                         peer_mode |= SEND_SHUTDOWN;
2577                 if (mode&SEND_SHUTDOWN)
2578                         peer_mode |= RCV_SHUTDOWN;
2579                 unix_state_lock(other);
2580                 other->sk_shutdown |= peer_mode;
2581                 unix_state_unlock(other);
2582                 other->sk_state_change(other);
2583                 if (peer_mode == SHUTDOWN_MASK)
2584                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2585                 else if (peer_mode & RCV_SHUTDOWN)
2586                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2587         }
2588         if (other)
2589                 sock_put(other);
2590
2591         return 0;
2592 }
2593
2594 long unix_inq_len(struct sock *sk)
2595 {
2596         struct sk_buff *skb;
2597         long amount = 0;
2598
2599         if (sk->sk_state == TCP_LISTEN)
2600                 return -EINVAL;
2601
2602         spin_lock(&sk->sk_receive_queue.lock);
2603         if (sk->sk_type == SOCK_STREAM ||
2604             sk->sk_type == SOCK_SEQPACKET) {
2605                 skb_queue_walk(&sk->sk_receive_queue, skb)
2606                         amount += unix_skb_len(skb);
2607         } else {
2608                 skb = skb_peek(&sk->sk_receive_queue);
2609                 if (skb)
2610                         amount = skb->len;
2611         }
2612         spin_unlock(&sk->sk_receive_queue.lock);
2613
2614         return amount;
2615 }
2616 EXPORT_SYMBOL_GPL(unix_inq_len);
2617
/*
 * unix_outq_len - report the value of sk_wmem_alloc_get() for @sk, widened
 * to long.
 */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2623
2624 static int unix_open_file(struct sock *sk)
2625 {
2626         struct path path;
2627         struct file *f;
2628         int fd;
2629
2630         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2631                 return -EPERM;
2632
2633         if (!smp_load_acquire(&unix_sk(sk)->addr))
2634                 return -ENOENT;
2635
2636         path = unix_sk(sk)->path;
2637         if (!path.dentry)
2638                 return -ENOENT;
2639
2640         path_get(&path);
2641
2642         fd = get_unused_fd_flags(O_CLOEXEC);
2643         if (fd < 0)
2644                 goto out;
2645
2646         f = dentry_open(&path, O_PATH, current_cred());
2647         if (IS_ERR(f)) {
2648                 put_unused_fd(fd);
2649                 fd = PTR_ERR(f);
2650                 goto out;
2651         }
2652
2653         fd_install(fd, f);
2654 out:
2655         path_put(&path);
2656
2657         return fd;
2658 }
2659
2660 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2661 {
2662         struct sock *sk = sock->sk;
2663         long amount = 0;
2664         int err;
2665
2666         switch (cmd) {
2667         case SIOCOUTQ:
2668                 amount = unix_outq_len(sk);
2669                 err = put_user(amount, (int __user *)arg);
2670                 break;
2671         case SIOCINQ:
2672                 amount = unix_inq_len(sk);
2673                 if (amount < 0)
2674                         err = amount;
2675                 else
2676                         err = put_user(amount, (int __user *)arg);
2677                 break;
2678         case SIOCUNIXFILE:
2679                 err = unix_open_file(sk);
2680                 break;
2681         default:
2682                 err = -ENOIOCTLCMD;
2683                 break;
2684         }
2685         return err;
2686 }
2687
2688 #ifdef CONFIG_COMPAT
2689 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2690 {
2691         return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
2692 }
2693 #endif
2694
2695 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2696 {
2697         struct sock *sk = sock->sk;
2698         __poll_t mask;
2699
2700         sock_poll_wait(file, sock, wait);
2701         mask = 0;
2702
2703         /* exceptional events? */
2704         if (sk->sk_err)
2705                 mask |= EPOLLERR;
2706         if (sk->sk_shutdown == SHUTDOWN_MASK)
2707                 mask |= EPOLLHUP;
2708         if (sk->sk_shutdown & RCV_SHUTDOWN)
2709                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2710
2711         /* readable? */
2712         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2713                 mask |= EPOLLIN | EPOLLRDNORM;
2714
2715         /* Connection-based need to check for termination and startup */
2716         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2717             sk->sk_state == TCP_CLOSE)
2718                 mask |= EPOLLHUP;
2719
2720         /*
2721          * we set writable also when the other side has shut down the
2722          * connection. This prevents stuck sockets.
2723          */
2724         if (unix_writable(sk))
2725                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2726
2727         return mask;
2728 }
2729
2730 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
2731                                     poll_table *wait)
2732 {
2733         struct sock *sk = sock->sk, *other;
2734         unsigned int writable;
2735         __poll_t mask;
2736
2737         sock_poll_wait(file, sock, wait);
2738         mask = 0;
2739
2740         /* exceptional events? */
2741         if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
2742                 mask |= EPOLLERR |
2743                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
2744
2745         if (sk->sk_shutdown & RCV_SHUTDOWN)
2746                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2747         if (sk->sk_shutdown == SHUTDOWN_MASK)
2748                 mask |= EPOLLHUP;
2749
2750         /* readable? */
2751         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2752                 mask |= EPOLLIN | EPOLLRDNORM;
2753
2754         /* Connection-based need to check for termination and startup */
2755         if (sk->sk_type == SOCK_SEQPACKET) {
2756                 if (sk->sk_state == TCP_CLOSE)
2757                         mask |= EPOLLHUP;
2758                 /* connection hasn't started yet? */
2759                 if (sk->sk_state == TCP_SYN_SENT)
2760                         return mask;
2761         }
2762
2763         /* No write status requested, avoid expensive OUT tests. */
2764         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
2765                 return mask;
2766
2767         writable = unix_writable(sk);
2768         if (writable) {
2769                 unix_state_lock(sk);
2770
2771                 other = unix_peer(sk);
2772                 if (other && unix_peer(other) != sk &&
2773                     unix_recvq_full(other) &&
2774                     unix_dgram_peer_wake_me(sk, other))
2775                         writable = 0;
2776
2777                 unix_state_unlock(sk);
2778         }
2779
2780         if (writable)
2781                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2782         else
2783                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2784
2785         return mask;
2786 }
2787
2788 #ifdef CONFIG_PROC_FS
2789
/*
 * A seq_file position packs two values into one long: the hash bucket
 * index lives in the bits above BUCKET_SPACE and the offset inside that
 * bucket lives in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index from a packed position. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Extract the in-bucket offset from a packed position. */
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
/* Build a packed position from bucket @b and offset @o. */
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
2795
2796 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2797 {
2798         unsigned long offset = get_offset(*pos);
2799         unsigned long bucket = get_bucket(*pos);
2800         struct sock *sk;
2801         unsigned long count = 0;
2802
2803         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2804                 if (sock_net(sk) != seq_file_net(seq))
2805                         continue;
2806                 if (++count == offset)
2807                         break;
2808         }
2809
2810         return sk;
2811 }
2812
2813 static struct sock *unix_next_socket(struct seq_file *seq,
2814                                      struct sock *sk,
2815                                      loff_t *pos)
2816 {
2817         unsigned long bucket;
2818
2819         while (sk > (struct sock *)SEQ_START_TOKEN) {
2820                 sk = sk_next(sk);
2821                 if (!sk)
2822                         goto next_bucket;
2823                 if (sock_net(sk) == seq_file_net(seq))
2824                         return sk;
2825         }
2826
2827         do {
2828                 sk = unix_from_bucket(seq, pos);
2829                 if (sk)
2830                         return sk;
2831
2832 next_bucket:
2833                 bucket = get_bucket(*pos) + 1;
2834                 *pos = set_bucket_offset(bucket, 1);
2835         } while (bucket < ARRAY_SIZE(unix_socket_table));
2836
2837         return NULL;
2838 }
2839
2840 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2841         __acquires(unix_table_lock)
2842 {
2843         spin_lock(&unix_table_lock);
2844
2845         if (!*pos)
2846                 return SEQ_START_TOKEN;
2847
2848         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2849                 return NULL;
2850
2851         return unix_next_socket(seq, NULL, pos);
2852 }
2853
2854 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2855 {
2856         ++*pos;
2857         return unix_next_socket(seq, v, pos);
2858 }
2859
/* seq_file ->stop: drop unix_table_lock, which unix_seq_start() acquired. */
static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}
2865
2866 static int unix_seq_show(struct seq_file *seq, void *v)
2867 {
2868
2869         if (v == SEQ_START_TOKEN)
2870                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2871                          "Inode Path\n");
2872         else {
2873                 struct sock *s = v;
2874                 struct unix_sock *u = unix_sk(s);
2875                 unix_state_lock(s);
2876
2877                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2878                         s,
2879                         refcount_read(&s->sk_refcnt),
2880                         0,
2881                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2882                         s->sk_type,
2883                         s->sk_socket ?
2884                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2885                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2886                         sock_i_ino(s));
2887
2888                 if (u->addr) {  // under unix_table_lock here
2889                         int i, len;
2890                         seq_putc(seq, ' ');
2891
2892                         i = 0;
2893                         len = u->addr->len - sizeof(short);
2894                         if (!UNIX_ABSTRACT(s))
2895                                 len--;
2896                         else {
2897                                 seq_putc(seq, '@');
2898                                 i++;
2899                         }
2900                         for ( ; i < len; i++)
2901                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
2902                                          '@');
2903                 }
2904                 unix_state_unlock(s);
2905                 seq_putc(seq, '\n');
2906         }
2907
2908         return 0;
2909 }
2910
2911 static const struct seq_operations unix_seq_ops = {
2912         .start  = unix_seq_start,
2913         .next   = unix_seq_next,
2914         .stop   = unix_seq_stop,
2915         .show   = unix_seq_show,
2916 };
2917 #endif
2918
2919 static const struct net_proto_family unix_family_ops = {
2920         .family = PF_UNIX,
2921         .create = unix_create,
2922         .owner  = THIS_MODULE,
2923 };
2924
2925
2926 static int __net_init unix_net_init(struct net *net)
2927 {
2928         int error = -ENOMEM;
2929
2930         net->unx.sysctl_max_dgram_qlen = 10;
2931         if (unix_sysctl_register(net))
2932                 goto out;
2933
2934 #ifdef CONFIG_PROC_FS
2935         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
2936                         sizeof(struct seq_net_private))) {
2937                 unix_sysctl_unregister(net);
2938                 goto out;
2939         }
2940 #endif
2941         error = 0;
2942 out:
2943         return error;
2944 }
2945
/*
 * Per-network-namespace teardown: unregister the sysctl table and remove
 * the "unix" proc entry from the namespace's proc_net directory.
 */
static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
2951
2952 static struct pernet_operations unix_net_ops = {
2953         .init = unix_net_init,
2954         .exit = unix_net_exit,
2955 };
2956
2957 static int __init af_unix_init(void)
2958 {
2959         int rc = -1;
2960
2961         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
2962
2963         rc = proto_register(&unix_proto, 1);
2964         if (rc != 0) {
2965                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2966                 goto out;
2967         }
2968
2969         sock_register(&unix_family_ops);
2970         register_pernet_subsys(&unix_net_ops);
2971 out:
2972         return rc;
2973 }
2974
/*
 * Module unload: unregister the PF_UNIX socket family, the protocol and
 * the per-namespace operations.
 */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
2981
/*
 * Run earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries to use a
 * UNIX socket, but later than subsys_initcall() because we depend on
 * state initialised there.
 */
2986 fs_initcall(af_unix_init);
2987 module_exit(af_unix_exit);
2988
2989 MODULE_LICENSE("GPL");
2990 MODULE_ALIAS_NETPROTO(PF_UNIX);