// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					BSD.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko EiBfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland:	Elegant non-blocking accept/connect algorithm.
 *		Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amounts
 *					of sockets hashed (this is for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov:	Full scale SMP. Lots of bugs are introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug??).
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */

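/* Illustrative userspace sketch (not part of this file's build; the abstract
 * name "example" below is made up): binding to an abstract address as
 * described above.  The name starts with a NUL byte and its length is passed
 * explicitly through addrlen, so it may contain embedded zeros and is never
 * NUL terminated.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	socklen_t len = offsetof(struct sockaddr_un, sun_path) + 1 + 7;
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	sun.sun_path[0] = '\0';
 *	memcpy(sun.sun_path + 1, "example", 7);
 *	bind(fd, (struct sockaddr *)&sun, len);
 *
 * getsockname() reports the same length back, and the name disappears
 * automatically once the last socket bound to it is closed.
 */
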
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bpf-cgroup.h>
#include <linux/btf_ids.h>
#include <linux/dcache.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/filter.h>
#include <linux/fs_struct.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/net.h>
#include <linux/pidfs.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/sched/signal.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/splice.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <net/af_unix.h>
#include <net/net_namespace.h>
#include <net/tcp_states.h>
#include <uapi/linux/sockios.h>
#include <uapi/linux/termios.h>

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 * hash table is protected with spinlock.
 * each socket state is protected by separate spinlock.
 */

#ifdef CONFIG_PROVE_LOCKING
#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))

static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
				  const struct lockdep_map *b)
{
	return cmp_ptr(a, b);
}

static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a, *b;

	a = container_of(_a, struct unix_sock, lock.dep_map);
	b = container_of(_b, struct unix_sock, lock.dep_map);

	if (a->sk.sk_state == TCP_LISTEN) {
		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
		 *
		 *   1. a is TCP_LISTEN.
		 *   3. concurrent connect(b -> a) must fail.
		 *
		 * Except for 2. & 3., the b's state can be any possible
		 * value due to concurrent connect() or listen().
		 *
		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
		 * be expressed as lock_cmp_fn.
		 */
		switch (b->sk.sk_state) {
		case TCP_ESTABLISHED:
			return -1;
		}
	}

	/* Should never happen.  Just to be symmetric. */
	if (b->sk.sk_state == TCP_LISTEN) {
		switch (b->sk.sk_state) {
		case TCP_ESTABLISHED:
			return 1;
		}
	}

	/* unix_state_double_lock(): ascending address order. */
	return cmp_ptr(a, b);
}

static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct sock *a, *b;

	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);

	/* unix_collect_skb(): listener -> embryo order. */
	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
		return -1;

	/* Should never happen.  Just to be symmetric. */
	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
		return 1;

	return 0;
}
#endif

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock(&net->unx.table.locks[hash2]);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return !unix_peer(osk) || unix_peer(osk) == sk;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);

	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should not be zero length.
 *		- if it starts with a non-zero byte, it should be NULL terminated (FS object)
 *		- if it starts with zero, it is an abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

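/* Worked example with hypothetical values: for a bind to "/tmp/x", userspace
 * typically passes addr_len = offsetof(struct sockaddr_un, sun_path) +
 * strlen("/tmp/x") + 1 = 2 + 6 + 1 = 9, but it may also pass up to
 * sizeof(struct sockaddr_un).  Either way, after the NUL store above,
 * unix_mkname_bsd() returns offset + strlen("/tmp/x") + 1 = 9, i.e. the
 * length is normalized to cover exactly the family field, the path, and one
 * trailing NUL byte.
 */
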
372 static void __unix_remove_socket(struct sock
*sk
)
374 sk_del_node_init(sk
);
377 static void __unix_insert_socket(struct net
*net
, struct sock
*sk
)
379 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk
));
380 sk_add_node(sk
, &net
->unx
.table
.buckets
[sk
->sk_hash
]);
383 static void __unix_set_addr_hash(struct net
*net
, struct sock
*sk
,
384 struct unix_address
*addr
, unsigned int hash
)
386 __unix_remove_socket(sk
);
387 smp_store_release(&unix_sk(sk
)->addr
, addr
);
390 __unix_insert_socket(net
, sk
);
393 static void unix_remove_socket(struct net
*net
, struct sock
*sk
)
395 spin_lock(&net
->unx
.table
.locks
[sk
->sk_hash
]);
396 __unix_remove_socket(sk
);
397 spin_unlock(&net
->unx
.table
.locks
[sk
->sk_hash
]);
400 static void unix_insert_unbound_socket(struct net
*net
, struct sock
*sk
)
402 spin_lock(&net
->unx
.table
.locks
[sk
->sk_hash
]);
403 __unix_insert_socket(net
, sk
);
404 spin_unlock(&net
->unx
.table
.locks
[sk
->sk_hash
]);
407 static void unix_insert_bsd_socket(struct sock
*sk
)
409 spin_lock(&bsd_socket_locks
[sk
->sk_hash
]);
410 sk_add_bind_node(sk
, &bsd_socket_buckets
[sk
->sk_hash
]);
411 spin_unlock(&bsd_socket_locks
[sk
->sk_hash
]);
414 static void unix_remove_bsd_socket(struct sock
*sk
)
416 if (!hlist_unhashed(&sk
->sk_bind_node
)) {
417 spin_lock(&bsd_socket_locks
[sk
->sk_hash
]);
418 __sk_del_bind_node(sk
);
419 spin_unlock(&bsd_socket_locks
[sk
->sk_hash
]);
421 sk_node_init(&sk
->sk_bind_node
);
425 static struct sock
*__unix_find_socket_byname(struct net
*net
,
426 struct sockaddr_un
*sunname
,
427 int len
, unsigned int hash
)
431 sk_for_each(s
, &net
->unx
.table
.buckets
[hash
]) {
432 struct unix_sock
*u
= unix_sk(s
);
434 if (u
->addr
->len
== len
&&
435 !memcmp(u
->addr
->name
, sunname
, len
))
441 static inline struct sock
*unix_find_socket_byname(struct net
*net
,
442 struct sockaddr_un
*sunname
,
443 int len
, unsigned int hash
)
447 spin_lock(&net
->unx
.table
.locks
[hash
]);
448 s
= __unix_find_socket_byname(net
, sunname
, len
, hash
);
451 spin_unlock(&net
->unx
.table
.locks
[hash
]);
455 static struct sock
*unix_find_socket_byinode(struct inode
*i
)
457 unsigned int hash
= unix_bsd_hash(i
);
460 spin_lock(&bsd_socket_locks
[hash
]);
461 sk_for_each_bound(s
, &bsd_socket_buckets
[hash
]) {
462 struct dentry
*dentry
= unix_sk(s
)->path
.dentry
;
464 if (dentry
&& d_backing_inode(dentry
) == i
) {
466 spin_unlock(&bsd_socket_locks
[hash
]);
470 spin_unlock(&bsd_socket_locks
[hash
]);
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and is broken when
 * the association to the server socket is dissolved or after a wake
 * up was relayed.
 */

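/* Illustrative userspace view of the relay below (client_fd, buf and len are
 * made-up names): a sender connected to a congested datagram server can wait
 * for POLLOUT, and the relayed wake up is what ends that wait once the
 * server drains its receive queue.
 *
 *	struct pollfd pfd = { .fd = client_fd, .events = POLLOUT };
 *
 *	while (send(client_fd, buf, len, MSG_DONTWAIT) < 0 && errno == EAGAIN)
 *		poll(&pfd, 1, -1);	// woken when the peer's queue drains
 */
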
499 static int unix_dgram_peer_wake_relay(wait_queue_entry_t
*q
, unsigned mode
, int flags
,
503 wait_queue_head_t
*u_sleep
;
505 u
= container_of(q
, struct unix_sock
, peer_wake
);
507 __remove_wait_queue(&unix_sk(u
->peer_wake
.private)->peer_wait
,
509 u
->peer_wake
.private = NULL
;
511 /* relaying can only happen while the wq still exists */
512 u_sleep
= sk_sleep(&u
->sk
);
514 wake_up_interruptible_poll(u_sleep
, key_to_poll(key
));
519 static int unix_dgram_peer_wake_connect(struct sock
*sk
, struct sock
*other
)
521 struct unix_sock
*u
, *u_other
;
525 u_other
= unix_sk(other
);
527 spin_lock(&u_other
->peer_wait
.lock
);
529 if (!u
->peer_wake
.private) {
530 u
->peer_wake
.private = other
;
531 __add_wait_queue(&u_other
->peer_wait
, &u
->peer_wake
);
536 spin_unlock(&u_other
->peer_wait
.lock
);
540 static void unix_dgram_peer_wake_disconnect(struct sock
*sk
,
543 struct unix_sock
*u
, *u_other
;
546 u_other
= unix_sk(other
);
547 spin_lock(&u_other
->peer_wait
.lock
);
549 if (u
->peer_wake
.private == other
) {
550 __remove_wait_queue(&u_other
->peer_wait
, &u
->peer_wake
);
551 u
->peer_wake
.private = NULL
;
554 spin_unlock(&u_other
->peer_wait
.lock
);
557 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock
*sk
,
560 unix_dgram_peer_wake_disconnect(sk
, other
);
561 wake_up_interruptible_poll(sk_sleep(sk
),
568 * - unix_peer(sk) == other
569 * - association is stable
571 static int unix_dgram_peer_wake_me(struct sock
*sk
, struct sock
*other
)
575 connected
= unix_dgram_peer_wake_connect(sk
, other
);
577 /* If other is SOCK_DEAD, we want to make sure we signal
578 * POLLOUT, such that a subsequent write() can get a
579 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
580 * to other and its full, we will hang waiting for POLLOUT.
582 if (unix_recvq_full_lockless(other
) && !sock_flag(other
, SOCK_DEAD
))
586 unix_dgram_peer_wake_disconnect(sk
, other
);
591 static int unix_writable(const struct sock
*sk
, unsigned char state
)
593 return state
!= TCP_LISTEN
&&
594 (refcount_read(&sk
->sk_wmem_alloc
) << 2) <= READ_ONCE(sk
->sk_sndbuf
);
597 static void unix_write_space(struct sock
*sk
)
599 struct socket_wq
*wq
;
602 if (unix_writable(sk
, READ_ONCE(sk
->sk_state
))) {
603 wq
= rcu_dereference(sk
->sk_wq
);
604 if (skwq_has_sleeper(wq
))
605 wake_up_interruptible_sync_poll(&wq
->wait
,
606 EPOLLOUT
| EPOLLWRNORM
| EPOLLWRBAND
);
607 sk_wake_async_rcu(sk
, SOCK_WAKE_SPACE
, POLL_OUT
);
/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer.  First, this allows us
 * to do flow control based only on wmem_alloc; second, an sk connected to a
 * peer may receive messages only from that peer. */
616 static void unix_dgram_disconnected(struct sock
*sk
, struct sock
*other
)
618 if (!skb_queue_empty(&sk
->sk_receive_queue
)) {
619 skb_queue_purge_reason(&sk
->sk_receive_queue
,
620 SKB_DROP_REASON_UNIX_DISCONNECT
);
622 wake_up_interruptible_all(&unix_sk(sk
)->peer_wait
);
	/* If one link of a bidirectional dgram pipe is disconnected,
	 * we signal an error.  Messages are lost.  Don't do this when
	 * the peer was not connected to us.
	 */
628 if (!sock_flag(other
, SOCK_DEAD
) && unix_peer(other
) == sk
) {
629 WRITE_ONCE(other
->sk_err
, ECONNRESET
);
630 sk_error_report(other
);
635 static void unix_sock_destructor(struct sock
*sk
)
637 struct unix_sock
*u
= unix_sk(sk
);
639 skb_queue_purge_reason(&sk
->sk_receive_queue
, SKB_DROP_REASON_SOCKET_CLOSE
);
641 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk
->sk_wmem_alloc
));
642 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk
));
643 DEBUG_NET_WARN_ON_ONCE(sk
->sk_socket
);
644 if (!sock_flag(sk
, SOCK_DEAD
)) {
645 pr_info("Attempt to release alive unix socket: %p\n", sk
);
650 pidfs_put_pid(sk
->sk_peer_pid
);
653 unix_release_addr(u
->addr
);
655 atomic_long_dec(&unix_nr_socks
);
656 sock_prot_inuse_add(sock_net(sk
), sk
->sk_prot
, -1);
657 #ifdef UNIX_REFCNT_DEBUG
658 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk
,
659 atomic_long_read(&unix_nr_socks
));
663 static unsigned int unix_skb_len(const struct sk_buff
*skb
)
665 return skb
->len
- UNIXCB(skb
).consumed
;
668 static void unix_release_sock(struct sock
*sk
, int embrion
)
670 struct unix_sock
*u
= unix_sk(sk
);
676 unix_remove_socket(sock_net(sk
), sk
);
677 unix_remove_bsd_socket(sk
);
682 WRITE_ONCE(sk
->sk_shutdown
, SHUTDOWN_MASK
);
684 u
->path
.dentry
= NULL
;
686 state
= sk
->sk_state
;
687 WRITE_ONCE(sk
->sk_state
, TCP_CLOSE
);
689 skpair
= unix_peer(sk
);
690 unix_peer(sk
) = NULL
;
692 unix_state_unlock(sk
);
694 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
698 wake_up_interruptible_all(&u
->peer_wait
);
700 if (skpair
!= NULL
) {
701 if (sk
->sk_type
== SOCK_STREAM
|| sk
->sk_type
== SOCK_SEQPACKET
) {
702 struct sk_buff
*skb
= skb_peek(&sk
->sk_receive_queue
);
704 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
705 if (skb
&& !unix_skb_len(skb
))
706 skb
= skb_peek_next(skb
, &sk
->sk_receive_queue
);
708 unix_state_lock(skpair
);
710 WRITE_ONCE(skpair
->sk_shutdown
, SHUTDOWN_MASK
);
712 WRITE_ONCE(skpair
->sk_err
, ECONNRESET
);
713 unix_state_unlock(skpair
);
714 skpair
->sk_state_change(skpair
);
715 sk_wake_async(skpair
, SOCK_WAKE_WAITD
, POLL_HUP
);
718 unix_dgram_peer_wake_disconnect(sk
, skpair
);
719 sock_put(skpair
); /* It may now die */
722 /* Try to flush out this socket. Throw out buffers at least */
724 while ((skb
= skb_dequeue(&sk
->sk_receive_queue
)) != NULL
) {
725 if (state
== TCP_LISTEN
)
726 unix_release_sock(skb
->sk
, 1);
728 /* passed fds are erased in the kfree_skb hook */
729 kfree_skb_reason(skb
, SKB_DROP_REASON_SOCKET_CLOSE
);
737 /* ---- Socket is dead now and most probably destroyed ---- */
	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */
750 if (READ_ONCE(unix_tot_inflight
))
751 unix_gc(); /* Garbage collect fds */
754 struct unix_peercred
{
755 struct pid
*peer_pid
;
756 const struct cred
*peer_cred
;
759 static inline int prepare_peercred(struct unix_peercred
*peercred
)
764 pid
= task_tgid(current
);
765 err
= pidfs_register_pid(pid
);
767 peercred
->peer_pid
= get_pid(pid
);
768 peercred
->peer_cred
= get_current_cred();
773 static void drop_peercred(struct unix_peercred
*peercred
)
775 const struct cred
*cred
= NULL
;
776 struct pid
*pid
= NULL
;
780 swap(peercred
->peer_pid
, pid
);
781 swap(peercred
->peer_cred
, cred
);
788 static inline void init_peercred(struct sock
*sk
,
789 const struct unix_peercred
*peercred
)
791 sk
->sk_peer_pid
= peercred
->peer_pid
;
792 sk
->sk_peer_cred
= peercred
->peer_cred
;
795 static void update_peercred(struct sock
*sk
, struct unix_peercred
*peercred
)
797 const struct cred
*old_cred
;
800 spin_lock(&sk
->sk_peer_lock
);
801 old_pid
= sk
->sk_peer_pid
;
802 old_cred
= sk
->sk_peer_cred
;
803 init_peercred(sk
, peercred
);
804 spin_unlock(&sk
->sk_peer_lock
);
806 peercred
->peer_pid
= old_pid
;
807 peercred
->peer_cred
= old_cred
;
810 static void copy_peercred(struct sock
*sk
, struct sock
*peersk
)
812 lockdep_assert_held(&unix_sk(peersk
)->lock
);
814 spin_lock(&sk
->sk_peer_lock
);
815 sk
->sk_peer_pid
= get_pid(peersk
->sk_peer_pid
);
816 pidfs_get_pid(sk
->sk_peer_pid
);
817 sk
->sk_peer_cred
= get_cred(peersk
->sk_peer_cred
);
818 spin_unlock(&sk
->sk_peer_lock
);
821 static bool unix_may_passcred(const struct sock
*sk
)
823 return sk
->sk_scm_credentials
|| sk
->sk_scm_pidfd
;
826 static int unix_listen(struct socket
*sock
, int backlog
)
829 struct sock
*sk
= sock
->sk
;
830 struct unix_sock
*u
= unix_sk(sk
);
831 struct unix_peercred peercred
= {};
834 if (sock
->type
!= SOCK_STREAM
&& sock
->type
!= SOCK_SEQPACKET
)
835 goto out
; /* Only stream/seqpacket sockets accept */
837 if (!READ_ONCE(u
->addr
))
838 goto out
; /* No listens on an unbound socket */
839 err
= prepare_peercred(&peercred
);
843 if (sk
->sk_state
!= TCP_CLOSE
&& sk
->sk_state
!= TCP_LISTEN
)
845 if (backlog
> sk
->sk_max_ack_backlog
)
846 wake_up_interruptible_all(&u
->peer_wait
);
847 sk
->sk_max_ack_backlog
= backlog
;
848 WRITE_ONCE(sk
->sk_state
, TCP_LISTEN
);
850 /* set credentials so connect can copy them */
851 update_peercred(sk
, &peercred
);
855 unix_state_unlock(sk
);
856 drop_peercred(&peercred
);
861 static int unix_release(struct socket
*);
862 static int unix_bind(struct socket
*, struct sockaddr
*, int);
863 static int unix_stream_connect(struct socket
*, struct sockaddr
*,
864 int addr_len
, int flags
);
865 static int unix_socketpair(struct socket
*, struct socket
*);
866 static int unix_accept(struct socket
*, struct socket
*, struct proto_accept_arg
*arg
);
867 static int unix_getname(struct socket
*, struct sockaddr
*, int);
868 static __poll_t
unix_poll(struct file
*, struct socket
*, poll_table
*);
869 static __poll_t
unix_dgram_poll(struct file
*, struct socket
*,
871 static int unix_ioctl(struct socket
*, unsigned int, unsigned long);
873 static int unix_compat_ioctl(struct socket
*sock
, unsigned int cmd
, unsigned long arg
);
875 static int unix_shutdown(struct socket
*, int);
876 static int unix_stream_sendmsg(struct socket
*, struct msghdr
*, size_t);
877 static int unix_stream_recvmsg(struct socket
*, struct msghdr
*, size_t, int);
878 static ssize_t
unix_stream_splice_read(struct socket
*, loff_t
*ppos
,
879 struct pipe_inode_info
*, size_t size
,
881 static int unix_dgram_sendmsg(struct socket
*, struct msghdr
*, size_t);
882 static int unix_dgram_recvmsg(struct socket
*, struct msghdr
*, size_t, int);
883 static int unix_read_skb(struct sock
*sk
, skb_read_actor_t recv_actor
);
884 static int unix_stream_read_skb(struct sock
*sk
, skb_read_actor_t recv_actor
);
885 static int unix_dgram_connect(struct socket
*, struct sockaddr
*,
887 static int unix_seqpacket_sendmsg(struct socket
*, struct msghdr
*, size_t);
888 static int unix_seqpacket_recvmsg(struct socket
*, struct msghdr
*, size_t,
891 #ifdef CONFIG_PROC_FS
892 static int unix_count_nr_fds(struct sock
*sk
)
898 spin_lock(&sk
->sk_receive_queue
.lock
);
899 skb
= skb_peek(&sk
->sk_receive_queue
);
901 u
= unix_sk(skb
->sk
);
902 nr_fds
+= atomic_read(&u
->scm_stat
.nr_fds
);
903 skb
= skb_peek_next(skb
, &sk
->sk_receive_queue
);
905 spin_unlock(&sk
->sk_receive_queue
.lock
);
910 static void unix_show_fdinfo(struct seq_file
*m
, struct socket
*sock
)
912 struct sock
*sk
= sock
->sk
;
913 unsigned char s_state
;
918 s_state
= READ_ONCE(sk
->sk_state
);
921 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
922 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
923 * SOCK_DGRAM is ordinary. So, no lock is needed.
925 if (sock
->type
== SOCK_DGRAM
|| s_state
== TCP_ESTABLISHED
)
926 nr_fds
= atomic_read(&u
->scm_stat
.nr_fds
);
927 else if (s_state
== TCP_LISTEN
)
928 nr_fds
= unix_count_nr_fds(sk
);
930 seq_printf(m
, "scm_fds: %u\n", nr_fds
);
934 #define unix_show_fdinfo NULL
937 static const struct proto_ops unix_stream_ops
= {
939 .owner
= THIS_MODULE
,
940 .release
= unix_release
,
942 .connect
= unix_stream_connect
,
943 .socketpair
= unix_socketpair
,
944 .accept
= unix_accept
,
945 .getname
= unix_getname
,
949 .compat_ioctl
= unix_compat_ioctl
,
951 .listen
= unix_listen
,
952 .shutdown
= unix_shutdown
,
953 .sendmsg
= unix_stream_sendmsg
,
954 .recvmsg
= unix_stream_recvmsg
,
955 .read_skb
= unix_stream_read_skb
,
956 .mmap
= sock_no_mmap
,
957 .splice_read
= unix_stream_splice_read
,
958 .set_peek_off
= sk_set_peek_off
,
959 .show_fdinfo
= unix_show_fdinfo
,
962 static const struct proto_ops unix_dgram_ops
= {
964 .owner
= THIS_MODULE
,
965 .release
= unix_release
,
967 .connect
= unix_dgram_connect
,
968 .socketpair
= unix_socketpair
,
969 .accept
= sock_no_accept
,
970 .getname
= unix_getname
,
971 .poll
= unix_dgram_poll
,
974 .compat_ioctl
= unix_compat_ioctl
,
976 .listen
= sock_no_listen
,
977 .shutdown
= unix_shutdown
,
978 .sendmsg
= unix_dgram_sendmsg
,
979 .read_skb
= unix_read_skb
,
980 .recvmsg
= unix_dgram_recvmsg
,
981 .mmap
= sock_no_mmap
,
982 .set_peek_off
= sk_set_peek_off
,
983 .show_fdinfo
= unix_show_fdinfo
,
986 static const struct proto_ops unix_seqpacket_ops
= {
988 .owner
= THIS_MODULE
,
989 .release
= unix_release
,
991 .connect
= unix_stream_connect
,
992 .socketpair
= unix_socketpair
,
993 .accept
= unix_accept
,
994 .getname
= unix_getname
,
995 .poll
= unix_dgram_poll
,
998 .compat_ioctl
= unix_compat_ioctl
,
1000 .listen
= unix_listen
,
1001 .shutdown
= unix_shutdown
,
1002 .sendmsg
= unix_seqpacket_sendmsg
,
1003 .recvmsg
= unix_seqpacket_recvmsg
,
1004 .mmap
= sock_no_mmap
,
1005 .set_peek_off
= sk_set_peek_off
,
1006 .show_fdinfo
= unix_show_fdinfo
,
1009 static void unix_close(struct sock
*sk
, long timeout
)
1011 /* Nothing to do here, unix socket does not need a ->close().
1012 * This is merely for sockmap.
1016 static bool unix_bpf_bypass_getsockopt(int level
, int optname
)
1018 if (level
== SOL_SOCKET
) {
1030 struct proto unix_dgram_proto
= {
1032 .owner
= THIS_MODULE
,
1033 .obj_size
= sizeof(struct unix_sock
),
1034 .close
= unix_close
,
1035 .bpf_bypass_getsockopt
= unix_bpf_bypass_getsockopt
,
1036 #ifdef CONFIG_BPF_SYSCALL
1037 .psock_update_sk_prot
= unix_dgram_bpf_update_proto
,
1041 struct proto unix_stream_proto
= {
1042 .name
= "UNIX-STREAM",
1043 .owner
= THIS_MODULE
,
1044 .obj_size
= sizeof(struct unix_sock
),
1045 .close
= unix_close
,
1046 .bpf_bypass_getsockopt
= unix_bpf_bypass_getsockopt
,
1047 #ifdef CONFIG_BPF_SYSCALL
1048 .psock_update_sk_prot
= unix_stream_bpf_update_proto
,
1052 static struct sock
*unix_create1(struct net
*net
, struct socket
*sock
, int kern
, int type
)
1054 struct unix_sock
*u
;
1058 atomic_long_inc(&unix_nr_socks
);
1059 if (atomic_long_read(&unix_nr_socks
) > 2 * get_max_files()) {
1064 if (type
== SOCK_STREAM
)
1065 sk
= sk_alloc(net
, PF_UNIX
, GFP_KERNEL
, &unix_stream_proto
, kern
);
1066 else /*dgram and seqpacket */
1067 sk
= sk_alloc(net
, PF_UNIX
, GFP_KERNEL
, &unix_dgram_proto
, kern
);
1074 sock_init_data(sock
, sk
);
1076 sk
->sk_scm_rights
= 1;
1077 sk
->sk_hash
= unix_unbound_hash(sk
);
1078 sk
->sk_allocation
= GFP_KERNEL_ACCOUNT
;
1079 sk
->sk_write_space
= unix_write_space
;
1080 sk
->sk_max_ack_backlog
= READ_ONCE(net
->unx
.sysctl_max_dgram_qlen
);
1081 sk
->sk_destruct
= unix_sock_destructor
;
1082 lock_set_cmp_fn(&sk
->sk_receive_queue
.lock
, unix_recvq_lock_cmp_fn
, NULL
);
1087 u
->path
.dentry
= NULL
;
1089 spin_lock_init(&u
->lock
);
1090 lock_set_cmp_fn(&u
->lock
, unix_state_lock_cmp_fn
, NULL
);
1091 mutex_init(&u
->iolock
); /* single task reading lock */
1092 mutex_init(&u
->bindlock
); /* single task binding lock */
1093 init_waitqueue_head(&u
->peer_wait
);
1094 init_waitqueue_func_entry(&u
->peer_wake
, unix_dgram_peer_wake_relay
);
1095 memset(&u
->scm_stat
, 0, sizeof(struct scm_stat
));
1096 unix_insert_unbound_socket(net
, sk
);
1098 sock_prot_inuse_add(net
, sk
->sk_prot
, 1);
1103 atomic_long_dec(&unix_nr_socks
);
1104 return ERR_PTR(err
);
1107 static int unix_create(struct net
*net
, struct socket
*sock
, int protocol
,
1112 if (protocol
&& protocol
!= PF_UNIX
)
1113 return -EPROTONOSUPPORT
;
1115 sock
->state
= SS_UNCONNECTED
;
1117 switch (sock
->type
) {
1119 sock
->ops
= &unix_stream_ops
;
1122 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
1126 sock
->type
= SOCK_DGRAM
;
1129 sock
->ops
= &unix_dgram_ops
;
1131 case SOCK_SEQPACKET
:
1132 sock
->ops
= &unix_seqpacket_ops
;
1135 return -ESOCKTNOSUPPORT
;
1138 sk
= unix_create1(net
, sock
, kern
, sock
->type
);
1145 static int unix_release(struct socket
*sock
)
1147 struct sock
*sk
= sock
->sk
;
1152 sk
->sk_prot
->close(sk
, 0);
1153 unix_release_sock(sk
, 0);
1159 static struct sock
*unix_find_bsd(struct sockaddr_un
*sunaddr
, int addr_len
,
1160 int type
, int flags
)
1162 struct inode
*inode
;
1167 unix_mkname_bsd(sunaddr
, addr_len
);
1169 if (flags
& SOCK_COREDUMP
) {
1170 const struct cred
*cred
;
1174 kcred
= prepare_kernel_cred(&init_task
);
1180 task_lock(&init_task
);
1181 get_fs_root(init_task
.fs
, &root
);
1182 task_unlock(&init_task
);
1184 cred
= override_creds(kcred
);
1185 err
= vfs_path_lookup(root
.dentry
, root
.mnt
, sunaddr
->sun_path
,
1186 LOOKUP_BENEATH
| LOOKUP_NO_SYMLINKS
|
1187 LOOKUP_NO_MAGICLINKS
, &path
);
1188 put_cred(revert_creds(cred
));
1193 err
= kern_path(sunaddr
->sun_path
, LOOKUP_FOLLOW
, &path
);
1197 err
= path_permission(&path
, MAY_WRITE
);
1202 err
= -ECONNREFUSED
;
1203 inode
= d_backing_inode(path
.dentry
);
1204 if (!S_ISSOCK(inode
->i_mode
))
1207 sk
= unix_find_socket_byinode(inode
);
1212 if (sk
->sk_type
== type
)
1226 return ERR_PTR(err
);
1229 static struct sock
*unix_find_abstract(struct net
*net
,
1230 struct sockaddr_un
*sunaddr
,
1231 int addr_len
, int type
)
1233 unsigned int hash
= unix_abstract_hash(sunaddr
, addr_len
, type
);
1234 struct dentry
*dentry
;
1237 sk
= unix_find_socket_byname(net
, sunaddr
, addr_len
, hash
);
1239 return ERR_PTR(-ECONNREFUSED
);
1241 dentry
= unix_sk(sk
)->path
.dentry
;
1243 touch_atime(&unix_sk(sk
)->path
);
1248 static struct sock
*unix_find_other(struct net
*net
,
1249 struct sockaddr_un
*sunaddr
,
1250 int addr_len
, int type
, int flags
)
1254 if (sunaddr
->sun_path
[0])
1255 sk
= unix_find_bsd(sunaddr
, addr_len
, type
, flags
);
1257 sk
= unix_find_abstract(net
, sunaddr
, addr_len
, type
);
1262 static int unix_autobind(struct sock
*sk
)
1264 struct unix_sock
*u
= unix_sk(sk
);
1265 unsigned int new_hash
, old_hash
;
1266 struct net
*net
= sock_net(sk
);
1267 struct unix_address
*addr
;
1268 u32 lastnum
, ordernum
;
1271 err
= mutex_lock_interruptible(&u
->bindlock
);
1279 addr
= kzalloc(sizeof(*addr
) +
1280 offsetof(struct sockaddr_un
, sun_path
) + 16, GFP_KERNEL
);
1284 addr
->len
= offsetof(struct sockaddr_un
, sun_path
) + 6;
1285 addr
->name
->sun_family
= AF_UNIX
;
1286 refcount_set(&addr
->refcnt
, 1);
1288 old_hash
= sk
->sk_hash
;
1289 ordernum
= get_random_u32();
1290 lastnum
= ordernum
& 0xFFFFF;
1292 ordernum
= (ordernum
+ 1) & 0xFFFFF;
1293 sprintf(addr
->name
->sun_path
+ 1, "%05x", ordernum
);
1295 new_hash
= unix_abstract_hash(addr
->name
, addr
->len
, sk
->sk_type
);
1296 unix_table_double_lock(net
, old_hash
, new_hash
);
1298 if (__unix_find_socket_byname(net
, addr
->name
, addr
->len
, new_hash
)) {
1299 unix_table_double_unlock(net
, old_hash
, new_hash
);
1301 /* __unix_find_socket_byname() may take long time if many names
1302 * are already in use.
1306 if (ordernum
== lastnum
) {
1307 /* Give up if all names seems to be in use. */
1309 unix_release_addr(addr
);
1316 __unix_set_addr_hash(net
, sk
, addr
, new_hash
);
1317 unix_table_double_unlock(net
, old_hash
, new_hash
);
1320 out
: mutex_unlock(&u
->bindlock
);
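/* Userspace sketch of the autobind above (illustrative only; srv is a made-up
 * peer address): sending or connecting from a never-bound socket with
 * credential passing enabled binds it to "\0" followed by five hex digits,
 * which getsockname() then reports.
 *
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	int on = 1;
 *	struct sockaddr_un sun;
 *	socklen_t len = sizeof(sun);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	connect(fd, (struct sockaddr *)&srv, sizeof(srv));	// triggers autobind
 *	getsockname(fd, (struct sockaddr *)&sun, &len);
 *	// len == offsetof(struct sockaddr_un, sun_path) + 6, sun.sun_path[0] == '\0'
 */
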
1324 static int unix_bind_bsd(struct sock
*sk
, struct sockaddr_un
*sunaddr
,
1327 umode_t mode
= S_IFSOCK
|
1328 (SOCK_INODE(sk
->sk_socket
)->i_mode
& ~current_umask());
1329 struct unix_sock
*u
= unix_sk(sk
);
1330 unsigned int new_hash
, old_hash
;
1331 struct net
*net
= sock_net(sk
);
1332 struct mnt_idmap
*idmap
;
1333 struct unix_address
*addr
;
1334 struct dentry
*dentry
;
1338 addr_len
= unix_mkname_bsd(sunaddr
, addr_len
);
1339 addr
= unix_create_addr(sunaddr
, addr_len
);
1344 * Get the parent directory, calculate the hash for last
1347 dentry
= kern_path_create(AT_FDCWD
, addr
->name
->sun_path
, &parent
, 0);
1348 if (IS_ERR(dentry
)) {
1349 err
= PTR_ERR(dentry
);
1354 * All right, let's create it.
1356 idmap
= mnt_idmap(parent
.mnt
);
1357 err
= security_path_mknod(&parent
, dentry
, mode
, 0);
1359 err
= vfs_mknod(idmap
, d_inode(parent
.dentry
), dentry
, mode
, 0);
1362 err
= mutex_lock_interruptible(&u
->bindlock
);
1368 old_hash
= sk
->sk_hash
;
1369 new_hash
= unix_bsd_hash(d_backing_inode(dentry
));
1370 unix_table_double_lock(net
, old_hash
, new_hash
);
1371 u
->path
.mnt
= mntget(parent
.mnt
);
1372 u
->path
.dentry
= dget(dentry
);
1373 __unix_set_addr_hash(net
, sk
, addr
, new_hash
);
1374 unix_table_double_unlock(net
, old_hash
, new_hash
);
1375 unix_insert_bsd_socket(sk
);
1376 mutex_unlock(&u
->bindlock
);
1377 done_path_create(&parent
, dentry
);
1381 mutex_unlock(&u
->bindlock
);
1384 /* failed after successful mknod? unlink what we'd created... */
1385 vfs_unlink(idmap
, d_inode(parent
.dentry
), dentry
, NULL
);
1387 done_path_create(&parent
, dentry
);
1389 unix_release_addr(addr
);
1390 return err
== -EEXIST
? -EADDRINUSE
: err
;
1393 static int unix_bind_abstract(struct sock
*sk
, struct sockaddr_un
*sunaddr
,
1396 struct unix_sock
*u
= unix_sk(sk
);
1397 unsigned int new_hash
, old_hash
;
1398 struct net
*net
= sock_net(sk
);
1399 struct unix_address
*addr
;
1402 addr
= unix_create_addr(sunaddr
, addr_len
);
1406 err
= mutex_lock_interruptible(&u
->bindlock
);
1415 old_hash
= sk
->sk_hash
;
1416 new_hash
= unix_abstract_hash(addr
->name
, addr
->len
, sk
->sk_type
);
1417 unix_table_double_lock(net
, old_hash
, new_hash
);
1419 if (__unix_find_socket_byname(net
, addr
->name
, addr
->len
, new_hash
))
1422 __unix_set_addr_hash(net
, sk
, addr
, new_hash
);
1423 unix_table_double_unlock(net
, old_hash
, new_hash
);
1424 mutex_unlock(&u
->bindlock
);
1428 unix_table_double_unlock(net
, old_hash
, new_hash
);
1431 mutex_unlock(&u
->bindlock
);
1433 unix_release_addr(addr
);
1437 static int unix_bind(struct socket
*sock
, struct sockaddr
*uaddr
, int addr_len
)
1439 struct sockaddr_un
*sunaddr
= (struct sockaddr_un
*)uaddr
;
1440 struct sock
*sk
= sock
->sk
;
1443 if (addr_len
== offsetof(struct sockaddr_un
, sun_path
) &&
1444 sunaddr
->sun_family
== AF_UNIX
)
1445 return unix_autobind(sk
);
1447 err
= unix_validate_addr(sunaddr
, addr_len
);
1451 if (sunaddr
->sun_path
[0])
1452 err
= unix_bind_bsd(sk
, sunaddr
, addr_len
);
1454 err
= unix_bind_abstract(sk
, sunaddr
, addr_len
);
1459 static void unix_state_double_lock(struct sock
*sk1
, struct sock
*sk2
)
1461 if (unlikely(sk1
== sk2
) || !sk2
) {
1462 unix_state_lock(sk1
);
1469 unix_state_lock(sk1
);
1470 unix_state_lock(sk2
);
1473 static void unix_state_double_unlock(struct sock
*sk1
, struct sock
*sk2
)
1475 if (unlikely(sk1
== sk2
) || !sk2
) {
1476 unix_state_unlock(sk1
);
1479 unix_state_unlock(sk1
);
1480 unix_state_unlock(sk2
);
1483 static int unix_dgram_connect(struct socket
*sock
, struct sockaddr
*addr
,
1484 int alen
, int flags
)
1486 struct sockaddr_un
*sunaddr
= (struct sockaddr_un
*)addr
;
1487 struct sock
*sk
= sock
->sk
;
1492 if (alen
< offsetofend(struct sockaddr
, sa_family
))
1495 if (addr
->sa_family
!= AF_UNSPEC
) {
1496 err
= unix_validate_addr(sunaddr
, alen
);
1500 err
= BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk
, addr
, &alen
);
1504 if (unix_may_passcred(sk
) && !READ_ONCE(unix_sk(sk
)->addr
)) {
1505 err
= unix_autobind(sk
);
1511 other
= unix_find_other(sock_net(sk
), sunaddr
, alen
, sock
->type
, 0);
1512 if (IS_ERR(other
)) {
1513 err
= PTR_ERR(other
);
1517 unix_state_double_lock(sk
, other
);
1519 /* Apparently VFS overslept socket death. Retry. */
1520 if (sock_flag(other
, SOCK_DEAD
)) {
1521 unix_state_double_unlock(sk
, other
);
1527 if (!unix_may_send(sk
, other
))
1530 err
= security_unix_may_send(sk
->sk_socket
, other
->sk_socket
);
1534 WRITE_ONCE(sk
->sk_state
, TCP_ESTABLISHED
);
1535 WRITE_ONCE(other
->sk_state
, TCP_ESTABLISHED
);
1538 * 1003.1g breaking connected state with AF_UNSPEC
1541 unix_state_double_lock(sk
, other
);
1545 * If it was connected, reconnect.
1547 if (unix_peer(sk
)) {
1548 struct sock
*old_peer
= unix_peer(sk
);
1550 unix_peer(sk
) = other
;
1552 WRITE_ONCE(sk
->sk_state
, TCP_CLOSE
);
1553 unix_dgram_peer_wake_disconnect_wakeup(sk
, old_peer
);
1555 unix_state_double_unlock(sk
, other
);
1557 if (other
!= old_peer
) {
1558 unix_dgram_disconnected(sk
, old_peer
);
1560 unix_state_lock(old_peer
);
1561 if (!unix_peer(old_peer
))
1562 WRITE_ONCE(old_peer
->sk_state
, TCP_CLOSE
);
1563 unix_state_unlock(old_peer
);
1568 unix_peer(sk
) = other
;
1569 unix_state_double_unlock(sk
, other
);
1575 unix_state_double_unlock(sk
, other
);
1581 static long unix_wait_for_peer(struct sock
*other
, long timeo
)
1583 struct unix_sock
*u
= unix_sk(other
);
1587 prepare_to_wait_exclusive(&u
->peer_wait
, &wait
, TASK_INTERRUPTIBLE
);
1589 sched
= !sock_flag(other
, SOCK_DEAD
) &&
1590 !(other
->sk_shutdown
& RCV_SHUTDOWN
) &&
1591 unix_recvq_full_lockless(other
);
1593 unix_state_unlock(other
);
1596 timeo
= schedule_timeout(timeo
);
1598 finish_wait(&u
->peer_wait
, &wait
);
1602 static int unix_stream_connect(struct socket
*sock
, struct sockaddr
*uaddr
,
1603 int addr_len
, int flags
)
1605 struct sockaddr_un
*sunaddr
= (struct sockaddr_un
*)uaddr
;
1606 struct sock
*sk
= sock
->sk
, *newsk
= NULL
, *other
= NULL
;
1607 struct unix_sock
*u
= unix_sk(sk
), *newu
, *otheru
;
1608 struct unix_peercred peercred
= {};
1609 struct net
*net
= sock_net(sk
);
1610 struct sk_buff
*skb
= NULL
;
1611 unsigned char state
;
1615 err
= unix_validate_addr(sunaddr
, addr_len
);
1619 err
= BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk
, uaddr
, &addr_len
);
1623 if (unix_may_passcred(sk
) && !READ_ONCE(u
->addr
)) {
1624 err
= unix_autobind(sk
);
1629 timeo
= sock_sndtimeo(sk
, flags
& O_NONBLOCK
);
1631 /* First of all allocate resources.
1632 * If we will make it after state is locked,
1633 * we will have to recheck all again in any case.
1636 /* create new sock for complete connection */
1637 newsk
= unix_create1(net
, NULL
, 0, sock
->type
);
1638 if (IS_ERR(newsk
)) {
1639 err
= PTR_ERR(newsk
);
1643 err
= prepare_peercred(&peercred
);
1647 /* Allocate skb for sending to listening sock */
1648 skb
= sock_wmalloc(newsk
, 1, 0, GFP_KERNEL
);
1655 /* Find listening sock. */
1656 other
= unix_find_other(net
, sunaddr
, addr_len
, sk
->sk_type
, flags
);
1657 if (IS_ERR(other
)) {
1658 err
= PTR_ERR(other
);
1662 unix_state_lock(other
);
1664 /* Apparently VFS overslept socket death. Retry. */
1665 if (sock_flag(other
, SOCK_DEAD
)) {
1666 unix_state_unlock(other
);
1671 if (other
->sk_state
!= TCP_LISTEN
||
1672 other
->sk_shutdown
& RCV_SHUTDOWN
) {
1673 err
= -ECONNREFUSED
;
1677 if (unix_recvq_full_lockless(other
)) {
1683 timeo
= unix_wait_for_peer(other
, timeo
);
1686 err
= sock_intr_errno(timeo
);
1687 if (signal_pending(current
))
1693 /* self connect and simultaneous connect are eliminated
1694 * by rejecting TCP_LISTEN socket to avoid deadlock.
1696 state
= READ_ONCE(sk
->sk_state
);
1697 if (unlikely(state
!= TCP_CLOSE
)) {
1698 err
= state
== TCP_ESTABLISHED
? -EISCONN
: -EINVAL
;
1702 unix_state_lock(sk
);
1704 if (unlikely(sk
->sk_state
!= TCP_CLOSE
)) {
1705 err
= sk
->sk_state
== TCP_ESTABLISHED
? -EISCONN
: -EINVAL
;
1706 unix_state_unlock(sk
);
1710 err
= security_unix_stream_connect(sk
, other
, newsk
);
1712 unix_state_unlock(sk
);
1716 /* The way is open! Fastly set all the necessary fields... */
1719 unix_peer(newsk
) = sk
;
1720 newsk
->sk_state
= TCP_ESTABLISHED
;
1721 newsk
->sk_type
= sk
->sk_type
;
1722 newsk
->sk_scm_recv_flags
= other
->sk_scm_recv_flags
;
1723 init_peercred(newsk
, &peercred
);
1725 newu
= unix_sk(newsk
);
1726 newu
->listener
= other
;
1727 RCU_INIT_POINTER(newsk
->sk_wq
, &newu
->peer_wq
);
1728 otheru
= unix_sk(other
);
1730 /* copy address information from listening to new sock
1732 * The contents of *(otheru->addr) and otheru->path
1733 * are seen fully set up here, since we have found
1734 * otheru in hash under its lock. Insertion into the
1735 * hash chain we'd found it in had been done in an
1736 * earlier critical area protected by the chain's lock,
1737 * the same one where we'd set *(otheru->addr) contents,
1738 * as well as otheru->path and otheru->addr itself.
1740 * Using smp_store_release() here to set newu->addr
1741 * is enough to make those stores, as well as stores
1742 * to newu->path visible to anyone who gets newu->addr
1743 * by smp_load_acquire(). IOW, the same warranties
1744 * as for unix_sock instances bound in unix_bind() or
1745 * in unix_autobind().
1747 if (otheru
->path
.dentry
) {
1748 path_get(&otheru
->path
);
1749 newu
->path
= otheru
->path
;
1751 refcount_inc(&otheru
->addr
->refcnt
);
1752 smp_store_release(&newu
->addr
, otheru
->addr
);
1754 /* Set credentials */
1755 copy_peercred(sk
, other
);
1757 sock
->state
= SS_CONNECTED
;
1758 WRITE_ONCE(sk
->sk_state
, TCP_ESTABLISHED
);
1761 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1762 unix_peer(sk
) = newsk
;
1764 unix_state_unlock(sk
);
1766 /* take ten and send info to listening sock */
1767 spin_lock(&other
->sk_receive_queue
.lock
);
1768 __skb_queue_tail(&other
->sk_receive_queue
, skb
);
1769 spin_unlock(&other
->sk_receive_queue
.lock
);
1770 unix_state_unlock(other
);
1771 other
->sk_data_ready(other
);
1776 unix_state_unlock(other
);
1781 unix_release_sock(newsk
, 0);
1783 drop_peercred(&peercred
);
1787 static int unix_socketpair(struct socket
*socka
, struct socket
*sockb
)
1789 struct unix_peercred ska_peercred
= {}, skb_peercred
= {};
1790 struct sock
*ska
= socka
->sk
, *skb
= sockb
->sk
;
1793 err
= prepare_peercred(&ska_peercred
);
1797 err
= prepare_peercred(&skb_peercred
);
1799 drop_peercred(&ska_peercred
);
1803 /* Join our sockets back to back */
1806 unix_peer(ska
) = skb
;
1807 unix_peer(skb
) = ska
;
1808 init_peercred(ska
, &ska_peercred
);
1809 init_peercred(skb
, &skb_peercred
);
1811 ska
->sk_state
= TCP_ESTABLISHED
;
1812 skb
->sk_state
= TCP_ESTABLISHED
;
1813 socka
->state
= SS_CONNECTED
;
1814 sockb
->state
= SS_CONNECTED
;
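/* Minimal userspace usage sketch for the path above (illustrative only):
 * both descriptors come back already connected, and SO_PEERCRED on either
 * end reflects the creating task, as set up by init_peercred() above.
 *
 *	int sv[2];
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv) == 0) {
 *		getsockopt(sv[0], SOL_SOCKET, SO_PEERCRED, &peer, &len);
 *		// peer.pid is the caller's tgid, peer.uid/gid its credentials
 *	}
 */
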
1818 static int unix_accept(struct socket
*sock
, struct socket
*newsock
,
1819 struct proto_accept_arg
*arg
)
1821 struct sock
*sk
= sock
->sk
;
1822 struct sk_buff
*skb
;
1825 arg
->err
= -EOPNOTSUPP
;
1826 if (sock
->type
!= SOCK_STREAM
&& sock
->type
!= SOCK_SEQPACKET
)
1830 if (READ_ONCE(sk
->sk_state
) != TCP_LISTEN
)
1833 /* If socket state is TCP_LISTEN it cannot change (for now...),
1834 * so that no locks are necessary.
1837 skb
= skb_recv_datagram(sk
, (arg
->flags
& O_NONBLOCK
) ? MSG_DONTWAIT
: 0,
1840 /* This means receive shutdown. */
1847 skb_free_datagram(sk
, skb
);
1848 wake_up_interruptible(&unix_sk(sk
)->peer_wait
);
1850 /* attach accepted sock to socket */
1851 unix_state_lock(tsk
);
1852 unix_update_edges(unix_sk(tsk
));
1853 newsock
->state
= SS_CONNECTED
;
1854 sock_graft(tsk
, newsock
);
1855 unix_state_unlock(tsk
);
1863 static int unix_getname(struct socket
*sock
, struct sockaddr
*uaddr
, int peer
)
1865 struct sock
*sk
= sock
->sk
;
1866 struct unix_address
*addr
;
1867 DECLARE_SOCKADDR(struct sockaddr_un
*, sunaddr
, uaddr
);
1871 sk
= unix_peer_get(sk
);
1881 addr
= smp_load_acquire(&unix_sk(sk
)->addr
);
1883 sunaddr
->sun_family
= AF_UNIX
;
1884 sunaddr
->sun_path
[0] = 0;
1885 err
= offsetof(struct sockaddr_un
, sun_path
);
1888 memcpy(sunaddr
, addr
->name
, addr
->len
);
1891 BPF_CGROUP_RUN_SA_PROG(sk
, uaddr
, &err
,
1892 CGROUP_UNIX_GETPEERNAME
);
1894 BPF_CGROUP_RUN_SA_PROG(sk
, uaddr
, &err
,
1895 CGROUP_UNIX_GETSOCKNAME
);
/* The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
1907 static inline bool too_many_unix_fds(struct task_struct
*p
)
1909 struct user_struct
*user
= current_user();
1911 if (unlikely(READ_ONCE(user
->unix_inflight
) > task_rlimit(p
, RLIMIT_NOFILE
)))
1912 return !capable(CAP_SYS_RESOURCE
) && !capable(CAP_SYS_ADMIN
);
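/* Userspace sketch of the fd passing that the check above accounts for
 * (illustrative only; sock_fd and fd_to_send are made-up names).  Each
 * descriptor sent with SCM_RIGHTS stays charged to the sending user until
 * the receiver consumes it; once more than RLIMIT_NOFILE descriptors are in
 * flight, an unprivileged sendmsg() fails with ETOOMANYREFS.
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))] = {};
 *	char dummy = 0;
 *	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int));
 *	sendmsg(sock_fd, &msg, 0);
 */
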
1916 static int unix_attach_fds(struct scm_cookie
*scm
, struct sk_buff
*skb
)
1918 if (too_many_unix_fds(current
))
1919 return -ETOOMANYREFS
;
1921 UNIXCB(skb
).fp
= scm
->fp
;
1924 if (unix_prepare_fpl(UNIXCB(skb
).fp
))
1930 static void unix_detach_fds(struct scm_cookie
*scm
, struct sk_buff
*skb
)
1932 scm
->fp
= UNIXCB(skb
).fp
;
1933 UNIXCB(skb
).fp
= NULL
;
1935 unix_destroy_fpl(scm
->fp
);
1938 static void unix_peek_fds(struct scm_cookie
*scm
, struct sk_buff
*skb
)
1940 scm
->fp
= scm_fp_dup(UNIXCB(skb
).fp
);
1943 static void unix_destruct_scm(struct sk_buff
*skb
)
1945 struct scm_cookie scm
;
1947 memset(&scm
, 0, sizeof(scm
));
1948 scm
.pid
= UNIXCB(skb
).pid
;
1950 unix_detach_fds(&scm
, skb
);
1952 /* Alas, it calls VFS */
1953 /* So fscking what? fput() had been SMP-safe since the last Summer */
1958 static int unix_scm_to_skb(struct scm_cookie
*scm
, struct sk_buff
*skb
, bool send_fds
)
1962 UNIXCB(skb
).pid
= get_pid(scm
->pid
);
1963 UNIXCB(skb
).uid
= scm
->creds
.uid
;
1964 UNIXCB(skb
).gid
= scm
->creds
.gid
;
1965 UNIXCB(skb
).fp
= NULL
;
1966 unix_get_secdata(scm
, skb
);
1967 if (scm
->fp
&& send_fds
)
1968 err
= unix_attach_fds(scm
, skb
);
1970 skb
->destructor
= unix_destruct_scm
;
/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
1979 static void unix_maybe_add_creds(struct sk_buff
*skb
, const struct sock
*sk
,
1980 const struct sock
*other
)
1982 if (UNIXCB(skb
).pid
)
1985 if (unix_may_passcred(sk
) || unix_may_passcred(other
) ||
1986 !other
->sk_socket
) {
1987 UNIXCB(skb
).pid
= get_pid(task_tgid(current
));
1988 current_uid_gid(&UNIXCB(skb
).uid
, &UNIXCB(skb
).gid
);
1992 static bool unix_skb_scm_eq(struct sk_buff
*skb
,
1993 struct scm_cookie
*scm
)
1995 return UNIXCB(skb
).pid
== scm
->pid
&&
1996 uid_eq(UNIXCB(skb
).uid
, scm
->creds
.uid
) &&
1997 gid_eq(UNIXCB(skb
).gid
, scm
->creds
.gid
) &&
1998 unix_secdata_eq(scm
, skb
);
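/* Userspace counterpart of the credential handling above (illustrative
 * sketch; fd is a made-up name): with SO_PASSCRED enabled, recvmsg() carries
 * an SCM_CREDENTIALS control message describing the sender, and
 * unix_skb_scm_eq() is used on the receive path to keep a single stream read
 * from spanning messages sent with different credentials.
 *
 *	int on = 1;
 *	char data[128], cbuf[CMSG_SPACE(sizeof(struct ucred))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_SOCKET &&
 *		    cmsg->cmsg_type == SCM_CREDENTIALS)
 *			break;	// CMSG_DATA(cmsg) is a struct ucred {pid, uid, gid}
 */
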
2001 static void scm_stat_add(struct sock
*sk
, struct sk_buff
*skb
)
2003 struct scm_fp_list
*fp
= UNIXCB(skb
).fp
;
2004 struct unix_sock
*u
= unix_sk(sk
);
2006 if (unlikely(fp
&& fp
->count
)) {
2007 atomic_add(fp
->count
, &u
->scm_stat
.nr_fds
);
2008 unix_add_edges(fp
, u
);
2012 static void scm_stat_del(struct sock
*sk
, struct sk_buff
*skb
)
2014 struct scm_fp_list
*fp
= UNIXCB(skb
).fp
;
2015 struct unix_sock
*u
= unix_sk(sk
);
2017 if (unlikely(fp
&& fp
->count
)) {
2018 atomic_sub(fp
->count
, &u
->scm_stat
.nr_fds
);
2024 * Send AF_UNIX data.
2027 static int unix_dgram_sendmsg(struct socket
*sock
, struct msghdr
*msg
,
2030 struct sock
*sk
= sock
->sk
, *other
= NULL
;
2031 struct unix_sock
*u
= unix_sk(sk
);
2032 struct scm_cookie scm
;
2033 struct sk_buff
*skb
;
2039 err
= scm_send(sock
, msg
, &scm
, false);
2043 wait_for_unix_gc(scm
.fp
);
2045 if (msg
->msg_flags
& MSG_OOB
) {
2050 if (msg
->msg_namelen
) {
2051 err
= unix_validate_addr(msg
->msg_name
, msg
->msg_namelen
);
2055 err
= BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk
,
2063 if (unix_may_passcred(sk
) && !READ_ONCE(u
->addr
)) {
2064 err
= unix_autobind(sk
);
2069 if (len
> READ_ONCE(sk
->sk_sndbuf
) - 32) {
2074 if (len
> SKB_MAX_ALLOC
) {
2075 data_len
= min_t(size_t,
2076 len
- SKB_MAX_ALLOC
,
2077 MAX_SKB_FRAGS
* PAGE_SIZE
);
2078 data_len
= PAGE_ALIGN(data_len
);
2080 BUILD_BUG_ON(SKB_MAX_ALLOC
< PAGE_SIZE
);
2083 skb
= sock_alloc_send_pskb(sk
, len
- data_len
, data_len
,
2084 msg
->msg_flags
& MSG_DONTWAIT
, &err
,
2085 PAGE_ALLOC_COSTLY_ORDER
);
2089 err
= unix_scm_to_skb(&scm
, skb
, true);
2093 skb_put(skb
, len
- data_len
);
2094 skb
->data_len
= data_len
;
2096 err
= skb_copy_datagram_from_iter(skb
, 0, &msg
->msg_iter
, len
);
2100 timeo
= sock_sndtimeo(sk
, msg
->msg_flags
& MSG_DONTWAIT
);
2102 if (msg
->msg_namelen
) {
2104 other
= unix_find_other(sock_net(sk
), msg
->msg_name
,
2105 msg
->msg_namelen
, sk
->sk_type
, 0);
2106 if (IS_ERR(other
)) {
2107 err
= PTR_ERR(other
);
2111 other
= unix_peer_get(sk
);
2118 if (sk_filter(other
, skb
) < 0) {
2119 /* Toss the packet but do not return any error to the sender */
2126 unix_state_lock(other
);
2129 if (!unix_may_send(sk
, other
)) {
2134 if (unlikely(sock_flag(other
, SOCK_DEAD
))) {
2135 /* Check with 1003.1g - what should datagram error */
2137 unix_state_unlock(other
);
2139 if (sk
->sk_type
== SOCK_SEQPACKET
) {
2140 /* We are here only when racing with unix_release_sock()
2141 * is clearing @other. Never change state to TCP_CLOSE
2142 * unlike SOCK_DGRAM wants.
2149 unix_state_lock(sk
);
2151 if (unix_peer(sk
) == other
) {
2152 unix_peer(sk
) = NULL
;
2153 unix_dgram_peer_wake_disconnect_wakeup(sk
, other
);
2155 WRITE_ONCE(sk
->sk_state
, TCP_CLOSE
);
2156 unix_state_unlock(sk
);
2158 unix_dgram_disconnected(sk
, other
);
2160 err
= -ECONNREFUSED
;
2164 unix_state_unlock(sk
);
2166 if (!msg
->msg_namelen
) {
2175 if (other
->sk_shutdown
& RCV_SHUTDOWN
) {
2180 if (UNIXCB(skb
).fp
&& !other
->sk_scm_rights
) {
2185 if (sk
->sk_type
!= SOCK_SEQPACKET
) {
2186 err
= security_unix_may_send(sk
->sk_socket
, other
->sk_socket
);
2191 /* other == sk && unix_peer(other) != sk if
2192 * - unix_peer(sk) == NULL, destination address bound to sk
2193 * - unix_peer(sk) == sk by time of get but disconnected before lock
2196 unlikely(unix_peer(other
) != sk
&&
2197 unix_recvq_full_lockless(other
))) {
2199 timeo
= unix_wait_for_peer(other
, timeo
);
2201 err
= sock_intr_errno(timeo
);
2202 if (signal_pending(current
))
2209 unix_state_unlock(other
);
2210 unix_state_double_lock(sk
, other
);
2213 if (unix_peer(sk
) != other
||
2214 unix_dgram_peer_wake_me(sk
, other
)) {
2222 goto restart_locked
;
2226 if (unlikely(sk_locked
))
2227 unix_state_unlock(sk
);
2229 if (sock_flag(other
, SOCK_RCVTSTAMP
))
2230 __net_timestamp(skb
);
2232 unix_maybe_add_creds(skb
, sk
, other
);
2233 scm_stat_add(other
, skb
);
2234 skb_queue_tail(&other
->sk_receive_queue
, skb
);
2235 unix_state_unlock(other
);
2236 other
->sk_data_ready(other
);
2243 unix_state_unlock(sk
);
2244 unix_state_unlock(other
);
2254 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2255 * bytes, and a minimum of a full page.
2257 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
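/* Worked example: with 4 KiB pages, get_order(32768) == 3, so
 * UNIX_SKB_FRAGS_SZ == PAGE_SIZE << 3 == 32768 bytes; with 64 KiB pages,
 * get_order(32768) == 0 and the limit is one 64 KiB page.  Either way the
 * cap is at least a full page and at least 32768 bytes.
 */
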
2259 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
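/* Illustrative userspace sketch of the out-of-band path implemented below
 * (fd and peer_fd are made-up names): the last byte of a MSG_OOB send is
 * marked urgent, the receiver can be signalled with SIGURG (given F_SETOWN),
 * and that byte is read ahead of the ordinary data with MSG_OOB.
 *
 *	// sender
 *	send(fd, "x", 1, MSG_OOB);
 *
 *	// receiver
 *	char c;
 *	if (recv(peer_fd, &c, 1, MSG_OOB) == 1)
 *		;	// c == 'x'; in-band data stays in order
 */
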
2260 static int queue_oob(struct sock
*sk
, struct msghdr
*msg
, struct sock
*other
,
2261 struct scm_cookie
*scm
, bool fds_sent
)
2263 struct unix_sock
*ousk
= unix_sk(other
);
2264 struct sk_buff
*skb
;
2267 skb
= sock_alloc_send_skb(sk
, 1, msg
->msg_flags
& MSG_DONTWAIT
, &err
);
2272 err
= unix_scm_to_skb(scm
, skb
, !fds_sent
);
2277 err
= skb_copy_datagram_from_iter(skb
, 0, &msg
->msg_iter
, 1);
2282 unix_state_lock(other
);
2284 if (sock_flag(other
, SOCK_DEAD
) ||
2285 (other
->sk_shutdown
& RCV_SHUTDOWN
)) {
2290 if (UNIXCB(skb
).fp
&& !other
->sk_scm_rights
) {
2295 unix_maybe_add_creds(skb
, sk
, other
);
2296 scm_stat_add(other
, skb
);
2298 spin_lock(&other
->sk_receive_queue
.lock
);
2299 WRITE_ONCE(ousk
->oob_skb
, skb
);
2300 __skb_queue_tail(&other
->sk_receive_queue
, skb
);
2301 spin_unlock(&other
->sk_receive_queue
.lock
);
2303 sk_send_sigurg(other
);
2304 unix_state_unlock(other
);
2305 other
->sk_data_ready(other
);
2309 unix_state_unlock(other
);
2316 static int unix_stream_sendmsg(struct socket
*sock
, struct msghdr
*msg
,
2319 struct sock
*sk
= sock
->sk
;
2320 struct sk_buff
*skb
= NULL
;
2321 struct sock
*other
= NULL
;
2322 struct scm_cookie scm
;
2323 bool fds_sent
= false;
2326 err
= scm_send(sock
, msg
, &scm
, false);
2330 wait_for_unix_gc(scm
.fp
);
2332 if (msg
->msg_flags
& MSG_OOB
) {
2334 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2342 if (msg
->msg_namelen
) {
2343 err
= READ_ONCE(sk
->sk_state
) == TCP_ESTABLISHED
? -EISCONN
: -EOPNOTSUPP
;
2346 other
= unix_peer(sk
);
2353 if (READ_ONCE(sk
->sk_shutdown
) & SEND_SHUTDOWN
)
2356 while (sent
< len
) {
2357 int size
= len
- sent
;
2360 if (unlikely(msg
->msg_flags
& MSG_SPLICE_PAGES
)) {
2361 skb
= sock_alloc_send_pskb(sk
, 0, 0,
2362 msg
->msg_flags
& MSG_DONTWAIT
,
2365 /* Keep two messages in the pipe so it schedules better */
2366 size
= min_t(int, size
, (READ_ONCE(sk
->sk_sndbuf
) >> 1) - 64);
2368 /* allow fallback to order-0 allocations */
2369 size
= min_t(int, size
, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ
);
2371 data_len
= max_t(int, 0, size
- SKB_MAX_HEAD(0));
2373 data_len
= min_t(size_t, size
, PAGE_ALIGN(data_len
));
2375 skb
= sock_alloc_send_pskb(sk
, size
- data_len
, data_len
,
2376 msg
->msg_flags
& MSG_DONTWAIT
, &err
,
2377 get_order(UNIX_SKB_FRAGS_SZ
));
2382 /* Only send the fds in the first buffer */
2383 err
= unix_scm_to_skb(&scm
, skb
, !fds_sent
);
2389 if (unlikely(msg
->msg_flags
& MSG_SPLICE_PAGES
)) {
2390 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
2391 err
= skb_splice_from_iter(skb
, &msg
->msg_iter
, size
,
2397 refcount_add(size
, &sk
->sk_wmem_alloc
);
2399 skb_put(skb
, size
- data_len
);
2400 skb
->data_len
= data_len
;
2402 err
= skb_copy_datagram_from_iter(skb
, 0, &msg
->msg_iter
, size
);
2407 unix_state_lock(other
);
2409 if (sock_flag(other
, SOCK_DEAD
) ||
2410 (other
->sk_shutdown
& RCV_SHUTDOWN
))
2411 goto out_pipe_unlock
;
2413 if (UNIXCB(skb
).fp
&& !other
->sk_scm_rights
) {
2414 unix_state_unlock(other
);
2419 unix_maybe_add_creds(skb
, sk
, other
);
2420 scm_stat_add(other
, skb
);
2421 skb_queue_tail(&other
->sk_receive_queue
, skb
);
2422 unix_state_unlock(other
);
2423 other
->sk_data_ready(other
);
2427 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2428 if (msg
->msg_flags
& MSG_OOB
) {
2429 err
= queue_oob(sk
, msg
, other
, &scm
, fds_sent
);
2441 unix_state_unlock(other
);
2443 if (!sent
&& !(msg
->msg_flags
& MSG_NOSIGNAL
))
2444 send_sig(SIGPIPE
, current
, 0);
2450 return sent
? : err
;
2453 static int unix_seqpacket_sendmsg(struct socket
*sock
, struct msghdr
*msg
,
2457 struct sock
*sk
= sock
->sk
;
2459 err
= sock_error(sk
);
2463 if (READ_ONCE(sk
->sk_state
) != TCP_ESTABLISHED
)
2466 if (msg
->msg_namelen
)
2467 msg
->msg_namelen
= 0;
2469 return unix_dgram_sendmsg(sock
, msg
, len
);
2472 static int unix_seqpacket_recvmsg(struct socket
*sock
, struct msghdr
*msg
,
2473 size_t size
, int flags
)
2475 struct sock
*sk
= sock
->sk
;
2477 if (READ_ONCE(sk
->sk_state
) != TCP_ESTABLISHED
)
2480 return unix_dgram_recvmsg(sock
, msg
, size
, flags
);
2483 static void unix_copy_addr(struct msghdr
*msg
, struct sock
*sk
)
2485 struct unix_address
*addr
= smp_load_acquire(&unix_sk(sk
)->addr
);
2488 msg
->msg_namelen
= addr
->len
;
2489 memcpy(msg
->msg_name
, addr
->name
, addr
->len
);
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name) {
		unix_copy_addr(msg, skb->sk);

		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
						      msg->msg_name,
						      &msg->msg_namelen);
	}

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!
		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

	return recv_actor(sk, skb);
}
/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, state);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
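
/* State shared by the stream receive paths.  recvmsg() and splice_read()
 * provide different recv_actor callbacks but reuse the same generic read
 * loop, unix_stream_read_generic().
 */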
struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct sk_buff *oob_skb, *read_skb = NULL;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);
	spin_lock(&sk->sk_receive_queue.lock);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		spin_unlock(&sk->sk_receive_queue.lock);
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	if (!(state->flags & MSG_PEEK)) {
		WRITE_ONCE(u->oob_skb, NULL);

		if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue &&
		    !unix_skb_len(oob_skb->prev)) {
			read_skb = oob_skb->prev;
			__skb_unlink(read_skb, &sk->sk_receive_queue);
		}
	}

	spin_unlock(&sk->sk_receive_queue.lock);
	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

	mutex_unlock(&u->iolock);

	consume_skb(read_skb);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}
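
/* Decide how an out-of-band skb affects the normal stream read.  Depending
 * on MSG_PEEK, SO_OOBINLINE and how much has already been copied, the OOB
 * skb is skipped, consumed or kept as the read boundary; any skb dropped
 * while skipping is freed with SKB_DROP_REASON_UNIX_SKIP_OOB.
 */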
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
	struct unix_sock *u = unix_sk(sk);

	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
		return skb;

	spin_lock(&sk->sk_receive_queue.lock);

	if (!unix_skb_len(skb)) {
		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
			skb = NULL;
		} else if (flags & MSG_PEEK) {
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
		} else {
			read_skb = skb;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			__skb_unlink(read_skb, &sk->sk_receive_queue);
		}

		if (!skb)
			goto unlock;
	}

	if (skb != u->oob_skb)
		goto unlock;

	if (copied) {
		skb = NULL;
	} else if (!(flags & MSG_PEEK)) {
		WRITE_ONCE(u->oob_skb, NULL);

		if (!sock_flag(sk, SOCK_URGINLINE)) {
			__skb_unlink(skb, &sk->sk_receive_queue);
			unread_skb = skb;
			skb = skb_peek(&sk->sk_receive_queue);
		}
	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}

unlock:
	spin_unlock(&sk->sk_receive_queue.lock);

	consume_skb(read_skb);
	kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);

	return skb;
}
#endif
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
		return -ENOTCONN;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
		bool drop = true;

		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD)) {
			unix_state_unlock(sk);
			kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
			return -ECONNRESET;
		}

		spin_lock(&sk->sk_receive_queue.lock);
		if (likely(skb == u->oob_skb)) {
			WRITE_ONCE(u->oob_skb, NULL);
			drop = false;
		}
		spin_unlock(&sk->sk_receive_queue.lock);

		unix_state_unlock(sk);

		if (drop) {
			kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
			return -EAGAIN;
		}
	}
#endif

	return recv_actor(sk, skb);
}
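
/* Generic stream receive loop shared by recvmsg() and splice_read().
 * Messages from different senders are never glued together (see the
 * unix_skb_scm_eq() check), the peek offset is honoured, and SCM_RIGHTS
 * descriptors are detached on a normal read or duplicated on MSG_PEEK.
 */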
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		struct sk_buff *skb, *last;
		int chunk;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb && copied) {
				unix_state_unlock(sk);
				break;
			}
		}
#endif
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (unix_may_passcred(sk)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);

			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
							      state->msg->msg_name,
							      &state->msg->msg_namelen);

			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		chunk = state->recv_actor(skb, skip, chunk, state);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (scm.fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
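
/* recv_actor implementations: unix_stream_read_actor() copies into the
 * user msghdr, unix_stream_splice_actor() feeds a pipe via
 * skb_splice_bits().  Both start at UNIXCB(skb).consumed so a partially
 * read skb resumes where the previous read stopped.
 */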
static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}
int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
			  size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sk->sk_socket,
		.msg = msg,
		.size = size,
		.flags = flags
	};

	return unix_stream_read_generic(&state, true);
}
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_stream_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return unix_stream_read_generic(&state, true);
}
static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}
static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags & SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}
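
/* shutdown(2): record the local shutdown bits and, for connection
 * oriented sockets, mirror them onto the peer (RCV <-> SEND) and wake it
 * up so blocked readers and writers notice.
 */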
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
	    (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		if (prot->unhash)
			prot->unhash(other);
		if (mode & RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode & SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);
long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}
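
/* ioctl(2) handlers: SIOCOUTQ/SIOCINQ report queued byte counts via
 * unix_outq_len()/unix_inq_len(), SIOCUNIXFILE opens the bound path
 * (unix_open_file()), and the OOB query reports whether the read pointer
 * is at the out-of-band mark.
 */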
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK:
		{
			struct unix_sock *u = unix_sk(sk);
			struct sk_buff *skb;
			int answ = 0;

			mutex_lock(&u->iolock);

			skb = skb_peek(&sk->sk_receive_queue);
			if (skb) {
				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
				struct sk_buff *next_skb;

				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);

				if (skb == oob_skb ||
				    (!unix_skb_len(skb) &&
				     (!oob_skb || next_skb == oob_skb)))
					answ = 1;
			}

			mutex_unlock(&u->iolock);

			err = put_user(answ, (int __user *)arg);
		}
		break;
#endif
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk, state))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
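
/* Datagram poll differs from the stream variant: a socket connected to a
 * peer whose receive queue is full is reported as not writable, and
 * unix_dgram_peer_wake_me() arranges a wakeup on the peer's queue instead.
 */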
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk, state);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
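
/* /proc/net/unix iterates the global hash table.  The seq_file position
 * encodes the bucket in the upper bits and the 1-based offset within the
 * bucket in the lower BUCKET_SPACE bits.
 */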
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	unsigned long count = 0;
	struct sock *sk;

	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
	     sk; sk = sk_next(sk)) {
		if (++count == offset)
			break;
	}

	return sk;
}
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

		spin_unlock(&net->unx.table.locks[bucket]);

		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}
static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;

	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	return unix_get_first(seq, pos);
}
static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;

	if (v == SEQ_START_TOKEN)
		return unix_get_first(seq, pos);

	return unix_get_next(seq, v, pos);
}
static void unix_seq_stop(struct seq_file *seq, void *v)
{
	struct sock *sk = v;

	if (sk)
		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
}
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	// under a hash table lock here
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			if (u->addr->name->sun_path[0]) {
				len--;
			} else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
#ifdef CONFIG_BPF_SYSCALL
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};
struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};
static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			      struct unix_sock *unix_sk, uid_t uid)
{
	struct bpf_iter__unix ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.unix_sk = unix_sk;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}
static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
		if (iter->end_sk < iter->max_sk) {
			sock_hold(sk);
			iter->batch[iter->end_sk++] = sk;
		}

		expected++;
	}

	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);

	return expected;
}
static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}
static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
				       unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_unix_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}
static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	return bpf_iter_unix_batch(seq, pos);
}
static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);

	++*pos;

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_unix_batch(seq, pos);

	return sk;
}
static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}
static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)unix_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_unix_put_batch(iter);
}
static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start	= bpf_iter_unix_seq_start,
	.next	= bpf_iter_unix_seq_next,
	.stop	= bpf_iter_unix_seq_stop,
	.show	= bpf_iter_unix_seq_show,
};
#endif /* CONFIG_BPF_SYSCALL */
#endif /* CONFIG_PROC_FS */
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

#define INIT_BATCH_SZ 16
static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_unix_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}
static void bpf_iter_fini_unix(void *priv_data)
{
	struct bpf_unix_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};
static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
			     const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}
static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};
static void __init bpf_iter_register(void)
{
	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
	if (bpf_iter_reg_target(&unix_reg_info))
		pr_warn("Warning: could not register bpf iterator unix\n");
}
#endif
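
/* Module initialisation: register the dgram and stream protos, the PF_UNIX
 * family, the per-netns state and, when enabled, the BPF iterator.
 */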
static int __init af_unix_init(void)
{
	int i, rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
		spin_lock_init(&bsd_socket_locks[i]);
		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
	}

	rc = proto_register(&unix_dgram_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	rc = proto_register(&unix_stream_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		proto_unregister(&unix_dgram_proto);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
	unix_bpf_build_proto();

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif

out:
	return rc;
}
/* Later than subsys_initcall() because we depend on stuff initialised there */
fs_initcall(af_unix_init);