net/unix/af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
  28  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  29  *                                      by the above two patches.
  30  *           Andrea Arcangeli   :       If possible we block in connect(2)
  31  *                                      if the max backlog of the listen socket
  32  *                                      has been reached. This won't break
  33  *                                      old apps and it will avoid a huge number
  34  *                                      of hashed socks (for unix_gc()
  35  *                                      performance reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/filter.h>
  93 #include <linux/termios.h>
  94 #include <linux/sockios.h>
  95 #include <linux/net.h>
  96 #include <linux/in.h>
  97 #include <linux/fs.h>
  98 #include <linux/slab.h>
  99 #include <linux/uaccess.h>
 100 #include <linux/skbuff.h>
 101 #include <linux/netdevice.h>
 102 #include <net/net_namespace.h>
 103 #include <net/sock.h>
 104 #include <net/tcp_states.h>
 105 #include <net/af_unix.h>
 106 #include <linux/proc_fs.h>
 107 #include <linux/seq_file.h>
 108 #include <net/scm.h>
 109 #include <linux/init.h>
 110 #include <linux/poll.h>
 111 #include <linux/rtnetlink.h>
 112 #include <linux/mount.h>
 113 #include <net/checksum.h>
 114 #include <linux/security.h>
 115 #include <linux/splice.h>
 116 #include <linux/freezer.h>
 117 #include <linux/file.h>
 118 #include <linux/btf_ids.h>
 119 #include <linux/bpf-cgroup.h>
 120
 121 #include "scm.h"
 122
 /* Global count of AF_UNIX socks; decremented in unix_sock_destructor(). */
 123 static atomic_long_t unix_nr_socks;
 /* Hash chains of sockets linked through sk_bind_node (see
  * unix_insert_bsd_socket()), indexed by sk->sk_hash / unix_bsd_hash().
  * Chain n is walked and modified under bsd_socket_locks[n].
  */
 124 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
 125 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
 126
 127 /* SMP locking strategy:
 128  *    hash table is protected with spinlock.
 129  *    each socket state is protected by separate spinlock.
 130  */
 131
 132 static unsigned int unix_unbound_hash(struct sock *sk)
 133 {
 134         unsigned long hash = (unsigned long)sk;
 135
 136         hash ^= hash >> 16;
 137         hash ^= hash >> 8;
 138         hash ^= sk->sk_type;
 139
 140         return hash & UNIX_HASH_MOD;
 141 }
 142
 143 static unsigned int unix_bsd_hash(struct inode *i)
 144 {
 145         return i->i_ino & UNIX_HASH_MOD;
 146 }
 147
 148 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 149                                        int addr_len, int type)
 150 {
 151         __wsum csum = csum_partial(sunaddr, addr_len, 0);
 152         unsigned int hash;
 153
 154         hash = (__force unsigned int)csum_fold(csum);
 155         hash ^= hash >> 8;
 156         hash ^= type;
 157
 158         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
 159 }
 160
 161 static void unix_table_double_lock(struct net *net,
 162                                    unsigned int hash1, unsigned int hash2)
 163 {
 164         if (hash1 == hash2) {
 165                 spin_lock(&net->unx.table.locks[hash1]);
 166                 return;
 167         }
 168
 169         if (hash1 > hash2)
 170                 swap(hash1, hash2);
 171
 172         spin_lock(&net->unx.table.locks[hash1]);
 173         spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
 174 }
 175
 176 static void unix_table_double_unlock(struct net *net,
 177                                      unsigned int hash1, unsigned int hash2)
 178 {
 179         if (hash1 == hash2) {
 180                 spin_unlock(&net->unx.table.locks[hash1]);
 181                 return;
 182         }
 183
 184         spin_unlock(&net->unx.table.locks[hash1]);
 185         spin_unlock(&net->unx.table.locks[hash2]);
 186 }
 187
 188 #ifdef CONFIG_SECURITY_NETWORK
 189 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 190 {
 191         UNIXCB(skb).secid = scm->secid;
 192 }
 193
 194 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 195 {
 196         scm->secid = UNIXCB(skb).secid;
 197 }
 198
 199 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 200 {
 201         return (scm->secid == UNIXCB(skb).secid);
 202 }
 203 #else
 204 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 205 { }
 206
 207 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 208 { }
 209
 210 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 211 {
 212         return true;
 213 }
 214 #endif /* CONFIG_SECURITY_NETWORK */
 215
 /* Peer socket pointer of @sk.  Expands to an lvalue, so it is assigned
  * through as well as read (e.g. "unix_peer(sk) = NULL" in
  * unix_release_sock()); it may be NULL.
  */
 216 #define unix_peer(sk) (unix_sk(sk)->peer)
 217
 /* Non-zero when the peer recorded on @osk is @sk. */
 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 {
         return sk == unix_peer(osk);
 }
 222
 223 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 224 {
 225         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 226 }
 227
 228 static inline int unix_recvq_full(const struct sock *sk)
 229 {
 230         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 231 }
 232
 233 static inline int unix_recvq_full_lockless(const struct sock *sk)
 234 {
 235         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 236                 READ_ONCE(sk->sk_max_ack_backlog);
 237 }
 238
 239 struct sock *unix_peer_get(struct sock *s)
 240 {
 241         struct sock *peer;
 242
 243         unix_state_lock(s);
 244         peer = unix_peer(s);
 245         if (peer)
 246                 sock_hold(peer);
 247         unix_state_unlock(s);
 248         return peer;
 249 }
 250 EXPORT_SYMBOL_GPL(unix_peer_get);
 251
 252 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
 253                                              int addr_len)
 254 {
 255         struct unix_address *addr;
 256
 257         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
 258         if (!addr)
 259                 return NULL;
 260
 261         refcount_set(&addr->refcnt, 1);
 262         addr->len = addr_len;
 263         memcpy(addr->name, sunaddr, addr_len);
 264
 265         return addr;
 266 }
 267
 268 static inline void unix_release_addr(struct unix_address *addr)
 269 {
 270         if (refcount_dec_and_test(&addr->refcnt))
 271                 kfree(addr);
 272 }
 273
 /*
  *      Sanity-check a unix socket address before it is used:
  *              - @addr_len must cover more than just the sun_family header
  *                and must not exceed sizeof(struct sockaddr_un).
  *              - sun_family must be AF_UNIX.
  *      Returns 0 when both hold, -EINVAL otherwise.  The contents of
  *      sun_path are not examined here.
  */

 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
 {
         int len_in_range = addr_len > offsetof(struct sockaddr_un, sun_path) &&
                            addr_len <= sizeof(*sunaddr);

         /* sun_family is only looked at once the length is known to be sane */
         if (!len_in_range || sunaddr->sun_family != AF_UNIX)
                 return -EINVAL;

         return 0;
 }
 292
 293 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
 294 {
 295         struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
 296         short offset = offsetof(struct sockaddr_storage, __data);
 297
 298         BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
 299
 300         /* This may look like an off by one error but it is a bit more
 301          * subtle.  108 is the longest valid AF_UNIX path for a binding.
 302          * sun_path[108] doesn't as such exist.  However in kernel space
 303          * we are guaranteed that it is a valid memory location in our
 304          * kernel address buffer because syscall functions always pass
 305          * a pointer of struct sockaddr_storage which has a bigger buffer
 306          * than 108.  Also, we must terminate sun_path for strlen() in
 307          * getname_kernel().
 308          */
 309         addr->__data[addr_len - offset] = 0;
 310
 311         /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
 312          * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
 313          * know the actual buffer.
 314          */
 315         return strlen(addr->__data) + offset + 1;
 316 }
 317
 /* Unlink @sk from its hash chain and reinitialise its node.  No lock is
  * taken here; unix_remove_socket() wraps this call in the bucket lock.
  */
 static void __unix_remove_socket(struct sock *sk)
 {
         sk_del_node_init(sk);
 }
 322
 323 static void __unix_insert_socket(struct net *net, struct sock *sk)
 324 {
 325         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 326         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
 327 }
 328
 329 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
 330                                  struct unix_address *addr, unsigned int hash)
 331 {
 332         __unix_remove_socket(sk);
 333         smp_store_release(&unix_sk(sk)->addr, addr);
 334
 335         sk->sk_hash = hash;
 336         __unix_insert_socket(net, sk);
 337 }
 338
 339 static void unix_remove_socket(struct net *net, struct sock *sk)
 340 {
 341         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 342         __unix_remove_socket(sk);
 343         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 344 }
 345
 346 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
 347 {
 348         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 349         __unix_insert_socket(net, sk);
 350         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 351 }
 352
 353 static void unix_insert_bsd_socket(struct sock *sk)
 354 {
 355         spin_lock(&bsd_socket_locks[sk->sk_hash]);
 356         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
 357         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 358 }
 359
 360 static void unix_remove_bsd_socket(struct sock *sk)
 361 {
 362         if (!hlist_unhashed(&sk->sk_bind_node)) {
 363                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
 364                 __sk_del_bind_node(sk);
 365                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 366
 367                 sk_node_init(&sk->sk_bind_node);
 368         }
 369 }
 370
 371 static struct sock *__unix_find_socket_byname(struct net *net,
 372                                               struct sockaddr_un *sunname,
 373                                               int len, unsigned int hash)
 374 {
 375         struct sock *s;
 376
 377         sk_for_each(s, &net->unx.table.buckets[hash]) {
 378                 struct unix_sock *u = unix_sk(s);
 379
 380                 if (u->addr->len == len &&
 381                     !memcmp(u->addr->name, sunname, len))
 382                         return s;
 383         }
 384         return NULL;
 385 }
 386
 387 static inline struct sock *unix_find_socket_byname(struct net *net,
 388                                                    struct sockaddr_un *sunname,
 389                                                    int len, unsigned int hash)
 390 {
 391         struct sock *s;
 392
 393         spin_lock(&net->unx.table.locks[hash]);
 394         s = __unix_find_socket_byname(net, sunname, len, hash);
 395         if (s)
 396                 sock_hold(s);
 397         spin_unlock(&net->unx.table.locks[hash]);
 398         return s;
 399 }
 400
 401 static struct sock *unix_find_socket_byinode(struct inode *i)
 402 {
 403         unsigned int hash = unix_bsd_hash(i);
 404         struct sock *s;
 405
 406         spin_lock(&bsd_socket_locks[hash]);
 407         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
 408                 struct dentry *dentry = unix_sk(s)->path.dentry;
 409
 410                 if (dentry && d_backing_inode(dentry) == i) {
 411                         sock_hold(s);
 412                         spin_unlock(&bsd_socket_locks[hash]);
 413                         return s;
 414                 }
 415         }
 416         spin_unlock(&bsd_socket_locks[hash]);
 417         return NULL;
 418 }
 419
 420 /* Support code for asymmetrically connected dgram sockets
 421  *
 422  * If a datagram socket is connected to a socket not itself connected
 423  * to the first socket (eg, /dev/log), clients may only enqueue more
 424  * messages if the present receive queue of the server socket is not
 425  * "too large". This means there's a second writeability condition
 426  * poll and sendmsg need to test. The dgram recv code will do a wake
 427  * up on the peer_wait wait queue of a socket upon reception of a
 428  * datagram which needs to be propagated to sleeping would-be writers
 429  * since these might not have sent anything so far. This can't be
 430  * accomplished via poll_wait because the lifetime of the server
 431  * socket might be less than that of its clients if these break their
 432  * association with it or if the server socket is closed while clients
 433  * are still connected to it and there's no way to inform "a polling
 434  * implementation" that it should let go of a certain wait queue
 435  *
 436  * In order to propagate a wake up, a wait_queue_entry_t of the client
 437  * socket is enqueued on the peer_wait queue of the server socket
 438  * whose wake function does a wake_up on the ordinary client socket
 439  * wait queue. This connection is established whenever a write (or
 440  * poll for write) hit the flow control condition and broken when the
 441  * association to the server socket is dissolved or after a wake up
 442  * was relayed.
 443  */
 444
 445 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 446                                       void *key)
 447 {
 448         struct unix_sock *u;
 449         wait_queue_head_t *u_sleep;
 450
 451         u = container_of(q, struct unix_sock, peer_wake);
 452
 453         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 454                             q);
 455         u->peer_wake.private = NULL;
 456
 457         /* relaying can only happen while the wq still exists */
 458         u_sleep = sk_sleep(&u->sk);
 459         if (u_sleep)
 460                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 461
 462         return 0;
 463 }
 464
 465 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 466 {
 467         struct unix_sock *u, *u_other;
 468         int rc;
 469
 470         u = unix_sk(sk);
 471         u_other = unix_sk(other);
 472         rc = 0;
 473         spin_lock(&u_other->peer_wait.lock);
 474
 475         if (!u->peer_wake.private) {
 476                 u->peer_wake.private = other;
 477                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 478
 479                 rc = 1;
 480         }
 481
 482         spin_unlock(&u_other->peer_wait.lock);
 483         return rc;
 484 }
 485
 486 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 487                                             struct sock *other)
 488 {
 489         struct unix_sock *u, *u_other;
 490
 491         u = unix_sk(sk);
 492         u_other = unix_sk(other);
 493         spin_lock(&u_other->peer_wait.lock);
 494
 495         if (u->peer_wake.private == other) {
 496                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 497                 u->peer_wake.private = NULL;
 498         }
 499
 500         spin_unlock(&u_other->peer_wait.lock);
 501 }
 502
 503 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 504                                                    struct sock *other)
 505 {
 506         unix_dgram_peer_wake_disconnect(sk, other);
 507         wake_up_interruptible_poll(sk_sleep(sk),
 508                                    EPOLLOUT |
 509                                    EPOLLWRNORM |
 510                                    EPOLLWRBAND);
 511 }
 512
 513 /* preconditions:
 514  *      - unix_peer(sk) == other
 515  *      - association is stable
 516  */
 517 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 518 {
 519         int connected;
 520
 521         connected = unix_dgram_peer_wake_connect(sk, other);
 522
 523         /* If other is SOCK_DEAD, we want to make sure we signal
 524          * POLLOUT, such that a subsequent write() can get a
 525          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 526          * to other and its full, we will hang waiting for POLLOUT.
 527          */
 528         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
 529                 return 1;
 530
 531         if (connected)
 532                 unix_dgram_peer_wake_disconnect(sk, other);
 533
 534         return 0;
 535 }
 536
 537 static int unix_writable(const struct sock *sk)
 538 {
 539         return sk->sk_state != TCP_LISTEN &&
 540                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 541 }
 542
 543 static void unix_write_space(struct sock *sk)
 544 {
 545         struct socket_wq *wq;
 546
 547         rcu_read_lock();
 548         if (unix_writable(sk)) {
 549                 wq = rcu_dereference(sk->sk_wq);
 550                 if (skwq_has_sleeper(wq))
 551                         wake_up_interruptible_sync_poll(&wq->wait,
 552                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 553                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 554         }
 555         rcu_read_unlock();
 556 }
 557
 558 /* When dgram socket disconnects (or changes its peer), we clear its receive
 559  * queue of packets arrived from previous peer. First, it allows to do
 560  * flow control based only on wmem_alloc; second, sk connected to peer
 561  * may receive messages only from that peer. */
 562 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 563 {
 564         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 565                 skb_queue_purge(&sk->sk_receive_queue);
 566                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 567
 568                 /* If one link of bidirectional dgram pipe is disconnected,
 569                  * we signal error. Messages are lost. Do not make this,
 570                  * when peer was not connected to us.
 571                  */
 572                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 573                         WRITE_ONCE(other->sk_err, ECONNRESET);
 574                         sk_error_report(other);
 575                 }
 576         }
 577         other->sk_state = TCP_CLOSE;
 578 }
 579
 580 static void unix_sock_destructor(struct sock *sk)
 581 {
 582         struct unix_sock *u = unix_sk(sk);
 583
 584         skb_queue_purge(&sk->sk_receive_queue);
 585
 586         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
 587         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 588         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
 589         if (!sock_flag(sk, SOCK_DEAD)) {
 590                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 591                 return;
 592         }
 593
 594         if (u->addr)
 595                 unix_release_addr(u->addr);
 596
 597         atomic_long_dec(&unix_nr_socks);
 598         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 599 #ifdef UNIX_REFCNT_DEBUG
 600         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 601                 atomic_long_read(&unix_nr_socks));
 602 #endif
 603 }
 604
 605 static void unix_release_sock(struct sock *sk, int embrion)
 606 {
 607         struct unix_sock *u = unix_sk(sk);
 608         struct sock *skpair;
 609         struct sk_buff *skb;
 610         struct path path;
 611         int state;
 612
 613         unix_remove_socket(sock_net(sk), sk);
 614         unix_remove_bsd_socket(sk);
 615
 616         /* Clear state */
 617         unix_state_lock(sk);
 618         sock_orphan(sk);
 619         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
 620         path         = u->path;
 621         u->path.dentry = NULL;
 622         u->path.mnt = NULL;
 623         state = sk->sk_state;
 624         sk->sk_state = TCP_CLOSE;
 625
 626         skpair = unix_peer(sk);
 627         unix_peer(sk) = NULL;
 628
 629         unix_state_unlock(sk);
 630
 631 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 632         if (u->oob_skb) {
 633                 kfree_skb(u->oob_skb);
 634                 u->oob_skb = NULL;
 635         }
 636 #endif
 637
 638         wake_up_interruptible_all(&u->peer_wait);
 639
 640         if (skpair != NULL) {
 641                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 642                         unix_state_lock(skpair);
 643                         /* No more writes */
 644                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
 645                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 646                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
 647                         unix_state_unlock(skpair);
 648                         skpair->sk_state_change(skpair);
 649                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 650                 }
 651
 652                 unix_dgram_peer_wake_disconnect(sk, skpair);
 653                 sock_put(skpair); /* It may now die */
 654         }
 655
 656         /* Try to flush out this socket. Throw out buffers at least */
 657
 658         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 659                 if (state == TCP_LISTEN)
 660                         unix_release_sock(skb->sk, 1);
 661                 /* passed fds are erased in the kfree_skb hook        */
 662                 UNIXCB(skb).consumed = skb->len;
 663                 kfree_skb(skb);
 664         }
 665
 666         if (path.dentry)
 667                 path_put(&path);
 668
 669         sock_put(sk);
 670
 671         /* ---- Socket is dead now and most probably destroyed ---- */
 672
 673         /*
 674          * Fixme: BSD difference: In BSD all sockets connected to us get
 675          *        ECONNRESET and we die on the spot. In Linux we behave
 676          *        like files and pipes do and wait for the last
 677          *        dereference.
 678          *
 679          * Can't we simply set sock->err?
 680          *
 681          *        What the above comment does talk about? --ANK(980817)
 682          */
 683
 684         if (READ_ONCE(unix_tot_inflight))
 685                 unix_gc();              /* Garbage collect fds */
 686 }
 687
 688 static void init_peercred(struct sock *sk)
 689 {
 690         const struct cred *old_cred;
 691         struct pid *old_pid;
 692
 693         spin_lock(&sk->sk_peer_lock);
 694         old_pid = sk->sk_peer_pid;
 695         old_cred = sk->sk_peer_cred;
 696         sk->sk_peer_pid  = get_pid(task_tgid(current));
 697         sk->sk_peer_cred = get_current_cred();
 698         spin_unlock(&sk->sk_peer_lock);
 699
 700         put_pid(old_pid);
 701         put_cred(old_cred);
 702 }
 703
 704 static void copy_peercred(struct sock *sk, struct sock *peersk)
 705 {
 706         const struct cred *old_cred;
 707         struct pid *old_pid;
 708
 709         if (sk < peersk) {
 710                 spin_lock(&sk->sk_peer_lock);
 711                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 712         } else {
 713                 spin_lock(&peersk->sk_peer_lock);
 714                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 715         }
 716         old_pid = sk->sk_peer_pid;
 717         old_cred = sk->sk_peer_cred;
 718         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 719         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 720
 721         spin_unlock(&sk->sk_peer_lock);
 722         spin_unlock(&peersk->sk_peer_lock);
 723
 724         put_pid(old_pid);
 725         put_cred(old_cred);
 726 }
 727
 728 static int unix_listen(struct socket *sock, int backlog)
 729 {
 730         int err;
 731         struct sock *sk = sock->sk;
 732         struct unix_sock *u = unix_sk(sk);
 733
 734         err = -EOPNOTSUPP;
 735         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 736                 goto out;       /* Only stream/seqpacket sockets accept */
 737         err = -EINVAL;
 738         if (!u->addr)
 739                 goto out;       /* No listens on an unbound socket */
 740         unix_state_lock(sk);
 741         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 742                 goto out_unlock;
 743         if (backlog > sk->sk_max_ack_backlog)
 744                 wake_up_interruptible_all(&u->peer_wait);
 745         sk->sk_max_ack_backlog  = backlog;
 746         sk->sk_state            = TCP_LISTEN;
 747         /* set credentials so connect can copy them */
 748         init_peercred(sk);
 749         err = 0;
 750
 751 out_unlock:
 752         unix_state_unlock(sk);
 753 out:
 754         return err;
 755 }
 756
 /* Forward declarations of the socket operation handlers.  They are needed
  * because the proto_ops tables that follow (starting with unix_stream_ops)
  * take the addresses of these functions.  unix_compat_ioctl only exists
  * when CONFIG_COMPAT is set.
  */
 757 static int unix_release(struct socket *);
 758 static int unix_bind(struct socket *, struct sockaddr *, int);
 759 static int unix_stream_connect(struct socket *, struct sockaddr *,
 760                                int addr_len, int flags);
 761 static int unix_socketpair(struct socket *, struct socket *);
 762 static int unix_accept(struct socket *, struct socket *, int, bool);
 763 static int unix_getname(struct socket *, struct sockaddr *, int);
 764 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
 765 static __poll_t unix_dgram_poll(struct file *, struct socket *,
 766                                     poll_table *);
 767 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 768 #ifdef CONFIG_COMPAT
 769 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 770 #endif
 771 static int unix_shutdown(struct socket *, int);
 772 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 773 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 774 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 775                                        struct pipe_inode_info *, size_t size,
 776                                        unsigned int flags);
 777 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 778 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 779 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 780 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 781 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 782                               int, int);
 783 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 784 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 785                                   int);
 786
 787 static int unix_set_peek_off(struct sock *sk, int val)
 788 {
 789         struct unix_sock *u = unix_sk(sk);
 790
 791         if (mutex_lock_interruptible(&u->iolock))
 792                 return -EINTR;
 793
 794         WRITE_ONCE(sk->sk_peek_off, val);
 795         mutex_unlock(&u->iolock);
 796
 797         return 0;
 798 }
 799
 #ifdef CONFIG_PROC_FS
 /* Add up scm_stat.nr_fds of the socket owning each skb queued on @sk's
  * receive queue.  The queue is walked under its own lock.
  */
 static int unix_count_nr_fds(struct sock *sk)
 {
         struct sk_buff_head *queue = &sk->sk_receive_queue;
         struct sk_buff *skb;
         int total = 0;

         spin_lock(&queue->lock);
         for (skb = skb_peek(queue); skb; skb = skb_peek_next(skb, queue))
                 total += atomic_read(&unix_sk(skb->sk)->scm_stat.nr_fds);
         spin_unlock(&queue->lock);

         return total;
 }

 /* Print the "scm_fds:" line for @sock into @m.  Nothing is printed when
  * the socket has no sock attached.
  */
 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 {
         struct sock *sk = sock->sk;
         unsigned char s_state;
         int nr_fds = 0;

         if (!sk)
                 return;

         s_state = READ_ONCE(sk->sk_state);

         /* No lock is needed: SOCK_STREAM and SOCK_SEQPACKET sockets never
          * change sk_state again once they are TCP_ESTABLISHED or
          * TCP_LISTEN, and SOCK_DGRAM is the ordinary case.
          */
         if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
                 nr_fds = atomic_read(&unix_sk(sk)->scm_stat.nr_fds);
         else if (s_state == TCP_LISTEN)
                 nr_fds = unix_count_nr_fds(sk);

         seq_printf(m, "scm_fds: %u\n", nr_fds);
 }
 #else
 #define unix_show_fdinfo NULL
 #endif
 845
 846 static const struct proto_ops unix_stream_ops = {
 847         .family =       PF_UNIX,
 848         .owner =        THIS_MODULE,
 849         .release =      unix_release,
 850         .bind =         unix_bind,
 851         .connect =      unix_stream_connect,
 852         .socketpair =   unix_socketpair,
 853         .accept =       unix_accept,
 854         .getname =      unix_getname,
 855         .poll =         unix_poll,
 856         .ioctl =        unix_ioctl,
 857 #ifdef CONFIG_COMPAT
 858         .compat_ioctl = unix_compat_ioctl,
 859 #endif
 860         .listen =       unix_listen,
 861         .shutdown =     unix_shutdown,
 862         .sendmsg =      unix_stream_sendmsg,
 863         .recvmsg =      unix_stream_recvmsg,
 864         .read_skb =     unix_stream_read_skb,
 865         .mmap =         sock_no_mmap,
 866         .splice_read =  unix_stream_splice_read,
 867         .set_peek_off = unix_set_peek_off,
 868         .show_fdinfo =  unix_show_fdinfo,
 869 };
 870
 871 static const struct proto_ops unix_dgram_ops = {
 872         .family =       PF_UNIX,
 873         .owner =        THIS_MODULE,
 874         .release =      unix_release,
 875         .bind =         unix_bind,
 876         .connect =      unix_dgram_connect,
 877         .socketpair =   unix_socketpair,
 878         .accept =       sock_no_accept,
 879         .getname =      unix_getname,
 880         .poll =         unix_dgram_poll,
 881         .ioctl =        unix_ioctl,
 882 #ifdef CONFIG_COMPAT
 883         .compat_ioctl = unix_compat_ioctl,
 884 #endif
 885         .listen =       sock_no_listen,
 886         .shutdown =     unix_shutdown,
 887         .sendmsg =      unix_dgram_sendmsg,
 888         .read_skb =     unix_read_skb,
 889         .recvmsg =      unix_dgram_recvmsg,
 890         .mmap =         sock_no_mmap,
 891         .set_peek_off = unix_set_peek_off,
 892         .show_fdinfo =  unix_show_fdinfo,
 893 };
 894
 895 static const struct proto_ops unix_seqpacket_ops = {
 896         .family =       PF_UNIX,
 897         .owner =        THIS_MODULE,
 898         .release =      unix_release,
 899         .bind =         unix_bind,
 900         .connect =      unix_stream_connect,
 901         .socketpair =   unix_socketpair,
 902         .accept =       unix_accept,
 903         .getname =      unix_getname,
 904         .poll =         unix_dgram_poll,
 905         .ioctl =        unix_ioctl,
 906 #ifdef CONFIG_COMPAT
 907         .compat_ioctl = unix_compat_ioctl,
 908 #endif
 909         .listen =       unix_listen,
 910         .shutdown =     unix_shutdown,
 911         .sendmsg =      unix_seqpacket_sendmsg,
 912         .recvmsg =      unix_seqpacket_recvmsg,
 913         .mmap =         sock_no_mmap,
 914         .set_peek_off = unix_set_peek_off,
 915         .show_fdinfo =  unix_show_fdinfo,
 916 };
 917
 918 static void unix_close(struct sock *sk, long timeout)
 919 {
 920         /* Nothing to do here, unix socket does not need a ->close().
 921          * This is merely for sockmap.
 922          */
 923 }
 924
 925 static void unix_unhash(struct sock *sk)
 926 {
 927         /* Nothing to do here, unix socket does not need a ->unhash().
 928          * This is merely for sockmap.
 929          */
 930 }
 931
 932 static bool unix_bpf_bypass_getsockopt(int level, int optname)
 933 {
 934         if (level == SOL_SOCKET) {
 935                 switch (optname) {
 936                 case SO_PEERPIDFD:
 937                         return true;
 938                 default:
 939                         return false;
 940                 }
 941         }
 942
 943         return false;
 944 }
 945
 946 struct proto unix_dgram_proto = {
 947         .name                   = "UNIX",
 948         .owner                  = THIS_MODULE,
 949         .obj_size               = sizeof(struct unix_sock),
 950         .close                  = unix_close,
 951         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 952 #ifdef CONFIG_BPF_SYSCALL
 953         .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
 954 #endif
 955 };
 956
 957 struct proto unix_stream_proto = {
 958         .name                   = "UNIX-STREAM",
 959         .owner                  = THIS_MODULE,
 960         .obj_size               = sizeof(struct unix_sock),
 961         .close                  = unix_close,
 962         .unhash                 = unix_unhash,
 963         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 964 #ifdef CONFIG_BPF_SYSCALL
 965         .psock_update_sk_prot   = unix_stream_bpf_update_proto,
 966 #endif
 967 };
 968
 969 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
 970 {
 971         struct unix_sock *u;
 972         struct sock *sk;
 973         int err;
 974
 975         atomic_long_inc(&unix_nr_socks);
 976         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
 977                 err = -ENFILE;
 978                 goto err;
 979         }
 980
 981         if (type == SOCK_STREAM)
 982                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
 983         else /*dgram and  seqpacket */
 984                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
 985
 986         if (!sk) {
 987                 err = -ENOMEM;
 988                 goto err;
 989         }
 990
 991         sock_init_data(sock, sk);
 992
 993         sk->sk_hash             = unix_unbound_hash(sk);
 994         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 995         sk->sk_write_space      = unix_write_space;
 996         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 997         sk->sk_destruct         = unix_sock_destructor;
 998         u         = unix_sk(sk);
 999         u->path.dentry = NULL;
1000         u->path.mnt = NULL;
1001         spin_lock_init(&u->lock);
1002         atomic_long_set(&u->inflight, 0);
1003         INIT_LIST_HEAD(&u->link);
1004         mutex_init(&u->iolock); /* single task reading lock */
1005         mutex_init(&u->bindlock); /* single task binding lock */
1006         init_waitqueue_head(&u->peer_wait);
1007         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1008         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1009         unix_insert_unbound_socket(net, sk);
1010
1011         sock_prot_inuse_add(net, sk->sk_prot, 1);
1012
1013         return sk;
1014
1015 err:
1016         atomic_long_dec(&unix_nr_socks);
1017         return ERR_PTR(err);
1018 }
1019
1020 static int unix_create(struct net *net, struct socket *sock, int protocol,
1021                        int kern)
1022 {
1023         struct sock *sk;
1024
1025         if (protocol && protocol != PF_UNIX)
1026                 return -EPROTONOSUPPORT;
1027
1028         sock->state = SS_UNCONNECTED;
1029
1030         switch (sock->type) {
1031         case SOCK_STREAM:
1032                 sock->ops = &unix_stream_ops;
1033                 break;
1034                 /*
1035                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1036                  *      nothing uses it.
1037                  */
1038         case SOCK_RAW:
1039                 sock->type = SOCK_DGRAM;
1040                 fallthrough;
1041         case SOCK_DGRAM:
1042                 sock->ops = &unix_dgram_ops;
1043                 break;
1044         case SOCK_SEQPACKET:
1045                 sock->ops = &unix_seqpacket_ops;
1046                 break;
1047         default:
1048                 return -ESOCKTNOSUPPORT;
1049         }
1050
1051         sk = unix_create1(net, sock, kern, sock->type);
1052         if (IS_ERR(sk))
1053                 return PTR_ERR(sk);
1054
1055         return 0;
1056 }
1057
1058 static int unix_release(struct socket *sock)
1059 {
1060         struct sock *sk = sock->sk;
1061
1062         if (!sk)
1063                 return 0;
1064
1065         sk->sk_prot->close(sk, 0);
1066         unix_release_sock(sk, 0);
1067         sock->sk = NULL;
1068
1069         return 0;
1070 }
1071
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * The path is resolved following symlinks, must be writable by the
 * caller, must name a socket inode, and that inode must map to a sock
 * whose sk_type equals @type.  On success the path's atime is touched.
 *
 * Returns the sock found by unix_find_socket_byinode(), or an ERR_PTR():
 * the kern_path()/path_permission() error, -ECONNREFUSED when the inode
 * is not a socket or no sock is attached to it, -EPROTOTYPE on a type
 * mismatch.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
                                  int type)
{
        struct inode *inode;
        struct sock *found;
        struct path path;
        int err;

        unix_mkname_bsd(sunaddr, addr_len);

        err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
        if (err)
                return ERR_PTR(err);

        err = path_permission(&path, MAY_WRITE);
        if (err)
                goto out_put_path;

        err = -ECONNREFUSED;
        inode = d_backing_inode(path.dentry);
        if (!S_ISSOCK(inode->i_mode))
                goto out_put_path;

        found = unix_find_socket_byinode(inode);
        if (!found)
                goto out_put_path;

        if (found->sk_type != type) {
                err = -EPROTOTYPE;
                sock_put(found);
                goto out_put_path;
        }

        touch_atime(&path);
        path_put(&path);

        return found;

out_put_path:
        path_put(&path);
        return ERR_PTR(err);
}
1115
/*
 * Look up a socket by name through unix_find_socket_byname(), using the
 * hash produced by unix_abstract_hash().
 *
 * If the sock that was found carries a filesystem path, that path's atime
 * is touched.
 *
 * Returns the sock, or ERR_PTR(-ECONNREFUSED) when no socket matches.
 */
static struct sock *unix_find_abstract(struct net *net,
                                       struct sockaddr_un *sunaddr,
                                       int addr_len, int type)
{
        unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
        struct path *path;
        struct sock *sk;

        sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
        if (!sk)
                return ERR_PTR(-ECONNREFUSED);

        path = &unix_sk(sk)->path;
        if (path->dentry)
                touch_atime(path);

        return sk;
}
1134
1135 static struct sock *unix_find_other(struct net *net,
1136                                     struct sockaddr_un *sunaddr,
1137                                     int addr_len, int type)
1138 {
1139         struct sock *sk;
1140
1141         if (sunaddr->sun_path[0])
1142                 sk = unix_find_bsd(sunaddr, addr_len, type);
1143         else
1144                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1145
1146         return sk;
1147 }
1148
1149 static int unix_autobind(struct sock *sk)
1150 {
1151         unsigned int new_hash, old_hash = sk->sk_hash;
1152         struct unix_sock *u = unix_sk(sk);
1153         struct net *net = sock_net(sk);
1154         struct unix_address *addr;
1155         u32 lastnum, ordernum;
1156         int err;
1157
1158         err = mutex_lock_interruptible(&u->bindlock);
1159         if (err)
1160                 return err;
1161
1162         if (u->addr)
1163                 goto out;
1164
1165         err = -ENOMEM;
1166         addr = kzalloc(sizeof(*addr) +
1167                        offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1168         if (!addr)
1169                 goto out;
1170
1171         addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1172         addr->name->sun_family = AF_UNIX;
1173         refcount_set(&addr->refcnt, 1);
1174
1175         ordernum = get_random_u32();
1176         lastnum = ordernum & 0xFFFFF;
1177 retry:
1178         ordernum = (ordernum + 1) & 0xFFFFF;
1179         sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1180
1181         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1182         unix_table_double_lock(net, old_hash, new_hash);
1183
1184         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1185                 unix_table_double_unlock(net, old_hash, new_hash);
1186
1187                 /* __unix_find_socket_byname() may take long time if many names
1188                  * are already in use.
1189                  */
1190                 cond_resched();
1191
1192                 if (ordernum == lastnum) {
1193                         /* Give up if all names seems to be in use. */
1194                         err = -ENOSPC;
1195                         unix_release_addr(addr);
1196                         goto out;
1197                 }
1198
1199                 goto retry;
1200         }
1201
1202         __unix_set_addr_hash(net, sk, addr, new_hash);
1203         unix_table_double_unlock(net, old_hash, new_hash);
1204         err = 0;
1205
1206 out:    mutex_unlock(&u->bindlock);
1207         return err;
1208 }
1209
1210 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1211                          int addr_len)
1212 {
1213         umode_t mode = S_IFSOCK |
1214                (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1215         unsigned int new_hash, old_hash = sk->sk_hash;
1216         struct unix_sock *u = unix_sk(sk);
1217         struct net *net = sock_net(sk);
1218         struct mnt_idmap *idmap;
1219         struct unix_address *addr;
1220         struct dentry *dentry;
1221         struct path parent;
1222         int err;
1223
1224         addr_len = unix_mkname_bsd(sunaddr, addr_len);
1225         addr = unix_create_addr(sunaddr, addr_len);
1226         if (!addr)
1227                 return -ENOMEM;
1228
1229         /*
1230          * Get the parent directory, calculate the hash for last
1231          * component.
1232          */
1233         dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1234         if (IS_ERR(dentry)) {
1235                 err = PTR_ERR(dentry);
1236                 goto out;
1237         }
1238
1239         /*
1240          * All right, let's create it.
1241          */
1242         idmap = mnt_idmap(parent.mnt);
1243         err = security_path_mknod(&parent, dentry, mode, 0);
1244         if (!err)
1245                 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1246         if (err)
1247                 goto out_path;
1248         err = mutex_lock_interruptible(&u->bindlock);
1249         if (err)
1250                 goto out_unlink;
1251         if (u->addr)
1252                 goto out_unlock;
1253
1254         new_hash = unix_bsd_hash(d_backing_inode(dentry));
1255         unix_table_double_lock(net, old_hash, new_hash);
1256         u->path.mnt = mntget(parent.mnt);
1257         u->path.dentry = dget(dentry);
1258         __unix_set_addr_hash(net, sk, addr, new_hash);
1259         unix_table_double_unlock(net, old_hash, new_hash);
1260         unix_insert_bsd_socket(sk);
1261         mutex_unlock(&u->bindlock);
1262         done_path_create(&parent, dentry);
1263         return 0;
1264
1265 out_unlock:
1266         mutex_unlock(&u->bindlock);
1267         err = -EINVAL;
1268 out_unlink:
1269         /* failed after successful mknod?  unlink what we'd created... */
1270         vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1271 out_path:
1272         done_path_create(&parent, dentry);
1273 out:
1274         unix_release_addr(addr);
1275         return err == -EEXIST ? -EADDRINUSE : err;
1276 }
1277
1278 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1279                               int addr_len)
1280 {
1281         unsigned int new_hash, old_hash = sk->sk_hash;
1282         struct unix_sock *u = unix_sk(sk);
1283         struct net *net = sock_net(sk);
1284         struct unix_address *addr;
1285         int err;
1286
1287         addr = unix_create_addr(sunaddr, addr_len);
1288         if (!addr)
1289                 return -ENOMEM;
1290
1291         err = mutex_lock_interruptible(&u->bindlock);
1292         if (err)
1293                 goto out;
1294
1295         if (u->addr) {
1296                 err = -EINVAL;
1297                 goto out_mutex;
1298         }
1299
1300         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1301         unix_table_double_lock(net, old_hash, new_hash);
1302
1303         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1304                 goto out_spin;
1305
1306         __unix_set_addr_hash(net, sk, addr, new_hash);
1307         unix_table_double_unlock(net, old_hash, new_hash);
1308         mutex_unlock(&u->bindlock);
1309         return 0;
1310
1311 out_spin:
1312         unix_table_double_unlock(net, old_hash, new_hash);
1313         err = -EADDRINUSE;
1314 out_mutex:
1315         mutex_unlock(&u->bindlock);
1316 out:
1317         unix_release_addr(addr);
1318         return err;
1319 }
1320
1321 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1322 {
1323         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1324         struct sock *sk = sock->sk;
1325         int err;
1326
1327         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1328             sunaddr->sun_family == AF_UNIX)
1329                 return unix_autobind(sk);
1330
1331         err = unix_validate_addr(sunaddr, addr_len);
1332         if (err)
1333                 return err;
1334
1335         if (sunaddr->sun_path[0])
1336                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1337         else
1338                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1339
1340         return err;
1341 }
1342
/*
 * Take the state locks of @sk1 and @sk2.
 *
 * When @sk2 is NULL or identical to @sk1, only @sk1 is locked.  Otherwise
 * the sock at the lower address is locked first and the other one is
 * taken with the nested variant, so that every caller acquires a given
 * pair in the same order.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
        struct sock *first, *second;

        if (unlikely(sk1 == sk2) || !sk2) {
                unix_state_lock(sk1);
                return;
        }

        if (sk1 < sk2) {
                first = sk1;
                second = sk2;
        } else {
                first = sk2;
                second = sk1;
        }

        unix_state_lock(first);
        unix_state_lock_nested(second);
}
1357
/*
 * Counterpart of unix_state_double_lock(): drop @sk1's state lock, and
 * @sk2's as well unless @sk2 is NULL or the same sock as @sk1.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
        unix_state_unlock(sk1);

        if (sk2 && sk2 != sk1)
                unix_state_unlock(sk2);
}
1367
1368 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1369                               int alen, int flags)
1370 {
1371         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1372         struct sock *sk = sock->sk;
1373         struct sock *other;
1374         int err;
1375
1376         err = -EINVAL;
1377         if (alen < offsetofend(struct sockaddr, sa_family))
1378                 goto out;
1379
1380         if (addr->sa_family != AF_UNSPEC) {
1381                 err = unix_validate_addr(sunaddr, alen);
1382                 if (err)
1383                         goto out;
1384
1385                 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1386                 if (err)
1387                         goto out;
1388
1389                 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1390                      test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1391                     !unix_sk(sk)->addr) {
1392                         err = unix_autobind(sk);
1393                         if (err)
1394                                 goto out;
1395                 }
1396
1397 restart:
1398                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1399                 if (IS_ERR(other)) {
1400                         err = PTR_ERR(other);
1401                         goto out;
1402                 }
1403
1404                 unix_state_double_lock(sk, other);
1405
1406                 /* Apparently VFS overslept socket death. Retry. */
1407                 if (sock_flag(other, SOCK_DEAD)) {
1408                         unix_state_double_unlock(sk, other);
1409                         sock_put(other);
1410                         goto restart;
1411                 }
1412
1413                 err = -EPERM;
1414                 if (!unix_may_send(sk, other))
1415                         goto out_unlock;
1416
1417                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1418                 if (err)
1419                         goto out_unlock;
1420
1421                 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1422         } else {
1423                 /*
1424                  *      1003.1g breaking connected state with AF_UNSPEC
1425                  */
1426                 other = NULL;
1427                 unix_state_double_lock(sk, other);
1428         }
1429
1430         /*
1431          * If it was connected, reconnect.
1432          */
1433         if (unix_peer(sk)) {
1434                 struct sock *old_peer = unix_peer(sk);
1435
1436                 unix_peer(sk) = other;
1437                 if (!other)
1438                         sk->sk_state = TCP_CLOSE;
1439                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1440
1441                 unix_state_double_unlock(sk, other);
1442
1443                 if (other != old_peer)
1444                         unix_dgram_disconnected(sk, old_peer);
1445                 sock_put(old_peer);
1446         } else {
1447                 unix_peer(sk) = other;
1448                 unix_state_double_unlock(sk, other);
1449         }
1450
1451         return 0;
1452
1453 out_unlock:
1454         unix_state_double_unlock(sk, other);
1455         sock_put(other);
1456 out:
1457         return err;
1458 }
1459
1460 static long unix_wait_for_peer(struct sock *other, long timeo)
1461         __releases(&unix_sk(other)->lock)
1462 {
1463         struct unix_sock *u = unix_sk(other);
1464         int sched;
1465         DEFINE_WAIT(wait);
1466
1467         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1468
1469         sched = !sock_flag(other, SOCK_DEAD) &&
1470                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1471                 unix_recvq_full_lockless(other);
1472
1473         unix_state_unlock(other);
1474
1475         if (sched)
1476                 timeo = schedule_timeout(timeo);
1477
1478         finish_wait(&u->peer_wait, &wait);
1479         return timeo;
1480 }
1481
/*
 * ->connect() handler used by unix_stream_ops and unix_seqpacket_ops.
 *
 * Outline:
 *  1. Validate the address, run the cgroup BPF connect hook and autobind
 *     the caller if SOCK_PASSCRED/SOCK_PASSPIDFD is set and it has no
 *     address yet.
 *  2. Pre-allocate the sock for the far end of the connection (newsk)
 *     and a one-byte skb charged to it.
 *  3. Look up the listener (other), lock it, and wait or bail out while
 *     its receive queue is full.
 *  4. Lock sk nested inside other's lock, re-check sk's state, then wire
 *     sk <-> newsk as peers and copy address/path/credentials.
 *  5. Queue the skb on the listener's receive queue and notify it via
 *     sk_data_ready().  unix_accept() later takes the new sock from
 *     skb->sk (presumably set to newsk by sock_wmalloc() - confirm).
 *
 * Returns 0 on success.  Errors include -EAGAIN (listener queue full and
 * no time left to wait), -ECONNREFUSED (target not listening or shut
 * down for receive), -EISCONN / -EINVAL (sk not in TCP_CLOSE), -ENOMEM,
 * and whatever the validation, BPF, autobind, lookup or LSM steps return.
 * On every error path the pre-allocated skb and newsk are released.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                               int addr_len, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
        struct unix_sock *u = unix_sk(sk), *newu, *otheru;
        struct net *net = sock_net(sk);
        struct sk_buff *skb = NULL;
        long timeo;
        int err;
        int st;

        err = unix_validate_addr(sunaddr, addr_len);
        if (err)
                goto out;

        err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
        if (err)
                goto out;

        /* Credential passing was requested but sk is still unnamed. */
        if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
             test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
                err = unix_autobind(sk);
                if (err)
                        goto out;
        }

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /* First of all allocate resources.
         * If we did this after the state is locked, we would have to
         * recheck everything again anyway.
         */

        /* create new sock for complete connection */
        newsk = unix_create1(net, NULL, 0, sock->type);
        if (IS_ERR(newsk)) {
                err = PTR_ERR(newsk);
                /* Keep the cleanup at "out" from releasing an ERR_PTR. */
                newsk = NULL;
                goto out;
        }

        err = -ENOMEM;

        /* Allocate skb for sending to listening sock */
        skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
        if (skb == NULL)
                goto out;

restart:
        /*  Find listening sock. */
        other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
        if (IS_ERR(other)) {
                err = PTR_ERR(other);
                /* Keep the cleanup at "out" from sock_put()ing an ERR_PTR. */
                other = NULL;
                goto out;
        }

        /* Latch state of peer */
        unix_state_lock(other);

        /* Apparently VFS overslept socket death. Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = -ECONNREFUSED;
        if (other->sk_state != TCP_LISTEN)
                goto out_unlock;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (unix_recvq_full(other)) {
                err = -EAGAIN;
                if (!timeo)
                        goto out_unlock;

                /* Drops other's state lock before (possibly) sleeping. */
                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                /* other is unlocked here, hence "out" and not "out_unlock". */
                if (signal_pending(current))
                        goto out;
                sock_put(other);
                goto restart;
        }

        /* Latch our state.
         *
         * This is a tricky place. We need to grab our state lock and
         * cannot drop the lock on the peer. It is dangerous because a
         * deadlock is possible. The connect-to-self case and simultaneous
         * connect attempts are eliminated by checking the socket state:
         * other is TCP_LISTEN, and if sk is TCP_LISTEN we reject it here,
         * before attempting to grab the lock.
         *
         * Well, and we have to recheck the state after the socket is
         * locked.
         */
        st = sk->sk_state;

        switch (st) {
        case TCP_CLOSE:
                /* This is ok... continue with connect */
                break;
        case TCP_ESTABLISHED:
                /* Socket is already connected */
                err = -EISCONN;
                goto out_unlock;
        default:
                err = -EINVAL;
                goto out_unlock;
        }

        unix_state_lock_nested(sk);

        /* sk changed state between the unlocked read and the lock: redo. */
        if (sk->sk_state != st) {
                unix_state_unlock(sk);
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = security_unix_stream_connect(sk, other, newsk);
        if (err) {
                unix_state_unlock(sk);
                goto out_unlock;
        }

        /* The way is open! Quickly set all the necessary fields... */

        /* Reference held on behalf of newsk's peer pointer. */
        sock_hold(sk);
        unix_peer(newsk)        = sk;
        newsk->sk_state         = TCP_ESTABLISHED;
        newsk->sk_type          = sk->sk_type;
        init_peercred(newsk);
        newu = unix_sk(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
        otheru = unix_sk(other);

        /* copy address information from listening to new sock
         *
         * The contents of *(otheru->addr) and otheru->path
         * are seen fully set up here, since we have found
         * otheru in hash under its lock.  Insertion into the
         * hash chain we'd found it in had been done in an
         * earlier critical area protected by the chain's lock,
         * the same one where we'd set *(otheru->addr) contents,
         * as well as otheru->path and otheru->addr itself.
         *
         * Using smp_store_release() here to set newu->addr
         * is enough to make those stores, as well as stores
         * to newu->path visible to anyone who gets newu->addr
         * by smp_load_acquire().  IOW, the same warranties
         * as for unix_sock instances bound in unix_bind() or
         * in unix_autobind().
         */
        if (otheru->path.dentry) {
                path_get(&otheru->path);
                newu->path = otheru->path;
        }
        refcount_inc(&otheru->addr->refcnt);
        smp_store_release(&newu->addr, otheru->addr);

        /* Set credentials */
        copy_peercred(sk, other);

        sock->state     = SS_CONNECTED;
        sk->sk_state    = TCP_ESTABLISHED;
        /* Reference held on behalf of sk's peer pointer. */
        sock_hold(newsk);

        smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
        unix_peer(sk)   = newsk;

        unix_state_unlock(sk);

        /* Hand the connection request over to the listening sock. */
        spin_lock(&other->sk_receive_queue.lock);
        __skb_queue_tail(&other->sk_receive_queue, skb);
        spin_unlock(&other->sk_receive_queue.lock);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        return 0;

out_unlock:
        /* Entered with other's state lock held. */
        if (other)
                unix_state_unlock(other);

out:
        /* Undo the up-front allocations and drop the lookup reference. */
        kfree_skb(skb);
        if (newsk)
                unix_release_sock(newsk, 0);
        if (other)
                sock_put(other);
        return err;
}
1679
1680 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1681 {
1682         struct sock *ska = socka->sk, *skb = sockb->sk;
1683
1684         /* Join our sockets back to back */
1685         sock_hold(ska);
1686         sock_hold(skb);
1687         unix_peer(ska) = skb;
1688         unix_peer(skb) = ska;
1689         init_peercred(ska);
1690         init_peercred(skb);
1691
1692         ska->sk_state = TCP_ESTABLISHED;
1693         skb->sk_state = TCP_ESTABLISHED;
1694         socka->state  = SS_CONNECTED;
1695         sockb->state  = SS_CONNECTED;
1696         return 0;
1697 }
1698
1699 static void unix_sock_inherit_flags(const struct socket *old,
1700                                     struct socket *new)
1701 {
1702         if (test_bit(SOCK_PASSCRED, &old->flags))
1703                 set_bit(SOCK_PASSCRED, &new->flags);
1704         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1705                 set_bit(SOCK_PASSPIDFD, &new->flags);
1706         if (test_bit(SOCK_PASSSEC, &old->flags))
1707                 set_bit(SOCK_PASSSEC, &new->flags);
1708 }
1709
1710 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1711                        bool kern)
1712 {
1713         struct sock *sk = sock->sk;
1714         struct sock *tsk;
1715         struct sk_buff *skb;
1716         int err;
1717
1718         err = -EOPNOTSUPP;
1719         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1720                 goto out;
1721
1722         err = -EINVAL;
1723         if (sk->sk_state != TCP_LISTEN)
1724                 goto out;
1725
1726         /* If socket state is TCP_LISTEN it cannot change (for now...),
1727          * so that no locks are necessary.
1728          */
1729
1730         skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1731                                 &err);
1732         if (!skb) {
1733                 /* This means receive shutdown. */
1734                 if (err == 0)
1735                         err = -EINVAL;
1736                 goto out;
1737         }
1738
1739         tsk = skb->sk;
1740         skb_free_datagram(sk, skb);
1741         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1742
1743         /* attach accepted sock to socket */
1744         unix_state_lock(tsk);
1745         newsock->state = SS_CONNECTED;
1746         unix_sock_inherit_flags(sock, newsock);
1747         sock_graft(tsk, newsock);
1748         unix_state_unlock(tsk);
1749         return 0;
1750
1751 out:
1752         return err;
1753 }
1754
1755
1756 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1757 {
1758         struct sock *sk = sock->sk;
1759         struct unix_address *addr;
1760         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1761         int err = 0;
1762
1763         if (peer) {
1764                 sk = unix_peer_get(sk);
1765
1766                 err = -ENOTCONN;
1767                 if (!sk)
1768                         goto out;
1769                 err = 0;
1770         } else {
1771                 sock_hold(sk);
1772         }
1773
1774         addr = smp_load_acquire(&unix_sk(sk)->addr);
1775         if (!addr) {
1776                 sunaddr->sun_family = AF_UNIX;
1777                 sunaddr->sun_path[0] = 0;
1778                 err = offsetof(struct sockaddr_un, sun_path);
1779         } else {
1780                 err = addr->len;
1781                 memcpy(sunaddr, addr->name, addr->len);
1782
1783                 if (peer)
1784                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1785                                                CGROUP_UNIX_GETPEERNAME);
1786                 else
1787                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1788                                                CGROUP_UNIX_GETSOCKNAME);
1789         }
1790         sock_put(sk);
1791 out:
1792         return err;
1793 }
1794
1795 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1796 {
1797         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1798
1799         /*
1800          * Garbage collection of unix sockets starts by selecting a set of
1801          * candidate sockets which have reference only from being in flight
1802          * (total_refs == inflight_refs).  This condition is checked once during
1803          * the candidate collection phase, and candidates are marked as such, so
1804          * that non-candidates can later be ignored.  While inflight_refs is
1805          * protected by unix_gc_lock, total_refs (file count) is not, hence this
1806          * is an instantaneous decision.
1807          *
1808          * Once a candidate, however, the socket must not be reinstalled into a
1809          * file descriptor while the garbage collection is in progress.
1810          *
1811          * If the above conditions are met, then the directed graph of
1812          * candidates (*) does not change while unix_gc_lock is held.
1813          *
1814          * Any operations that changes the file count through file descriptors
1815          * (dup, close, sendmsg) does not change the graph since candidates are
1816          * not installed in fds.
1817          *
1818          * Dequeing a candidate via recvmsg would install it into an fd, but
1819          * that takes unix_gc_lock to decrement the inflight count, so it's
1820          * serialized with garbage collection.
1821          *
1822          * MSG_PEEK is special in that it does not change the inflight count,
1823          * yet does install the socket into an fd.  The following lock/unlock
1824          * pair is to ensure serialization with garbage collection.  It must be
1825          * done between incrementing the file count and installing the file into
1826          * an fd.
1827          *
1828          * If garbage collection starts after the barrier provided by the
1829          * lock/unlock, then it will see the elevated refcount and not mark this
1830          * as a candidate.  If a garbage collection is already in progress
1831          * before the file count was incremented, then the lock/unlock pair will
1832          * ensure that garbage collection is finished before progressing to
1833          * installing the fd.
1834          *
1835          * (*) A -> B where B is on the queue of A or B is on the queue of C
1836          * which is on the queue of listening socket A.
1837          */
1838         spin_lock(&unix_gc_lock);
1839         spin_unlock(&unix_gc_lock);
1840 }
1841
1842 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1843 {
1844         int err = 0;
1845
1846         UNIXCB(skb).pid  = get_pid(scm->pid);
1847         UNIXCB(skb).uid = scm->creds.uid;
1848         UNIXCB(skb).gid = scm->creds.gid;
1849         UNIXCB(skb).fp = NULL;
1850         unix_get_secdata(scm, skb);
1851         if (scm->fp && send_fds)
1852                 err = unix_attach_fds(scm, skb);
1853
1854         skb->destructor = unix_destruct_scm;
1855         return err;
1856 }
1857
1858 static bool unix_passcred_enabled(const struct socket *sock,
1859                                   const struct sock *other)
1860 {
1861         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1862                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1863                !other->sk_socket ||
1864                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1865                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1866 }
1867
1868 /*
1869  * Some apps rely on write() giving SCM_CREDENTIALS
1870  * We include credentials if source or destination socket
1871  * asserted SOCK_PASSCRED.
1872  */
1873 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1874                             const struct sock *other)
1875 {
1876         if (UNIXCB(skb).pid)
1877                 return;
1878         if (unix_passcred_enabled(sock, other)) {
1879                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1880                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1881         }
1882 }
1883
1884 static bool unix_skb_scm_eq(struct sk_buff *skb,
1885                             struct scm_cookie *scm)
1886 {
1887         return UNIXCB(skb).pid == scm->pid &&
1888                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1889                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1890                unix_secdata_eq(scm, skb);
1891 }
1892
1893 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1894 {
1895         struct scm_fp_list *fp = UNIXCB(skb).fp;
1896         struct unix_sock *u = unix_sk(sk);
1897
1898         if (unlikely(fp && fp->count))
1899                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1900 }
1901
1902 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1903 {
1904         struct scm_fp_list *fp = UNIXCB(skb).fp;
1905         struct unix_sock *u = unix_sk(sk);
1906
1907         if (unlikely(fp && fp->count))
1908                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1909 }
1910
1911 /*
1912  *      Send AF_UNIX data.
1913  */
1914
1915 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1916                               size_t len)
1917 {
1918         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1919         struct sock *sk = sock->sk, *other = NULL;
1920         struct unix_sock *u = unix_sk(sk);
1921         struct scm_cookie scm;
1922         struct sk_buff *skb;
1923         int data_len = 0;
1924         int sk_locked;
1925         long timeo;
1926         int err;
1927
1928         wait_for_unix_gc();
1929         err = scm_send(sock, msg, &scm, false);
1930         if (err < 0)
1931                 return err;
1932
1933         err = -EOPNOTSUPP;
1934         if (msg->msg_flags&MSG_OOB)
1935                 goto out;
1936
1937         if (msg->msg_namelen) {
1938                 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1939                 if (err)
1940                         goto out;
1941
1942                 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1943                                                             msg->msg_name,
1944                                                             &msg->msg_namelen,
1945                                                             NULL);
1946                 if (err)
1947                         goto out;
1948         } else {
1949                 sunaddr = NULL;
1950                 err = -ENOTCONN;
1951                 other = unix_peer_get(sk);
1952                 if (!other)
1953                         goto out;
1954         }
1955
1956         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1957              test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1958                 err = unix_autobind(sk);
1959                 if (err)
1960                         goto out;
1961         }
1962
1963         err = -EMSGSIZE;
1964         if (len > sk->sk_sndbuf - 32)
1965                 goto out;
1966
1967         if (len > SKB_MAX_ALLOC) {
1968                 data_len = min_t(size_t,
1969                                  len - SKB_MAX_ALLOC,
1970                                  MAX_SKB_FRAGS * PAGE_SIZE);
1971                 data_len = PAGE_ALIGN(data_len);
1972
1973                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1974         }
1975
1976         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1977                                    msg->msg_flags & MSG_DONTWAIT, &err,
1978                                    PAGE_ALLOC_COSTLY_ORDER);
1979         if (skb == NULL)
1980                 goto out;
1981
1982         err = unix_scm_to_skb(&scm, skb, true);
1983         if (err < 0)
1984                 goto out_free;
1985
1986         skb_put(skb, len - data_len);
1987         skb->data_len = data_len;
1988         skb->len = len;
1989         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1990         if (err)
1991                 goto out_free;
1992
1993         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1994
1995 restart:
1996         if (!other) {
1997                 err = -ECONNRESET;
1998                 if (sunaddr == NULL)
1999                         goto out_free;
2000
2001                 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2002                                         sk->sk_type);
2003                 if (IS_ERR(other)) {
2004                         err = PTR_ERR(other);
2005                         other = NULL;
2006                         goto out_free;
2007                 }
2008         }
2009
2010         if (sk_filter(other, skb) < 0) {
2011                 /* Toss the packet but do not return any error to the sender */
2012                 err = len;
2013                 goto out_free;
2014         }
2015
2016         sk_locked = 0;
2017         unix_state_lock(other);
2018 restart_locked:
2019         err = -EPERM;
2020         if (!unix_may_send(sk, other))
2021                 goto out_unlock;
2022
2023         if (unlikely(sock_flag(other, SOCK_DEAD))) {
2024                 /*
2025                  *      Check with 1003.1g - what should
2026                  *      datagram error
2027                  */
2028                 unix_state_unlock(other);
2029                 sock_put(other);
2030
2031                 if (!sk_locked)
2032                         unix_state_lock(sk);
2033
2034                 err = 0;
2035                 if (sk->sk_type == SOCK_SEQPACKET) {
2036                         /* We are here only when racing with unix_release_sock()
2037                          * is clearing @other. Never change state to TCP_CLOSE
2038                          * unlike SOCK_DGRAM wants.
2039                          */
2040                         unix_state_unlock(sk);
2041                         err = -EPIPE;
2042                 } else if (unix_peer(sk) == other) {
2043                         unix_peer(sk) = NULL;
2044                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2045
2046                         sk->sk_state = TCP_CLOSE;
2047                         unix_state_unlock(sk);
2048
2049                         unix_dgram_disconnected(sk, other);
2050                         sock_put(other);
2051                         err = -ECONNREFUSED;
2052                 } else {
2053                         unix_state_unlock(sk);
2054                 }
2055
2056                 other = NULL;
2057                 if (err)
2058                         goto out_free;
2059                 goto restart;
2060         }
2061
2062         err = -EPIPE;
2063         if (other->sk_shutdown & RCV_SHUTDOWN)
2064                 goto out_unlock;
2065
2066         if (sk->sk_type != SOCK_SEQPACKET) {
2067                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2068                 if (err)
2069                         goto out_unlock;
2070         }
2071
2072         /* other == sk && unix_peer(other) != sk if
2073          * - unix_peer(sk) == NULL, destination address bound to sk
2074          * - unix_peer(sk) == sk by time of get but disconnected before lock
2075          */
2076         if (other != sk &&
2077             unlikely(unix_peer(other) != sk &&
2078             unix_recvq_full_lockless(other))) {
2079                 if (timeo) {
2080                         timeo = unix_wait_for_peer(other, timeo);
2081
2082                         err = sock_intr_errno(timeo);
2083                         if (signal_pending(current))
2084                                 goto out_free;
2085
2086                         goto restart;
2087                 }
2088
2089                 if (!sk_locked) {
2090                         unix_state_unlock(other);
2091                         unix_state_double_lock(sk, other);
2092                 }
2093
2094                 if (unix_peer(sk) != other ||
2095                     unix_dgram_peer_wake_me(sk, other)) {
2096                         err = -EAGAIN;
2097                         sk_locked = 1;
2098                         goto out_unlock;
2099                 }
2100
2101                 if (!sk_locked) {
2102                         sk_locked = 1;
2103                         goto restart_locked;
2104                 }
2105         }
2106
2107         if (unlikely(sk_locked))
2108                 unix_state_unlock(sk);
2109
2110         if (sock_flag(other, SOCK_RCVTSTAMP))
2111                 __net_timestamp(skb);
2112         maybe_add_creds(skb, sock, other);
2113         scm_stat_add(other, skb);
2114         skb_queue_tail(&other->sk_receive_queue, skb);
2115         unix_state_unlock(other);
2116         other->sk_data_ready(other);
2117         sock_put(other);
2118         scm_destroy(&scm);
2119         return len;
2120
2121 out_unlock:
2122         if (sk_locked)
2123                 unix_state_unlock(sk);
2124         unix_state_unlock(other);
2125 out_free:
2126         kfree_skb(skb);
2127 out:
2128         if (other)
2129                 sock_put(other);
2130         scm_destroy(&scm);
2131         return err;
2132 }
2133
/* Stream sockets use paged skbs.  The paged part of one skb is capped at
 * 32768 bytes, but is never smaller than one full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32 * 1024))

2138
2139 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2140 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2141                      struct scm_cookie *scm, bool fds_sent)
2142 {
2143         struct unix_sock *ousk = unix_sk(other);
2144         struct sk_buff *skb;
2145         int err = 0;
2146
2147         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2148
2149         if (!skb)
2150                 return err;
2151
2152         err = unix_scm_to_skb(scm, skb, !fds_sent);
2153         if (err < 0) {
2154                 kfree_skb(skb);
2155                 return err;
2156         }
2157         skb_put(skb, 1);
2158         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2159
2160         if (err) {
2161                 kfree_skb(skb);
2162                 return err;
2163         }
2164
2165         unix_state_lock(other);
2166
2167         if (sock_flag(other, SOCK_DEAD) ||
2168             (other->sk_shutdown & RCV_SHUTDOWN)) {
2169                 unix_state_unlock(other);
2170                 kfree_skb(skb);
2171                 return -EPIPE;
2172         }
2173
2174         maybe_add_creds(skb, sock, other);
2175         skb_get(skb);
2176
2177         if (ousk->oob_skb)
2178                 consume_skb(ousk->oob_skb);
2179
2180         WRITE_ONCE(ousk->oob_skb, skb);
2181
2182         scm_stat_add(other, skb);
2183         skb_queue_tail(&other->sk_receive_queue, skb);
2184         sk_send_sigurg(other);
2185         unix_state_unlock(other);
2186         other->sk_data_ready(other);
2187
2188         return err;
2189 }
2190 #endif
2191
2192 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2193                                size_t len)
2194 {
2195         struct sock *sk = sock->sk;
2196         struct sock *other = NULL;
2197         int err, size;
2198         struct sk_buff *skb;
2199         int sent = 0;
2200         struct scm_cookie scm;
2201         bool fds_sent = false;
2202         int data_len;
2203
2204         wait_for_unix_gc();
2205         err = scm_send(sock, msg, &scm, false);
2206         if (err < 0)
2207                 return err;
2208
2209         err = -EOPNOTSUPP;
2210         if (msg->msg_flags & MSG_OOB) {
2211 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2212                 if (len)
2213                         len--;
2214                 else
2215 #endif
2216                         goto out_err;
2217         }
2218
2219         if (msg->msg_namelen) {
2220                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2221                 goto out_err;
2222         } else {
2223                 err = -ENOTCONN;
2224                 other = unix_peer(sk);
2225                 if (!other)
2226                         goto out_err;
2227         }
2228
2229         if (sk->sk_shutdown & SEND_SHUTDOWN)
2230                 goto pipe_err;
2231
2232         while (sent < len) {
2233                 size = len - sent;
2234
2235                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2236                         skb = sock_alloc_send_pskb(sk, 0, 0,
2237                                                    msg->msg_flags & MSG_DONTWAIT,
2238                                                    &err, 0);
2239                 } else {
2240                         /* Keep two messages in the pipe so it schedules better */
2241                         size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2242
2243                         /* allow fallback to order-0 allocations */
2244                         size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2245
2246                         data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2247
2248                         data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2249
2250                         skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2251                                                    msg->msg_flags & MSG_DONTWAIT, &err,
2252                                                    get_order(UNIX_SKB_FRAGS_SZ));
2253                 }
2254                 if (!skb)
2255                         goto out_err;
2256
2257                 /* Only send the fds in the first buffer */
2258                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2259                 if (err < 0) {
2260                         kfree_skb(skb);
2261                         goto out_err;
2262                 }
2263                 fds_sent = true;
2264
2265                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2266                         err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2267                                                    sk->sk_allocation);
2268                         if (err < 0) {
2269                                 kfree_skb(skb);
2270                                 goto out_err;
2271                         }
2272                         size = err;
2273                         refcount_add(size, &sk->sk_wmem_alloc);
2274                 } else {
2275                         skb_put(skb, size - data_len);
2276                         skb->data_len = data_len;
2277                         skb->len = size;
2278                         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2279                         if (err) {
2280                                 kfree_skb(skb);
2281                                 goto out_err;
2282                         }
2283                 }
2284
2285                 unix_state_lock(other);
2286
2287                 if (sock_flag(other, SOCK_DEAD) ||
2288                     (other->sk_shutdown & RCV_SHUTDOWN))
2289                         goto pipe_err_free;
2290
2291                 maybe_add_creds(skb, sock, other);
2292                 scm_stat_add(other, skb);
2293                 skb_queue_tail(&other->sk_receive_queue, skb);
2294                 unix_state_unlock(other);
2295                 other->sk_data_ready(other);
2296                 sent += size;
2297         }
2298
2299 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2300         if (msg->msg_flags & MSG_OOB) {
2301                 err = queue_oob(sock, msg, other, &scm, fds_sent);
2302                 if (err)
2303                         goto out_err;
2304                 sent++;
2305         }
2306 #endif
2307
2308         scm_destroy(&scm);
2309
2310         return sent;
2311
2312 pipe_err_free:
2313         unix_state_unlock(other);
2314         kfree_skb(skb);
2315 pipe_err:
2316         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2317                 send_sig(SIGPIPE, current, 0);
2318         err = -EPIPE;
2319 out_err:
2320         scm_destroy(&scm);
2321         return sent ? : err;
2322 }
2323
2324 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2325                                   size_t len)
2326 {
2327         int err;
2328         struct sock *sk = sock->sk;
2329
2330         err = sock_error(sk);
2331         if (err)
2332                 return err;
2333
2334         if (sk->sk_state != TCP_ESTABLISHED)
2335                 return -ENOTCONN;
2336
2337         if (msg->msg_namelen)
2338                 msg->msg_namelen = 0;
2339
2340         return unix_dgram_sendmsg(sock, msg, len);
2341 }
2342
2343 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2344                                   size_t size, int flags)
2345 {
2346         struct sock *sk = sock->sk;
2347
2348         if (sk->sk_state != TCP_ESTABLISHED)
2349                 return -ENOTCONN;
2350
2351         return unix_dgram_recvmsg(sock, msg, size, flags);
2352 }
2353
2354 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2355 {
2356         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2357
2358         if (addr) {
2359                 msg->msg_namelen = addr->len;
2360                 memcpy(msg->msg_name, addr->name, addr->len);
2361         }
2362 }
2363
2364 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2365                          int flags)
2366 {
2367         struct scm_cookie scm;
2368         struct socket *sock = sk->sk_socket;
2369         struct unix_sock *u = unix_sk(sk);
2370         struct sk_buff *skb, *last;
2371         long timeo;
2372         int skip;
2373         int err;
2374
2375         err = -EOPNOTSUPP;
2376         if (flags&MSG_OOB)
2377                 goto out;
2378
2379         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2380
2381         do {
2382                 mutex_lock(&u->iolock);
2383
2384                 skip = sk_peek_offset(sk, flags);
2385                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2386                                               &skip, &err, &last);
2387                 if (skb) {
2388                         if (!(flags & MSG_PEEK))
2389                                 scm_stat_del(sk, skb);
2390                         break;
2391                 }
2392
2393                 mutex_unlock(&u->iolock);
2394
2395                 if (err != -EAGAIN)
2396                         break;
2397         } while (timeo &&
2398                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2399                                               &err, &timeo, last));
2400
2401         if (!skb) { /* implies iolock unlocked */
2402                 unix_state_lock(sk);
2403                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2404                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2405                     (sk->sk_shutdown & RCV_SHUTDOWN))
2406                         err = 0;
2407                 unix_state_unlock(sk);
2408                 goto out;
2409         }
2410
2411         if (wq_has_sleeper(&u->peer_wait))
2412                 wake_up_interruptible_sync_poll(&u->peer_wait,
2413                                                 EPOLLOUT | EPOLLWRNORM |
2414                                                 EPOLLWRBAND);
2415
2416         if (msg->msg_name) {
2417                 unix_copy_addr(msg, skb->sk);
2418
2419                 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2420                                                       msg->msg_name,
2421                                                       &msg->msg_namelen);
2422         }
2423
2424         if (size > skb->len - skip)
2425                 size = skb->len - skip;
2426         else if (size < skb->len - skip)
2427                 msg->msg_flags |= MSG_TRUNC;
2428
2429         err = skb_copy_datagram_msg(skb, skip, msg, size);
2430         if (err)
2431                 goto out_free;
2432
2433         if (sock_flag(sk, SOCK_RCVTSTAMP))
2434                 __sock_recv_timestamp(msg, sk, skb);
2435
2436         memset(&scm, 0, sizeof(scm));
2437
2438         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2439         unix_set_secdata(&scm, skb);
2440
2441         if (!(flags & MSG_PEEK)) {
2442                 if (UNIXCB(skb).fp)
2443                         unix_detach_fds(&scm, skb);
2444
2445                 sk_peek_offset_bwd(sk, skb->len);
2446         } else {
2447                 /* It is questionable: on PEEK we could:
2448                    - do not return fds - good, but too simple 8)
2449                    - return fds, and do not return them on read (old strategy,
2450                      apparently wrong)
2451                    - clone fds (I chose it for now, it is the most universal
2452                      solution)
2453
2454                    POSIX 1003.1g does not actually define this clearly
2455                    at all. POSIX 1003.1g doesn't define a lot of things
2456                    clearly however!
2457
2458                 */
2459
2460                 sk_peek_offset_fwd(sk, size);
2461
2462                 if (UNIXCB(skb).fp)
2463                         unix_peek_fds(&scm, skb);
2464         }
2465         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2466
2467         scm_recv_unix(sock, msg, &scm, flags);
2468
2469 out_free:
2470         skb_free_datagram(sk, skb);
2471         mutex_unlock(&u->iolock);
2472 out:
2473         return err;
2474 }
2475
2476 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2477                               int flags)
2478 {
2479         struct sock *sk = sock->sk;
2480
2481 #ifdef CONFIG_BPF_SYSCALL
2482         const struct proto *prot = READ_ONCE(sk->sk_prot);
2483
2484         if (prot != &unix_dgram_proto)
2485                 return prot->recvmsg(sk, msg, size, flags, NULL);
2486 #endif
2487         return __unix_dgram_recvmsg(sk, msg, size, flags);
2488 }
2489
2490 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2491 {
2492         struct unix_sock *u = unix_sk(sk);
2493         struct sk_buff *skb;
2494         int err;
2495
2496         mutex_lock(&u->iolock);
2497         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2498         mutex_unlock(&u->iolock);
2499         if (!skb)
2500                 return err;
2501
2502         return recv_actor(sk, skb);
2503 }
2504
2505 /*
2506  *      Sleep until more data has arrived. But check for races..
2507  */
2508 static long unix_stream_data_wait(struct sock *sk, long timeo,
2509                                   struct sk_buff *last, unsigned int last_len,
2510                                   bool freezable)
2511 {
2512         unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2513         struct sk_buff *tail;
2514         DEFINE_WAIT(wait);
2515
2516         unix_state_lock(sk);
2517
2518         for (;;) {
2519                 prepare_to_wait(sk_sleep(sk), &wait, state);
2520
2521                 tail = skb_peek_tail(&sk->sk_receive_queue);
2522                 if (tail != last ||
2523                     (tail && tail->len != last_len) ||
2524                     sk->sk_err ||
2525                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2526                     signal_pending(current) ||
2527                     !timeo)
2528                         break;
2529
2530                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2531                 unix_state_unlock(sk);
2532                 timeo = schedule_timeout(timeo);
2533                 unix_state_lock(sk);
2534
2535                 if (sock_flag(sk, SOCK_DEAD))
2536                         break;
2537
2538                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2539         }
2540
2541         finish_wait(sk_sleep(sk), &wait);
2542         unix_state_unlock(sk);
2543         return timeo;
2544 }
2545
2546 static unsigned int unix_skb_len(const struct sk_buff *skb)
2547 {
2548         return skb->len - UNIXCB(skb).consumed;
2549 }
2550
2551 struct unix_stream_read_state {
2552         int (*recv_actor)(struct sk_buff *, int, int,
2553                           struct unix_stream_read_state *);
2554         struct socket *socket;
2555         struct msghdr *msg;
2556         struct pipe_inode_info *pipe;
2557         size_t size;
2558         int flags;
2559         unsigned int splice_flags;
2560 };
2561
2562 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2563 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2564 {
2565         struct socket *sock = state->socket;
2566         struct sock *sk = sock->sk;
2567         struct unix_sock *u = unix_sk(sk);
2568         int chunk = 1;
2569         struct sk_buff *oob_skb;
2570
2571         mutex_lock(&u->iolock);
2572         unix_state_lock(sk);
2573
2574         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2575                 unix_state_unlock(sk);
2576                 mutex_unlock(&u->iolock);
2577                 return -EINVAL;
2578         }
2579
2580         oob_skb = u->oob_skb;
2581
2582         if (!(state->flags & MSG_PEEK))
2583                 WRITE_ONCE(u->oob_skb, NULL);
2584
2585         unix_state_unlock(sk);
2586
2587         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2588
2589         if (!(state->flags & MSG_PEEK)) {
2590                 UNIXCB(oob_skb).consumed += 1;
2591                 kfree_skb(oob_skb);
2592         }
2593
2594         mutex_unlock(&u->iolock);
2595
2596         if (chunk < 0)
2597                 return -EFAULT;
2598
2599         state->msg->msg_flags |= MSG_OOB;
2600         return 1;
2601 }
2602
2603 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2604                                   int flags, int copied)
2605 {
2606         struct unix_sock *u = unix_sk(sk);
2607
2608         if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2609                 skb_unlink(skb, &sk->sk_receive_queue);
2610                 consume_skb(skb);
2611                 skb = NULL;
2612         } else {
2613                 if (skb == u->oob_skb) {
2614                         if (copied) {
2615                                 skb = NULL;
2616                         } else if (sock_flag(sk, SOCK_URGINLINE)) {
2617                                 if (!(flags & MSG_PEEK)) {
2618                                         WRITE_ONCE(u->oob_skb, NULL);
2619                                         consume_skb(skb);
2620                                 }
2621                         } else if (!(flags & MSG_PEEK)) {
2622                                 skb_unlink(skb, &sk->sk_receive_queue);
2623                                 consume_skb(skb);
2624                                 skb = skb_peek(&sk->sk_receive_queue);
2625                         }
2626                 }
2627         }
2628         return skb;
2629 }
2630 #endif
2631
2632 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2633 {
2634         if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2635                 return -ENOTCONN;
2636
2637         return unix_read_skb(sk, recv_actor);
2638 }
2639
2640 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2641                                     bool freezable)
2642 {
2643         struct scm_cookie scm;
2644         struct socket *sock = state->socket;
2645         struct sock *sk = sock->sk;
2646         struct unix_sock *u = unix_sk(sk);
2647         int copied = 0;
2648         int flags = state->flags;
2649         int noblock = flags & MSG_DONTWAIT;
2650         bool check_creds = false;
2651         int target;
2652         int err = 0;
2653         long timeo;
2654         int skip;
2655         size_t size = state->size;
2656         unsigned int last_len;
2657
2658         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2659                 err = -EINVAL;
2660                 goto out;
2661         }
2662
2663         if (unlikely(flags & MSG_OOB)) {
2664                 err = -EOPNOTSUPP;
2665 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2666                 err = unix_stream_recv_urg(state);
2667 #endif
2668                 goto out;
2669         }
2670
2671         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2672         timeo = sock_rcvtimeo(sk, noblock);
2673
2674         memset(&scm, 0, sizeof(scm));
2675
2676         /* Lock the socket to prevent queue disordering
2677          * while sleeps in memcpy_tomsg
2678          */
2679         mutex_lock(&u->iolock);
2680
2681         skip = max(sk_peek_offset(sk, flags), 0);
2682
2683         do {
2684                 int chunk;
2685                 bool drop_skb;
2686                 struct sk_buff *skb, *last;
2687
2688 redo:
2689                 unix_state_lock(sk);
2690                 if (sock_flag(sk, SOCK_DEAD)) {
2691                         err = -ECONNRESET;
2692                         goto unlock;
2693                 }
2694                 last = skb = skb_peek(&sk->sk_receive_queue);
2695                 last_len = last ? last->len : 0;
2696
2697 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2698                 if (skb) {
2699                         skb = manage_oob(skb, sk, flags, copied);
2700                         if (!skb) {
2701                                 unix_state_unlock(sk);
2702                                 if (copied)
2703                                         break;
2704                                 goto redo;
2705                         }
2706                 }
2707 #endif
2708 again:
2709                 if (skb == NULL) {
2710                         if (copied >= target)
2711                                 goto unlock;
2712
2713                         /*
2714                          *      POSIX 1003.1g mandates this order.
2715                          */
2716
2717                         err = sock_error(sk);
2718                         if (err)
2719                                 goto unlock;
2720                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2721                                 goto unlock;
2722
2723                         unix_state_unlock(sk);
2724                         if (!timeo) {
2725                                 err = -EAGAIN;
2726                                 break;
2727                         }
2728
2729                         mutex_unlock(&u->iolock);
2730
2731                         timeo = unix_stream_data_wait(sk, timeo, last,
2732                                                       last_len, freezable);
2733
2734                         if (signal_pending(current)) {
2735                                 err = sock_intr_errno(timeo);
2736                                 scm_destroy(&scm);
2737                                 goto out;
2738                         }
2739
2740                         mutex_lock(&u->iolock);
2741                         goto redo;
2742 unlock:
2743                         unix_state_unlock(sk);
2744                         break;
2745                 }
2746
2747                 while (skip >= unix_skb_len(skb)) {
2748                         skip -= unix_skb_len(skb);
2749                         last = skb;
2750                         last_len = skb->len;
2751                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2752                         if (!skb)
2753                                 goto again;
2754                 }
2755
2756                 unix_state_unlock(sk);
2757
2758                 if (check_creds) {
2759                         /* Never glue messages from different writers */
2760                         if (!unix_skb_scm_eq(skb, &scm))
2761                                 break;
2762                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2763                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2764                         /* Copy credentials */
2765                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2766                         unix_set_secdata(&scm, skb);
2767                         check_creds = true;
2768                 }
2769
2770                 /* Copy address just once */
2771                 if (state->msg && state->msg->msg_name) {
2772                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2773                                          state->msg->msg_name);
2774                         unix_copy_addr(state->msg, skb->sk);
2775
2776                         BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2777                                                               state->msg->msg_name,
2778                                                               &state->msg->msg_namelen);
2779
2780                         sunaddr = NULL;
2781                 }
2782
2783                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2784                 skb_get(skb);
2785                 chunk = state->recv_actor(skb, skip, chunk, state);
2786                 drop_skb = !unix_skb_len(skb);
2787                 /* skb is only safe to use if !drop_skb */
2788                 consume_skb(skb);
2789                 if (chunk < 0) {
2790                         if (copied == 0)
2791                                 copied = -EFAULT;
2792                         break;
2793                 }
2794                 copied += chunk;
2795                 size -= chunk;
2796
2797                 if (drop_skb) {
2798                         /* the skb was touched by a concurrent reader;
2799                          * we should not expect anything from this skb
2800                          * anymore and assume it invalid - we can be
2801                          * sure it was dropped from the socket queue
2802                          *
2803                          * let's report a short read
2804                          */
2805                         err = 0;
2806                         break;
2807                 }
2808
2809                 /* Mark read part of skb as used */
2810                 if (!(flags & MSG_PEEK)) {
2811                         UNIXCB(skb).consumed += chunk;
2812
2813                         sk_peek_offset_bwd(sk, chunk);
2814
2815                         if (UNIXCB(skb).fp) {
2816                                 scm_stat_del(sk, skb);
2817                                 unix_detach_fds(&scm, skb);
2818                         }
2819
2820                         if (unix_skb_len(skb))
2821                                 break;
2822
2823                         skb_unlink(skb, &sk->sk_receive_queue);
2824                         consume_skb(skb);
2825
2826                         if (scm.fp)
2827                                 break;
2828                 } else {
2829                         /* It is questionable, see note in unix_dgram_recvmsg.
2830                          */
2831                         if (UNIXCB(skb).fp)
2832                                 unix_peek_fds(&scm, skb);
2833
2834                         sk_peek_offset_fwd(sk, chunk);
2835
2836                         if (UNIXCB(skb).fp)
2837                                 break;
2838
2839                         skip = 0;
2840                         last = skb;
2841                         last_len = skb->len;
2842                         unix_state_lock(sk);
2843                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2844                         if (skb)
2845                                 goto again;
2846                         unix_state_unlock(sk);
2847                         break;
2848                 }
2849         } while (size);
2850
2851         mutex_unlock(&u->iolock);
2852         if (state->msg)
2853                 scm_recv_unix(sock, state->msg, &scm, flags);
2854         else
2855                 scm_destroy(&scm);
2856 out:
2857         return copied ? : err;
2858 }
2859
2860 static int unix_stream_read_actor(struct sk_buff *skb,
2861                                   int skip, int chunk,
2862                                   struct unix_stream_read_state *state)
2863 {
2864         int ret;
2865
2866         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2867                                     state->msg, chunk);
2868         return ret ?: chunk;
2869 }
2870
2871 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2872                           size_t size, int flags)
2873 {
2874         struct unix_stream_read_state state = {
2875                 .recv_actor = unix_stream_read_actor,
2876                 .socket = sk->sk_socket,
2877                 .msg = msg,
2878                 .size = size,
2879                 .flags = flags
2880         };
2881
2882         return unix_stream_read_generic(&state, true);
2883 }
2884
2885 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2886                                size_t size, int flags)
2887 {
2888         struct unix_stream_read_state state = {
2889                 .recv_actor = unix_stream_read_actor,
2890                 .socket = sock,
2891                 .msg = msg,
2892                 .size = size,
2893                 .flags = flags
2894         };
2895
2896 #ifdef CONFIG_BPF_SYSCALL
2897         struct sock *sk = sock->sk;
2898         const struct proto *prot = READ_ONCE(sk->sk_prot);
2899
2900         if (prot != &unix_stream_proto)
2901                 return prot->recvmsg(sk, msg, size, flags, NULL);
2902 #endif
2903         return unix_stream_read_generic(&state, true);
2904 }
2905
2906 static int unix_stream_splice_actor(struct sk_buff *skb,
2907                                     int skip, int chunk,
2908                                     struct unix_stream_read_state *state)
2909 {
2910         return skb_splice_bits(skb, state->socket->sk,
2911                                UNIXCB(skb).consumed + skip,
2912                                state->pipe, chunk, state->splice_flags);
2913 }
2914
2915 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2916                                        struct pipe_inode_info *pipe,
2917                                        size_t size, unsigned int flags)
2918 {
2919         struct unix_stream_read_state state = {
2920                 .recv_actor = unix_stream_splice_actor,
2921                 .socket = sock,
2922                 .pipe = pipe,
2923                 .size = size,
2924                 .splice_flags = flags,
2925         };
2926
2927         if (unlikely(*ppos))
2928                 return -ESPIPE;
2929
2930         if (sock->file->f_flags & O_NONBLOCK ||
2931             flags & SPLICE_F_NONBLOCK)
2932                 state.flags = MSG_DONTWAIT;
2933
2934         return unix_stream_read_generic(&state, false);
2935 }
2936
2937 static int unix_shutdown(struct socket *sock, int mode)
2938 {
2939         struct sock *sk = sock->sk;
2940         struct sock *other;
2941
2942         if (mode < SHUT_RD || mode > SHUT_RDWR)
2943                 return -EINVAL;
2944         /* This maps:
2945          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2946          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2947          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2948          */
2949         ++mode;
2950
2951         unix_state_lock(sk);
2952         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2953         other = unix_peer(sk);
2954         if (other)
2955                 sock_hold(other);
2956         unix_state_unlock(sk);
2957         sk->sk_state_change(sk);
2958
2959         if (other &&
2960                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2961
2962                 int peer_mode = 0;
2963                 const struct proto *prot = READ_ONCE(other->sk_prot);
2964
2965                 if (prot->unhash)
2966                         prot->unhash(other);
2967                 if (mode&RCV_SHUTDOWN)
2968                         peer_mode |= SEND_SHUTDOWN;
2969                 if (mode&SEND_SHUTDOWN)
2970                         peer_mode |= RCV_SHUTDOWN;
2971                 unix_state_lock(other);
2972                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2973                 unix_state_unlock(other);
2974                 other->sk_state_change(other);
2975                 if (peer_mode == SHUTDOWN_MASK)
2976                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2977                 else if (peer_mode & RCV_SHUTDOWN)
2978                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2979         }
2980         if (other)
2981                 sock_put(other);
2982
2983         return 0;
2984 }
2985
2986 long unix_inq_len(struct sock *sk)
2987 {
2988         struct sk_buff *skb;
2989         long amount = 0;
2990
2991         if (sk->sk_state == TCP_LISTEN)
2992                 return -EINVAL;
2993
2994         spin_lock(&sk->sk_receive_queue.lock);
2995         if (sk->sk_type == SOCK_STREAM ||
2996             sk->sk_type == SOCK_SEQPACKET) {
2997                 skb_queue_walk(&sk->sk_receive_queue, skb)
2998                         amount += unix_skb_len(skb);
2999         } else {
3000                 skb = skb_peek(&sk->sk_receive_queue);
3001                 if (skb)
3002                         amount = skb->len;
3003         }
3004         spin_unlock(&sk->sk_receive_queue.lock);
3005
3006         return amount;
3007 }
3008 EXPORT_SYMBOL_GPL(unix_inq_len);
3009
/* Outgoing queue length of @sk, as reported by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
        long queued = sk_wmem_alloc_get(sk);

        return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3015
3016 static int unix_open_file(struct sock *sk)
3017 {
3018         struct path path;
3019         struct file *f;
3020         int fd;
3021
3022         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3023                 return -EPERM;
3024
3025         if (!smp_load_acquire(&unix_sk(sk)->addr))
3026                 return -ENOENT;
3027
3028         path = unix_sk(sk)->path;
3029         if (!path.dentry)
3030                 return -ENOENT;
3031
3032         path_get(&path);
3033
3034         fd = get_unused_fd_flags(O_CLOEXEC);
3035         if (fd < 0)
3036                 goto out;
3037
3038         f = dentry_open(&path, O_PATH, current_cred());
3039         if (IS_ERR(f)) {
3040                 put_unused_fd(fd);
3041                 fd = PTR_ERR(f);
3042                 goto out;
3043         }
3044
3045         fd_install(fd, f);
3046 out:
3047         path_put(&path);
3048
3049         return fd;
3050 }
3051
3052 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3053 {
3054         struct sock *sk = sock->sk;
3055         long amount = 0;
3056         int err;
3057
3058         switch (cmd) {
3059         case SIOCOUTQ:
3060                 amount = unix_outq_len(sk);
3061                 err = put_user(amount, (int __user *)arg);
3062                 break;
3063         case SIOCINQ:
3064                 amount = unix_inq_len(sk);
3065                 if (amount < 0)
3066                         err = amount;
3067                 else
3068                         err = put_user(amount, (int __user *)arg);
3069                 break;
3070         case SIOCUNIXFILE:
3071                 err = unix_open_file(sk);
3072                 break;
3073 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3074         case SIOCATMARK:
3075                 {
3076                         struct sk_buff *skb;
3077                         int answ = 0;
3078
3079                         skb = skb_peek(&sk->sk_receive_queue);
3080                         if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3081                                 answ = 1;
3082                         err = put_user(answ, (int __user *)arg);
3083                 }
3084                 break;
3085 #endif
3086         default:
3087                 err = -ENOIOCTLCMD;
3088                 break;
3089         }
3090         return err;
3091 }
3092
3093 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert @arg with compat_ptr() and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        unsigned long native_arg = (unsigned long)compat_ptr(arg);

        return unix_ioctl(sock, cmd, native_arg);
}
3098 #endif
3099
3100 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3101 {
3102         struct sock *sk = sock->sk;
3103         __poll_t mask;
3104         u8 shutdown;
3105
3106         sock_poll_wait(file, sock, wait);
3107         mask = 0;
3108         shutdown = READ_ONCE(sk->sk_shutdown);
3109
3110         /* exceptional events? */
3111         if (READ_ONCE(sk->sk_err))
3112                 mask |= EPOLLERR;
3113         if (shutdown == SHUTDOWN_MASK)
3114                 mask |= EPOLLHUP;
3115         if (shutdown & RCV_SHUTDOWN)
3116                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3117
3118         /* readable? */
3119         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3120                 mask |= EPOLLIN | EPOLLRDNORM;
3121         if (sk_is_readable(sk))
3122                 mask |= EPOLLIN | EPOLLRDNORM;
3123 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3124         if (READ_ONCE(unix_sk(sk)->oob_skb))
3125                 mask |= EPOLLPRI;
3126 #endif
3127
3128         /* Connection-based need to check for termination and startup */
3129         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3130             sk->sk_state == TCP_CLOSE)
3131                 mask |= EPOLLHUP;
3132
3133         /*
3134          * we set writable also when the other side has shut down the
3135          * connection. This prevents stuck sockets.
3136          */
3137         if (unix_writable(sk))
3138                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3139
3140         return mask;
3141 }
3142
3143 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3144                                     poll_table *wait)
3145 {
3146         struct sock *sk = sock->sk, *other;
3147         unsigned int writable;
3148         __poll_t mask;
3149         u8 shutdown;
3150
3151         sock_poll_wait(file, sock, wait);
3152         mask = 0;
3153         shutdown = READ_ONCE(sk->sk_shutdown);
3154
3155         /* exceptional events? */
3156         if (READ_ONCE(sk->sk_err) ||
3157             !skb_queue_empty_lockless(&sk->sk_error_queue))
3158                 mask |= EPOLLERR |
3159                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3160
3161         if (shutdown & RCV_SHUTDOWN)
3162                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3163         if (shutdown == SHUTDOWN_MASK)
3164                 mask |= EPOLLHUP;
3165
3166         /* readable? */
3167         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3168                 mask |= EPOLLIN | EPOLLRDNORM;
3169         if (sk_is_readable(sk))
3170                 mask |= EPOLLIN | EPOLLRDNORM;
3171
3172         /* Connection-based need to check for termination and startup */
3173         if (sk->sk_type == SOCK_SEQPACKET) {
3174                 if (sk->sk_state == TCP_CLOSE)
3175                         mask |= EPOLLHUP;
3176                 /* connection hasn't started yet? */
3177                 if (sk->sk_state == TCP_SYN_SENT)
3178                         return mask;
3179         }
3180
3181         /* No write status requested, avoid expensive OUT tests. */
3182         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3183                 return mask;
3184
3185         writable = unix_writable(sk);
3186         if (writable) {
3187                 unix_state_lock(sk);
3188
3189                 other = unix_peer(sk);
3190                 if (other && unix_peer(other) != sk &&
3191                     unix_recvq_full_lockless(other) &&
3192                     unix_dgram_peer_wake_me(sk, other))
3193                         writable = 0;
3194
3195                 unix_state_unlock(sk);
3196         }
3197
3198         if (writable)
3199                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3200         else
3201                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3202
3203         return mask;
3204 }
3205
3206 #ifdef CONFIG_PROC_FS
3207
/*
 * A seq_file position packs two values into one unsigned long: the hash
 * bucket index in the bits above BUCKET_SPACE and a 1-based offset within
 * that bucket in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Bucket index encoded in position @x. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Offset within the bucket encoded in position @x. */
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
/* Build a position from bucket @b and offset @o. */
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
3213
3214 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3215 {
3216         unsigned long offset = get_offset(*pos);
3217         unsigned long bucket = get_bucket(*pos);
3218         unsigned long count = 0;
3219         struct sock *sk;
3220
3221         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3222              sk; sk = sk_next(sk)) {
3223                 if (++count == offset)
3224                         break;
3225         }
3226
3227         return sk;
3228 }
3229
3230 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3231 {
3232         unsigned long bucket = get_bucket(*pos);
3233         struct net *net = seq_file_net(seq);
3234         struct sock *sk;
3235
3236         while (bucket < UNIX_HASH_SIZE) {
3237                 spin_lock(&net->unx.table.locks[bucket]);
3238
3239                 sk = unix_from_bucket(seq, pos);
3240                 if (sk)
3241                         return sk;
3242
3243                 spin_unlock(&net->unx.table.locks[bucket]);
3244
3245                 *pos = set_bucket_offset(++bucket, 1);
3246         }
3247
3248         return NULL;
3249 }
3250
3251 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3252                                   loff_t *pos)
3253 {
3254         unsigned long bucket = get_bucket(*pos);
3255
3256         sk = sk_next(sk);
3257         if (sk)
3258                 return sk;
3259
3260
3261         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3262
3263         *pos = set_bucket_offset(++bucket, 1);
3264
3265         return unix_get_first(seq, pos);
3266 }
3267
3268 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3269 {
3270         if (!*pos)
3271                 return SEQ_START_TOKEN;
3272
3273         return unix_get_first(seq, pos);
3274 }
3275
3276 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3277 {
3278         ++*pos;
3279
3280         if (v == SEQ_START_TOKEN)
3281                 return unix_get_first(seq, pos);
3282
3283         return unix_get_next(seq, v, pos);
3284 }
3285
3286 static void unix_seq_stop(struct seq_file *seq, void *v)
3287 {
3288         struct sock *sk = v;
3289
3290         if (sk)
3291                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3292 }
3293
3294 static int unix_seq_show(struct seq_file *seq, void *v)
3295 {
3296
3297         if (v == SEQ_START_TOKEN)
3298                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3299                          "Inode Path\n");
3300         else {
3301                 struct sock *s = v;
3302                 struct unix_sock *u = unix_sk(s);
3303                 unix_state_lock(s);
3304
3305                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3306                         s,
3307                         refcount_read(&s->sk_refcnt),
3308                         0,
3309                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3310                         s->sk_type,
3311                         s->sk_socket ?
3312                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3313                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3314                         sock_i_ino(s));
3315
3316                 if (u->addr) {  // under a hash table lock here
3317                         int i, len;
3318                         seq_putc(seq, ' ');
3319
3320                         i = 0;
3321                         len = u->addr->len -
3322                                 offsetof(struct sockaddr_un, sun_path);
3323                         if (u->addr->name->sun_path[0]) {
3324                                 len--;
3325                         } else {
3326                                 seq_putc(seq, '@');
3327                                 i++;
3328                         }
3329                         for ( ; i < len; i++)
3330                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3331                                          '@');
3332                 }
3333                 unix_state_unlock(s);
3334                 seq_putc(seq, '\n');
3335         }
3336
3337         return 0;
3338 }
3339
3340 static const struct seq_operations unix_seq_ops = {
3341         .start  = unix_seq_start,
3342         .next   = unix_seq_next,
3343         .stop   = unix_seq_stop,
3344         .show   = unix_seq_show,
3345 };
3346
3347 #ifdef CONFIG_BPF_SYSCALL
3348 struct bpf_unix_iter_state {
3349         struct seq_net_private p;
3350         unsigned int cur_sk;
3351         unsigned int end_sk;
3352         unsigned int max_sk;
3353         struct sock **batch;
3354         bool st_bucket_done;
3355 };
3356
3357 struct bpf_iter__unix {
3358         __bpf_md_ptr(struct bpf_iter_meta *, meta);
3359         __bpf_md_ptr(struct unix_sock *, unix_sk);
3360         uid_t uid __aligned(8);
3361 };
3362
3363 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3364                               struct unix_sock *unix_sk, uid_t uid)
3365 {
3366         struct bpf_iter__unix ctx;
3367
3368         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3369         ctx.meta = meta;
3370         ctx.unix_sk = unix_sk;
3371         ctx.uid = uid;
3372         return bpf_iter_run_prog(prog, &ctx);
3373 }
3374
3375 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3376
3377 {
3378         struct bpf_unix_iter_state *iter = seq->private;
3379         unsigned int expected = 1;
3380         struct sock *sk;
3381
3382         sock_hold(start_sk);
3383         iter->batch[iter->end_sk++] = start_sk;
3384
3385         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3386                 if (iter->end_sk < iter->max_sk) {
3387                         sock_hold(sk);
3388                         iter->batch[iter->end_sk++] = sk;
3389                 }
3390
3391                 expected++;
3392         }
3393
3394         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3395
3396         return expected;
3397 }
3398
3399 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3400 {
3401         while (iter->cur_sk < iter->end_sk)
3402                 sock_put(iter->batch[iter->cur_sk++]);
3403 }
3404
3405 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3406                                        unsigned int new_batch_sz)
3407 {
3408         struct sock **new_batch;
3409
3410         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3411                              GFP_USER | __GFP_NOWARN);
3412         if (!new_batch)
3413                 return -ENOMEM;
3414
3415         bpf_iter_unix_put_batch(iter);
3416         kvfree(iter->batch);
3417         iter->batch = new_batch;
3418         iter->max_sk = new_batch_sz;
3419
3420         return 0;
3421 }
3422
3423 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3424                                         loff_t *pos)
3425 {
3426         struct bpf_unix_iter_state *iter = seq->private;
3427         unsigned int expected;
3428         bool resized = false;
3429         struct sock *sk;
3430
3431         if (iter->st_bucket_done)
3432                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3433
3434 again:
3435         /* Get a new batch */
3436         iter->cur_sk = 0;
3437         iter->end_sk = 0;
3438
3439         sk = unix_get_first(seq, pos);
3440         if (!sk)
3441                 return NULL; /* Done */
3442
3443         expected = bpf_iter_unix_hold_batch(seq, sk);
3444
3445         if (iter->end_sk == expected) {
3446                 iter->st_bucket_done = true;
3447                 return sk;
3448         }
3449
3450         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3451                 resized = true;
3452                 goto again;
3453         }
3454
3455         return sk;
3456 }
3457
3458 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3459 {
3460         if (!*pos)
3461                 return SEQ_START_TOKEN;
3462
3463         /* bpf iter does not support lseek, so it always
3464          * continue from where it was stop()-ped.
3465          */
3466         return bpf_iter_unix_batch(seq, pos);
3467 }
3468
3469 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3470 {
3471         struct bpf_unix_iter_state *iter = seq->private;
3472         struct sock *sk;
3473
3474         /* Whenever seq_next() is called, the iter->cur_sk is
3475          * done with seq_show(), so advance to the next sk in
3476          * the batch.
3477          */
3478         if (iter->cur_sk < iter->end_sk)
3479                 sock_put(iter->batch[iter->cur_sk++]);
3480
3481         ++*pos;
3482
3483         if (iter->cur_sk < iter->end_sk)
3484                 sk = iter->batch[iter->cur_sk];
3485         else
3486                 sk = bpf_iter_unix_batch(seq, pos);
3487
3488         return sk;
3489 }
3490
3491 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3492 {
3493         struct bpf_iter_meta meta;
3494         struct bpf_prog *prog;
3495         struct sock *sk = v;
3496         uid_t uid;
3497         bool slow;
3498         int ret;
3499
3500         if (v == SEQ_START_TOKEN)
3501                 return 0;
3502
3503         slow = lock_sock_fast(sk);
3504
3505         if (unlikely(sk_unhashed(sk))) {
3506                 ret = SEQ_SKIP;
3507                 goto unlock;
3508         }
3509
3510         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3511         meta.seq = seq;
3512         prog = bpf_iter_get_info(&meta, false);
3513         ret = unix_prog_seq_show(prog, &meta, v, uid);
3514 unlock:
3515         unlock_sock_fast(sk, slow);
3516         return ret;
3517 }
3518
3519 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3520 {
3521         struct bpf_unix_iter_state *iter = seq->private;
3522         struct bpf_iter_meta meta;
3523         struct bpf_prog *prog;
3524
3525         if (!v) {
3526                 meta.seq = seq;
3527                 prog = bpf_iter_get_info(&meta, true);
3528                 if (prog)
3529                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3530         }
3531
3532         if (iter->cur_sk < iter->end_sk)
3533                 bpf_iter_unix_put_batch(iter);
3534 }
3535
3536 static const struct seq_operations bpf_iter_unix_seq_ops = {
3537         .start  = bpf_iter_unix_seq_start,
3538         .next   = bpf_iter_unix_seq_next,
3539         .stop   = bpf_iter_unix_seq_stop,
3540         .show   = bpf_iter_unix_seq_show,
3541 };
3542 #endif
3543 #endif
3544
3545 static const struct net_proto_family unix_family_ops = {
3546         .family = PF_UNIX,
3547         .create = unix_create,
3548         .owner  = THIS_MODULE,
3549 };
3550
3551
3552 static int __net_init unix_net_init(struct net *net)
3553 {
3554         int i;
3555
3556         net->unx.sysctl_max_dgram_qlen = 10;
3557         if (unix_sysctl_register(net))
3558                 goto out;
3559
3560 #ifdef CONFIG_PROC_FS
3561         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3562                              sizeof(struct seq_net_private)))
3563                 goto err_sysctl;
3564 #endif
3565
3566         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3567                                               sizeof(spinlock_t), GFP_KERNEL);
3568         if (!net->unx.table.locks)
3569                 goto err_proc;
3570
3571         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3572                                                 sizeof(struct hlist_head),
3573                                                 GFP_KERNEL);
3574         if (!net->unx.table.buckets)
3575                 goto free_locks;
3576
3577         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3578                 spin_lock_init(&net->unx.table.locks[i]);
3579                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3580         }
3581
3582         return 0;
3583
3584 free_locks:
3585         kvfree(net->unx.table.locks);
3586 err_proc:
3587 #ifdef CONFIG_PROC_FS
3588         remove_proc_entry("unix", net->proc_net);
3589 err_sysctl:
3590 #endif
3591         unix_sysctl_unregister(net);
3592 out:
3593         return -ENOMEM;
3594 }
3595
3596 static void __net_exit unix_net_exit(struct net *net)
3597 {
3598         kvfree(net->unx.table.buckets);
3599         kvfree(net->unx.table.locks);
3600         unix_sysctl_unregister(net);
3601         remove_proc_entry("unix", net->proc_net);
3602 }
3603
3604 static struct pernet_operations unix_net_ops = {
3605         .init = unix_net_init,
3606         .exit = unix_net_exit,
3607 };
3608
3609 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3610 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3611                      struct unix_sock *unix_sk, uid_t uid)
3612
3613 #define INIT_BATCH_SZ 16
3614
3615 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3616 {
3617         struct bpf_unix_iter_state *iter = priv_data;
3618         int err;
3619
3620         err = bpf_iter_init_seq_net(priv_data, aux);
3621         if (err)
3622                 return err;
3623
3624         err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3625         if (err) {
3626                 bpf_iter_fini_seq_net(priv_data);
3627                 return err;
3628         }
3629
3630         return 0;
3631 }
3632
3633 static void bpf_iter_fini_unix(void *priv_data)
3634 {
3635         struct bpf_unix_iter_state *iter = priv_data;
3636
3637         bpf_iter_fini_seq_net(priv_data);
3638         kvfree(iter->batch);
3639 }
3640
3641 static const struct bpf_iter_seq_info unix_seq_info = {
3642         .seq_ops                = &bpf_iter_unix_seq_ops,
3643         .init_seq_private       = bpf_iter_init_unix,
3644         .fini_seq_private       = bpf_iter_fini_unix,
3645         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3646 };
3647
3648 static const struct bpf_func_proto *
3649 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3650                              const struct bpf_prog *prog)
3651 {
3652         switch (func_id) {
3653         case BPF_FUNC_setsockopt:
3654                 return &bpf_sk_setsockopt_proto;
3655         case BPF_FUNC_getsockopt:
3656                 return &bpf_sk_getsockopt_proto;
3657         default:
3658                 return NULL;
3659         }
3660 }
3661
3662 static struct bpf_iter_reg unix_reg_info = {
3663         .target                 = "unix",
3664         .ctx_arg_info_size      = 1,
3665         .ctx_arg_info           = {
3666                 { offsetof(struct bpf_iter__unix, unix_sk),
3667                   PTR_TO_BTF_ID_OR_NULL },
3668         },
3669         .get_func_proto         = bpf_iter_unix_get_func_proto,
3670         .seq_info               = &unix_seq_info,
3671 };
3672
3673 static void __init bpf_iter_register(void)
3674 {
3675         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3676         if (bpf_iter_reg_target(&unix_reg_info))
3677                 pr_warn("Warning: could not register bpf iterator unix\n");
3678 }
3679 #endif
3680
3681 static int __init af_unix_init(void)
3682 {
3683         int i, rc = -1;
3684
3685         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3686
3687         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3688                 spin_lock_init(&bsd_socket_locks[i]);
3689                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3690         }
3691
3692         rc = proto_register(&unix_dgram_proto, 1);
3693         if (rc != 0) {
3694                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3695                 goto out;
3696         }
3697
3698         rc = proto_register(&unix_stream_proto, 1);
3699         if (rc != 0) {
3700                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3701                 proto_unregister(&unix_dgram_proto);
3702                 goto out;
3703         }
3704
3705         sock_register(&unix_family_ops);
3706         register_pernet_subsys(&unix_net_ops);
3707         unix_bpf_build_proto();
3708
3709 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3710         bpf_iter_register();
3711 #endif
3712
3713 out:
3714         return rc;
3715 }
3716
3717 /* Later than subsys_initcall() because we depend on stuff initialised there */
3718 fs_initcall(af_unix_init);