ipc/mqueue.c

   1 /*
   2  * POSIX message queues filesystem for Linux.
   3  *
   4  * Copyright (C) 2003,2004  Krzysztof Benedyczak    (golbi@mat.uni.torun.pl)
   5  *                          Michal Wronski          (michal.wronski@gmail.com)
   6  *
   7  * Spinlocks:               Mohamed Abbas           (abbas.mohamed@intel.com)
   8  * Lockless receive & send, fd based notify:
   9  *                          Manfred Spraul          (manfred@colorfullife.com)
  10  *
  11  * Audit:                   George Wilson           (ltcgcw@us.ibm.com)
  12  *
  13  * This file is released under the GPL.
  14  */
  15
  16 #include <linux/capability.h>
  17 #include <linux/init.h>
  18 #include <linux/pagemap.h>
  19 #include <linux/file.h>
  20 #include <linux/mount.h>
  21 #include <linux/fs_context.h>
  22 #include <linux/namei.h>
  23 #include <linux/sysctl.h>
  24 #include <linux/poll.h>
  25 #include <linux/mqueue.h>
  26 #include <linux/msg.h>
  27 #include <linux/skbuff.h>
  28 #include <linux/vmalloc.h>
  29 #include <linux/netlink.h>
  30 #include <linux/syscalls.h>
  31 #include <linux/audit.h>
  32 #include <linux/signal.h>
  33 #include <linux/mutex.h>
  34 #include <linux/nsproxy.h>
  35 #include <linux/pid.h>
  36 #include <linux/ipc_namespace.h>
  37 #include <linux/user_namespace.h>
  38 #include <linux/slab.h>
  39 #include <linux/sched/wake_q.h>
  40 #include <linux/sched/signal.h>
  41 #include <linux/sched/user.h>
  42
  43 #include <net/sock.h>
  44 #include "util.h"
  45
  46 struct mqueue_fs_context {
  47         struct ipc_namespace    *ipc_ns;
  48 };
  49
  50 #define MQUEUE_MAGIC    0x19800202
  51 #define DIRENT_SIZE     20
  52 #define FILENT_SIZE     80
  53
  54 #define SEND            0
  55 #define RECV            1
  56
  57 #define STATE_NONE      0
  58 #define STATE_READY     1
  59
  60 struct posix_msg_tree_node {
  61         struct rb_node          rb_node;
  62         struct list_head        msg_list;
  63         int                     priority;
  64 };
  65
  66 struct ext_wait_queue {         /* queue of sleeping tasks */
  67         struct task_struct *task;
  68         struct list_head list;
  69         struct msg_msg *msg;    /* ptr of loaded message */
  70         int state;              /* one of STATE_* values */
  71 };
  72
  73 struct mqueue_inode_info {
  74         spinlock_t lock;
  75         struct inode vfs_inode;
  76         wait_queue_head_t wait_q;
  77
  78         struct rb_root msg_tree;
  79         struct posix_msg_tree_node *node_cache;
  80         struct mq_attr attr;
  81
  82         struct sigevent notify;
  83         struct pid *notify_owner;
  84         struct user_namespace *notify_user_ns;
  85         struct user_struct *user;       /* user who created, for accounting */
  86         struct sock *notify_sock;
  87         struct sk_buff *notify_cookie;
  88
  89         /* for tasks waiting for free space and messages, respectively */
  90         struct ext_wait_queue e_wait_q[2];
  91
  92         unsigned long qsize; /* size of queue in memory (sum of all msgs) */
  93 };
  94
  95 static struct file_system_type mqueue_fs_type;
  96 static const struct inode_operations mqueue_dir_inode_operations;
  97 static const struct file_operations mqueue_file_operations;
  98 static const struct super_operations mqueue_super_ops;
  99 static const struct fs_context_operations mqueue_fs_context_ops;
 100 static void remove_notification(struct mqueue_inode_info *info);
 101
 102 static struct kmem_cache *mqueue_inode_cachep;
 103
 104 static struct ctl_table_header *mq_sysctl_table;
 105
 106 static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
 107 {
 108         return container_of(inode, struct mqueue_inode_info, vfs_inode);
 109 }
 110
 111 /*
 112  * This routine should be called with the mq_lock held.
 113  */
 114 static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
 115 {
 116         return get_ipc_ns(inode->i_sb->s_fs_info);
 117 }
 118
 119 static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
 120 {
 121         struct ipc_namespace *ns;
 122
 123         spin_lock(&mq_lock);
 124         ns = __get_ns_from_inode(inode);
 125         spin_unlock(&mq_lock);
 126         return ns;
 127 }
 128
 129 /* Auxiliary functions to manipulate messages' list */
 130 static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
 131 {
 132         struct rb_node **p, *parent = NULL;
 133         struct posix_msg_tree_node *leaf;
 134
 135         p = &info->msg_tree.rb_node;
 136         while (*p) {
 137                 parent = *p;
 138                 leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
 139
 140                 if (likely(leaf->priority == msg->m_type))
 141                         goto insert_msg;
 142                 else if (msg->m_type < leaf->priority)
 143                         p = &(*p)->rb_left;
 144                 else
 145                         p = &(*p)->rb_right;
 146         }
 147         if (info->node_cache) {
 148                 leaf = info->node_cache;
 149                 info->node_cache = NULL;
 150         } else {
 151                 leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC);
 152                 if (!leaf)
 153                         return -ENOMEM;
 154                 INIT_LIST_HEAD(&leaf->msg_list);
 155         }
 156         leaf->priority = msg->m_type;
 157         rb_link_node(&leaf->rb_node, parent, p);
 158         rb_insert_color(&leaf->rb_node, &info->msg_tree);
 159 insert_msg:
 160         info->attr.mq_curmsgs++;
 161         info->qsize += msg->m_ts;
 162         list_add_tail(&msg->m_list, &leaf->msg_list);
 163         return 0;
 164 }
 165
 166 static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
 167 {
 168         struct rb_node **p, *parent = NULL;
 169         struct posix_msg_tree_node *leaf;
 170         struct msg_msg *msg;
 171
 172 try_again:
 173         p = &info->msg_tree.rb_node;
 174         while (*p) {
 175                 parent = *p;
 176                 /*
 177                  * During insert, low priorities go to the left and high to the
 178                  * right.  On receive, we want the highest priorities first, so
 179                  * walk all the way to the right.
 180                  */
 181                 p = &(*p)->rb_right;
 182         }
 183         if (!parent) {
 184                 if (info->attr.mq_curmsgs) {
 185                         pr_warn_once("Inconsistency in POSIX message queue, "
 186                                      "no tree element, but supposedly messages "
 187                                      "should exist!\n");
 188                         info->attr.mq_curmsgs = 0;
 189                 }
 190                 return NULL;
 191         }
 192         leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
 193         if (unlikely(list_empty(&leaf->msg_list))) {
 194                 pr_warn_once("Inconsistency in POSIX message queue, "
 195                              "empty leaf node but we haven't implemented "
 196                              "lazy leaf delete!\n");
 197                 rb_erase(&leaf->rb_node, &info->msg_tree);
 198                 if (info->node_cache) {
 199                         kfree(leaf);
 200                 } else {
 201                         info->node_cache = leaf;
 202                 }
 203                 goto try_again;
 204         } else {
 205                 msg = list_first_entry(&leaf->msg_list,
 206                                        struct msg_msg, m_list);
 207                 list_del(&msg->m_list);
 208                 if (list_empty(&leaf->msg_list)) {
 209                         rb_erase(&leaf->rb_node, &info->msg_tree);
 210                         if (info->node_cache) {
 211                                 kfree(leaf);
 212                         } else {
 213                                 info->node_cache = leaf;
 214                         }
 215                 }
 216         }
 217         info->attr.mq_curmsgs--;
 218         info->qsize -= msg->m_ts;
 219         return msg;
 220 }
 221
 222 static struct inode *mqueue_get_inode(struct super_block *sb,
 223                 struct ipc_namespace *ipc_ns, umode_t mode,
 224                 struct mq_attr *attr)
 225 {
 226         struct user_struct *u = current_user();
 227         struct inode *inode;
 228         int ret = -ENOMEM;
 229
 230         inode = new_inode(sb);
 231         if (!inode)
 232                 goto err;
 233
 234         inode->i_ino = get_next_ino();
 235         inode->i_mode = mode;
 236         inode->i_uid = current_fsuid();
 237         inode->i_gid = current_fsgid();
 238         inode->i_mtime = inode->i_ctime = inode->i_atime = current_time(inode);
 239
 240         if (S_ISREG(mode)) {
 241                 struct mqueue_inode_info *info;
 242                 unsigned long mq_bytes, mq_treesize;
 243
 244                 inode->i_fop = &mqueue_file_operations;
 245                 inode->i_size = FILENT_SIZE;
 246                 /* mqueue specific info */
 247                 info = MQUEUE_I(inode);
 248                 spin_lock_init(&info->lock);
 249                 init_waitqueue_head(&info->wait_q);
 250                 INIT_LIST_HEAD(&info->e_wait_q[0].list);
 251                 INIT_LIST_HEAD(&info->e_wait_q[1].list);
 252                 info->notify_owner = NULL;
 253                 info->notify_user_ns = NULL;
 254                 info->qsize = 0;
 255                 info->user = NULL;      /* set when all is ok */
 256                 info->msg_tree = RB_ROOT;
 257                 info->node_cache = NULL;
 258                 memset(&info->attr, 0, sizeof(info->attr));
 259                 info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
 260                                            ipc_ns->mq_msg_default);
 261                 info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
 262                                             ipc_ns->mq_msgsize_default);
 263                 if (attr) {
 264                         info->attr.mq_maxmsg = attr->mq_maxmsg;
 265                         info->attr.mq_msgsize = attr->mq_msgsize;
 266                 }
 267                 /*
 268                  * We used to allocate a static array of pointers and account
 269                  * the size of that array as well as one msg_msg struct per
 270                  * possible message into the queue size. That's no longer
 271                  * accurate as the queue is now an rbtree and will grow and
 272                  * shrink depending on usage patterns.  We can, however, still
 273                  * account one msg_msg struct per message, but the nodes are
 274                  * allocated depending on priority usage, and most programs
 275                  * only use one, or a handful, of priorities.  However, since
 276                  * this is pinned memory, we need to assume worst case, so
 277                  * that means the min(mq_maxmsg, max_priorities) * struct
 278                  * posix_msg_tree_node.
 279                  */
 280
 281                 ret = -EINVAL;
 282                 if (info->attr.mq_maxmsg <= 0 || info->attr.mq_msgsize <= 0)
 283                         goto out_inode;
 284                 if (capable(CAP_SYS_RESOURCE)) {
 285                         if (info->attr.mq_maxmsg > HARD_MSGMAX ||
 286                             info->attr.mq_msgsize > HARD_MSGSIZEMAX)
 287                                 goto out_inode;
 288                 } else {
 289                         if (info->attr.mq_maxmsg > ipc_ns->mq_msg_max ||
 290                                         info->attr.mq_msgsize > ipc_ns->mq_msgsize_max)
 291                                 goto out_inode;
 292                 }
 293                 ret = -EOVERFLOW;
 294                 /* check for overflow */
 295                 if (info->attr.mq_msgsize > ULONG_MAX/info->attr.mq_maxmsg)
 296                         goto out_inode;
 297                 mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
 298                         min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
 299                         sizeof(struct posix_msg_tree_node);
 300                 mq_bytes = info->attr.mq_maxmsg * info->attr.mq_msgsize;
 301                 if (mq_bytes + mq_treesize < mq_bytes)
 302                         goto out_inode;
 303                 mq_bytes += mq_treesize;
 304                 spin_lock(&mq_lock);
 305                 if (u->mq_bytes + mq_bytes < u->mq_bytes ||
 306                     u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) {
 307                         spin_unlock(&mq_lock);
 308                         /* mqueue_evict_inode() releases info->messages */
 309                         ret = -EMFILE;
 310                         goto out_inode;
 311                 }
 312                 u->mq_bytes += mq_bytes;
 313                 spin_unlock(&mq_lock);
 314
 315                 /* all is ok */
 316                 info->user = get_uid(u);
 317         } else if (S_ISDIR(mode)) {
 318                 inc_nlink(inode);
 319                 /* Some things misbehave if size == 0 on a directory */
 320                 inode->i_size = 2 * DIRENT_SIZE;
 321                 inode->i_op = &mqueue_dir_inode_operations;
 322                 inode->i_fop = &simple_dir_operations;
 323         }
 324
 325         return inode;
 326 out_inode:
 327         iput(inode);
 328 err:
 329         return ERR_PTR(ret);
 330 }
 331
 332 static int mqueue_fill_super(struct super_block *sb, struct fs_context *fc)
 333 {
 334         struct inode *inode;
 335         struct ipc_namespace *ns = sb->s_fs_info;
 336
 337         sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
 338         sb->s_blocksize = PAGE_SIZE;
 339         sb->s_blocksize_bits = PAGE_SHIFT;
 340         sb->s_magic = MQUEUE_MAGIC;
 341         sb->s_op = &mqueue_super_ops;
 342
 343         inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);
 344         if (IS_ERR(inode))
 345                 return PTR_ERR(inode);
 346
 347         sb->s_root = d_make_root(inode);
 348         if (!sb->s_root)
 349                 return -ENOMEM;
 350         return 0;
 351 }
 352
 353 static int mqueue_get_tree(struct fs_context *fc)
 354 {
 355         struct mqueue_fs_context *ctx = fc->fs_private;
 356
 357         put_user_ns(fc->user_ns);
 358         fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns);
 359         fc->s_fs_info = ctx->ipc_ns;
 360         return vfs_get_super(fc, vfs_get_keyed_super, mqueue_fill_super);
 361 }
 362
 363 static void mqueue_fs_context_free(struct fs_context *fc)
 364 {
 365         struct mqueue_fs_context *ctx = fc->fs_private;
 366
 367         if (ctx->ipc_ns)
 368                 put_ipc_ns(ctx->ipc_ns);
 369         kfree(ctx);
 370 }
 371
 372 static int mqueue_init_fs_context(struct fs_context *fc)
 373 {
 374         struct mqueue_fs_context *ctx;
 375
 376         ctx = kzalloc(sizeof(struct mqueue_fs_context), GFP_KERNEL);
 377         if (!ctx)
 378                 return -ENOMEM;
 379
 380         ctx->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns);
 381         fc->fs_private = ctx;
 382         fc->ops = &mqueue_fs_context_ops;
 383         return 0;
 384 }
 385
 386 static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)
 387 {
 388         struct mqueue_fs_context *ctx;
 389         struct fs_context *fc;
 390         struct vfsmount *mnt;
 391
 392         fc = fs_context_for_mount(&mqueue_fs_type, SB_KERNMOUNT);
 393         if (IS_ERR(fc))
 394                 return ERR_CAST(fc);
 395
 396         ctx = fc->fs_private;
 397         put_ipc_ns(ctx->ipc_ns);
 398         ctx->ipc_ns = get_ipc_ns(ns);
 399
 400         mnt = fc_mount(fc);
 401         put_fs_context(fc);
 402         return mnt;
 403 }
 404
 405 static void init_once(void *foo)
 406 {
 407         struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
 408
 409         inode_init_once(&p->vfs_inode);
 410 }
 411
 412 static struct inode *mqueue_alloc_inode(struct super_block *sb)
 413 {
 414         struct mqueue_inode_info *ei;
 415
 416         ei = kmem_cache_alloc(mqueue_inode_cachep, GFP_KERNEL);
 417         if (!ei)
 418                 return NULL;
 419         return &ei->vfs_inode;
 420 }
 421
 422 static void mqueue_i_callback(struct rcu_head *head)
 423 {
 424         struct inode *inode = container_of(head, struct inode, i_rcu);
 425         kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
 426 }
 427
 428 static void mqueue_destroy_inode(struct inode *inode)
 429 {
 430         call_rcu(&inode->i_rcu, mqueue_i_callback);
 431 }
 432
 433 static void mqueue_evict_inode(struct inode *inode)
 434 {
 435         struct mqueue_inode_info *info;
 436         struct user_struct *user;
 437         unsigned long mq_bytes, mq_treesize;
 438         struct ipc_namespace *ipc_ns;
 439         struct msg_msg *msg, *nmsg;
 440         LIST_HEAD(tmp_msg);
 441
 442         clear_inode(inode);
 443
 444         if (S_ISDIR(inode->i_mode))
 445                 return;
 446
 447         ipc_ns = get_ns_from_inode(inode);
 448         info = MQUEUE_I(inode);
 449         spin_lock(&info->lock);
 450         while ((msg = msg_get(info)) != NULL)
 451                 list_add_tail(&msg->m_list, &tmp_msg);
 452         kfree(info->node_cache);
 453         spin_unlock(&info->lock);
 454
 455         list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
 456                 list_del(&msg->m_list);
 457                 free_msg(msg);
 458         }
 459
 460         /* Total amount of bytes accounted for the mqueue */
 461         mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
 462                 min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
 463                 sizeof(struct posix_msg_tree_node);
 464
 465         mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
 466                                   info->attr.mq_msgsize);
 467
 468         user = info->user;
 469         if (user) {
 470                 spin_lock(&mq_lock);
 471                 user->mq_bytes -= mq_bytes;
 472                 /*
 473                  * get_ns_from_inode() ensures that the
 474                  * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
 475                  * to which we now hold a reference, or it is NULL.
 476                  * We can't put it here under mq_lock, though.
 477                  */
 478                 if (ipc_ns)
 479                         ipc_ns->mq_queues_count--;
 480                 spin_unlock(&mq_lock);
 481                 free_uid(user);
 482         }
 483         if (ipc_ns)
 484                 put_ipc_ns(ipc_ns);
 485 }
 486
 487 static int mqueue_create_attr(struct dentry *dentry, umode_t mode, void *arg)
 488 {
 489         struct inode *dir = dentry->d_parent->d_inode;
 490         struct inode *inode;
 491         struct mq_attr *attr = arg;
 492         int error;
 493         struct ipc_namespace *ipc_ns;
 494
 495         spin_lock(&mq_lock);
 496         ipc_ns = __get_ns_from_inode(dir);
 497         if (!ipc_ns) {
 498                 error = -EACCES;
 499                 goto out_unlock;
 500         }
 501
 502         if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
 503             !capable(CAP_SYS_RESOURCE)) {
 504                 error = -ENOSPC;
 505                 goto out_unlock;
 506         }
 507         ipc_ns->mq_queues_count++;
 508         spin_unlock(&mq_lock);
 509
 510         inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
 511         if (IS_ERR(inode)) {
 512                 error = PTR_ERR(inode);
 513                 spin_lock(&mq_lock);
 514                 ipc_ns->mq_queues_count--;
 515                 goto out_unlock;
 516         }
 517
 518         put_ipc_ns(ipc_ns);
 519         dir->i_size += DIRENT_SIZE;
 520         dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir);
 521
 522         d_instantiate(dentry, inode);
 523         dget(dentry);
 524         return 0;
 525 out_unlock:
 526         spin_unlock(&mq_lock);
 527         if (ipc_ns)
 528                 put_ipc_ns(ipc_ns);
 529         return error;
 530 }
 531
 532 static int mqueue_create(struct inode *dir, struct dentry *dentry,
 533                                 umode_t mode, bool excl)
 534 {
 535         return mqueue_create_attr(dentry, mode, NULL);
 536 }
 537
 538 static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
 539 {
 540         struct inode *inode = d_inode(dentry);
 541
 542         dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir);
 543         dir->i_size -= DIRENT_SIZE;
 544         drop_nlink(inode);
 545         dput(dentry);
 546         return 0;
 547 }
 548
 549 /*
 550 *       This is routine for system read from queue file.
 551 *       To avoid mess with doing here some sort of mq_receive we allow
 552 *       to read only queue size & notification info (the only values
 553 *       that are interesting from user point of view and aren't accessible
 554 *       through std routines)
 555 */
 556 static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
 557                                 size_t count, loff_t *off)
 558 {
 559         struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
 560         char buffer[FILENT_SIZE];
 561         ssize_t ret;
 562
 563         spin_lock(&info->lock);
 564         snprintf(buffer, sizeof(buffer),
 565                         "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
 566                         info->qsize,
 567                         info->notify_owner ? info->notify.sigev_notify : 0,
 568                         (info->notify_owner &&
 569                          info->notify.sigev_notify == SIGEV_SIGNAL) ?
 570                                 info->notify.sigev_signo : 0,
 571                         pid_vnr(info->notify_owner));
 572         spin_unlock(&info->lock);
 573         buffer[sizeof(buffer)-1] = '\0';
 574
 575         ret = simple_read_from_buffer(u_data, count, off, buffer,
 576                                 strlen(buffer));
 577         if (ret <= 0)
 578                 return ret;
 579
 580         file_inode(filp)->i_atime = file_inode(filp)->i_ctime = current_time(file_inode(filp));
 581         return ret;
 582 }
 583
 584 static int mqueue_flush_file(struct file *filp, fl_owner_t id)
 585 {
 586         struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
 587
 588         spin_lock(&info->lock);
 589         if (task_tgid(current) == info->notify_owner)
 590                 remove_notification(info);
 591
 592         spin_unlock(&info->lock);
 593         return 0;
 594 }
 595
 596 static __poll_t mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab)
 597 {
 598         struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
 599         __poll_t retval = 0;
 600
 601         poll_wait(filp, &info->wait_q, poll_tab);
 602
 603         spin_lock(&info->lock);
 604         if (info->attr.mq_curmsgs)
 605                 retval = EPOLLIN | EPOLLRDNORM;
 606
 607         if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
 608                 retval |= EPOLLOUT | EPOLLWRNORM;
 609         spin_unlock(&info->lock);
 610
 611         return retval;
 612 }
 613
 614 /* Adds current to info->e_wait_q[sr] before element with smaller prio */
 615 static void wq_add(struct mqueue_inode_info *info, int sr,
 616                         struct ext_wait_queue *ewp)
 617 {
 618         struct ext_wait_queue *walk;
 619
 620         ewp->task = current;
 621
 622         list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
 623                 if (walk->task->prio <= current->prio) {
 624                         list_add_tail(&ewp->list, &walk->list);
 625                         return;
 626                 }
 627         }
 628         list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
 629 }
 630
 631 /*
 632  * Puts current task to sleep. Caller must hold queue lock. After return
 633  * lock isn't held.
 634  * sr: SEND or RECV
 635  */
 636 static int wq_sleep(struct mqueue_inode_info *info, int sr,
 637                     ktime_t *timeout, struct ext_wait_queue *ewp)
 638         __releases(&info->lock)
 639 {
 640         int retval;
 641         signed long time;
 642
 643         wq_add(info, sr, ewp);
 644
 645         for (;;) {
 646                 __set_current_state(TASK_INTERRUPTIBLE);
 647
 648                 spin_unlock(&info->lock);
 649                 time = schedule_hrtimeout_range_clock(timeout, 0,
 650                         HRTIMER_MODE_ABS, CLOCK_REALTIME);
 651
 652                 if (ewp->state == STATE_READY) {
 653                         retval = 0;
 654                         goto out;
 655                 }
 656                 spin_lock(&info->lock);
 657                 if (ewp->state == STATE_READY) {
 658                         retval = 0;
 659                         goto out_unlock;
 660                 }
 661                 if (signal_pending(current)) {
 662                         retval = -ERESTARTSYS;
 663                         break;
 664                 }
 665                 if (time == 0) {
 666                         retval = -ETIMEDOUT;
 667                         break;
 668                 }
 669         }
 670         list_del(&ewp->list);
 671 out_unlock:
 672         spin_unlock(&info->lock);
 673 out:
 674         return retval;
 675 }
 676
 677 /*
 678  * Returns waiting task that should be serviced first or NULL if none exists
 679  */
 680 static struct ext_wait_queue *wq_get_first_waiter(
 681                 struct mqueue_inode_info *info, int sr)
 682 {
 683         struct list_head *ptr;
 684
 685         ptr = info->e_wait_q[sr].list.prev;
 686         if (ptr == &info->e_wait_q[sr].list)
 687                 return NULL;
 688         return list_entry(ptr, struct ext_wait_queue, list);
 689 }
 690
 691
 692 static inline void set_cookie(struct sk_buff *skb, char code)
 693 {
 694         ((char *)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
 695 }
 696
 697 /*
 698  * The next function is only to split too long sys_mq_timedsend
 699  */
 700 static void __do_notify(struct mqueue_inode_info *info)
 701 {
 702         /* notification
 703          * invoked when there is registered process and there isn't process
 704          * waiting synchronously for message AND state of queue changed from
 705          * empty to not empty. Here we are sure that no one is waiting
 706          * synchronously. */
 707         if (info->notify_owner &&
 708             info->attr.mq_curmsgs == 1) {
 709                 struct kernel_siginfo sig_i;
 710                 switch (info->notify.sigev_notify) {
 711                 case SIGEV_NONE:
 712                         break;
 713                 case SIGEV_SIGNAL:
 714                         /* sends signal */
 715
 716                         clear_siginfo(&sig_i);
 717                         sig_i.si_signo = info->notify.sigev_signo;
 718                         sig_i.si_errno = 0;
 719                         sig_i.si_code = SI_MESGQ;
 720                         sig_i.si_value = info->notify.sigev_value;
 721                         /* map current pid/uid into info->owner's namespaces */
 722                         rcu_read_lock();
 723                         sig_i.si_pid = task_tgid_nr_ns(current,
 724                                                 ns_of_pid(info->notify_owner));
 725                         sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid());
 726                         rcu_read_unlock();
 727
 728                         kill_pid_info(info->notify.sigev_signo,
 729                                       &sig_i, info->notify_owner);
 730                         break;
 731                 case SIGEV_THREAD:
 732                         set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
 733                         netlink_sendskb(info->notify_sock, info->notify_cookie);
 734                         break;
 735                 }
 736                 /* after notification unregisters process */
 737                 put_pid(info->notify_owner);
 738                 put_user_ns(info->notify_user_ns);
 739                 info->notify_owner = NULL;
 740                 info->notify_user_ns = NULL;
 741         }
 742         wake_up(&info->wait_q);
 743 }
 744
 745 static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout,
 746                            struct timespec64 *ts)
 747 {
 748         if (get_timespec64(ts, u_abs_timeout))
 749                 return -EFAULT;
 750         if (!timespec64_valid(ts))
 751                 return -EINVAL;
 752         return 0;
 753 }
 754
 755 static void remove_notification(struct mqueue_inode_info *info)
 756 {
 757         if (info->notify_owner != NULL &&
 758             info->notify.sigev_notify == SIGEV_THREAD) {
 759                 set_cookie(info->notify_cookie, NOTIFY_REMOVED);
 760                 netlink_sendskb(info->notify_sock, info->notify_cookie);
 761         }
 762         put_pid(info->notify_owner);
 763         put_user_ns(info->notify_user_ns);
 764         info->notify_owner = NULL;
 765         info->notify_user_ns = NULL;
 766 }
 767
 768 static int prepare_open(struct dentry *dentry, int oflag, int ro,
 769                         umode_t mode, struct filename *name,
 770                         struct mq_attr *attr)
 771 {
 772         static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
 773                                                   MAY_READ | MAY_WRITE };
 774         int acc;
 775
 776         if (d_really_is_negative(dentry)) {
 777                 if (!(oflag & O_CREAT))
 778                         return -ENOENT;
 779                 if (ro)
 780                         return ro;
 781                 audit_inode_parent_hidden(name, dentry->d_parent);
 782                 return vfs_mkobj(dentry, mode & ~current_umask(),
 783                                   mqueue_create_attr, attr);
 784         }
 785         /* it already existed */
 786         audit_inode(name, dentry, 0);
 787         if ((oflag & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
 788                 return -EEXIST;
 789         if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
 790                 return -EINVAL;
 791         acc = oflag2acc[oflag & O_ACCMODE];
 792         return inode_permission(d_inode(dentry), acc);
 793 }
 794
 795 static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
 796                       struct mq_attr *attr)
 797 {
 798         struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt;
 799         struct dentry *root = mnt->mnt_root;
 800         struct filename *name;
 801         struct path path;
 802         int fd, error;
 803         int ro;
 804
 805         audit_mq_open(oflag, mode, attr);
 806
 807         if (IS_ERR(name = getname(u_name)))
 808                 return PTR_ERR(name);
 809
 810         fd = get_unused_fd_flags(O_CLOEXEC);
 811         if (fd < 0)
 812                 goto out_putname;
 813
 814         ro = mnt_want_write(mnt);       /* we'll drop it in any case */
 815         inode_lock(d_inode(root));
 816         path.dentry = lookup_one_len(name->name, root, strlen(name->name));
 817         if (IS_ERR(path.dentry)) {
 818                 error = PTR_ERR(path.dentry);
 819                 goto out_putfd;
 820         }
 821         path.mnt = mntget(mnt);
 822         error = prepare_open(path.dentry, oflag, ro, mode, name, attr);
 823         if (!error) {
 824                 struct file *file = dentry_open(&path, oflag, current_cred());
 825                 if (!IS_ERR(file))
 826                         fd_install(fd, file);
 827                 else
 828                         error = PTR_ERR(file);
 829         }
 830         path_put(&path);
 831 out_putfd:
 832         if (error) {
 833                 put_unused_fd(fd);
 834                 fd = error;
 835         }
 836         inode_unlock(d_inode(root));
 837         if (!ro)
 838                 mnt_drop_write(mnt);
 839 out_putname:
 840         putname(name);
 841         return fd;
 842 }
 843
 844 SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
 845                 struct mq_attr __user *, u_attr)
 846 {
 847         struct mq_attr attr;
 848         if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
 849                 return -EFAULT;
 850
 851         return do_mq_open(u_name, oflag, mode, u_attr ? &attr : NULL);
 852 }
 853
 854 SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
 855 {
 856         int err;
 857         struct filename *name;
 858         struct dentry *dentry;
 859         struct inode *inode = NULL;
 860         struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
 861         struct vfsmount *mnt = ipc_ns->mq_mnt;
 862
 863         name = getname(u_name);
 864         if (IS_ERR(name))
 865                 return PTR_ERR(name);
 866
 867         audit_inode_parent_hidden(name, mnt->mnt_root);
 868         err = mnt_want_write(mnt);
 869         if (err)
 870                 goto out_name;
 871         inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);
 872         dentry = lookup_one_len(name->name, mnt->mnt_root,
 873                                 strlen(name->name));
 874         if (IS_ERR(dentry)) {
 875                 err = PTR_ERR(dentry);
 876                 goto out_unlock;
 877         }
 878
 879         inode = d_inode(dentry);
 880         if (!inode) {
 881                 err = -ENOENT;
 882         } else {
 883                 ihold(inode);
 884                 err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL);
 885         }
 886         dput(dentry);
 887
 888 out_unlock:
 889         inode_unlock(d_inode(mnt->mnt_root));
 890         if (inode)
 891                 iput(inode);
 892         mnt_drop_write(mnt);
 893 out_name:
 894         putname(name);
 895
 896         return err;
 897 }
 898
 899 /* Pipelined send and receive functions.
 900  *
 901  * If a receiver finds no waiting message, then it registers itself in the
 902  * list of waiting receivers. A sender checks that list before adding the new
 903  * message into the message array. If there is a waiting receiver, then it
 904  * bypasses the message array and directly hands the message over to the
 905  * receiver. The receiver accepts the message and returns without grabbing the
 906  * queue spinlock:
 907  *
 908  * - Set pointer to message.
 909  * - Queue the receiver task for later wakeup (without the info->lock).
 910  * - Update its state to STATE_READY. Now the receiver can continue.
 911  * - Wake up the process after the lock is dropped. Should the process wake up
 912  *   before this wakeup (due to a timeout or a signal) it will either see
 913  *   STATE_READY and continue or acquire the lock to check the state again.
 914  *
 915  * The same algorithm is used for senders.
 916  */
 917
/* pipelined_send() - send a message directly to the task waiting in
 * sys_mq_timedreceive() (without inserting message into a queue).
 *
 * @wake_q:   wake queue the receiver's task is added to; the caller issues
 *            the actual wakeup later
 * @info:     queue the receiver is waiting on (not referenced in the body)
 * @message:  message handed over to the receiver
 * @receiver: waiter entry that gets the message
 *
 * The only caller in this part of the file, do_mq_timedsend(), invokes this
 * with info->lock held and calls wake_up_q() after dropping the lock.
 */
static inline void pipelined_send(struct wake_q_head *wake_q,
                                  struct mqueue_inode_info *info,
                                  struct msg_msg *message,
                                  struct ext_wait_queue *receiver)
{
        /* Publish the message and unhook the waiter before it may run. */
        receiver->msg = message;
        list_del(&receiver->list);
        wake_q_add(wake_q, receiver->task);
        /*
         * Rely on the implicit cmpxchg barrier from wake_q_add such
         * that we can ensure that updating receiver->state is the last
         * write operation: As once set, the receiver can continue,
         * and if we don't have the reference count from the wake_q,
         * yet, at that point we can later have a use-after-free
         * condition and bogus wakeup.
         */
        receiver->state = STATE_READY;
}
 939
/* pipelined_receive() - if a task is waiting in sys_mq_timedsend(), take
 * its message and put it into the queue (we have one free place for sure).
 *
 * @wake_q: wake queue the sender's task is added to; the caller issues the
 *          actual wakeup later
 * @info:   queue that has just gained a free slot
 *
 * With no waiting sender, only the poll waiters on info->wait_q are woken.
 * If msg_insert() fails the sender is left on the wait list untouched.
 */
static inline void pipelined_receive(struct wake_q_head *wake_q,
                                     struct mqueue_inode_info *info)
{
        struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);

        if (!sender) {
                /* for poll */
                wake_up_interruptible(&info->wait_q);
                return;
        }
        if (msg_insert(sender->msg, info))
                return;

        /*
         * Same sequence as pipelined_send(): unhook the waiter, queue its
         * task for wakeup, and only then mark it ready.
         */
        list_del(&sender->list);
        wake_q_add(wake_q, sender->task);
        sender->state = STATE_READY;
}
 959
/*
 * do_mq_timedsend() - common backend of the mq_timedsend() syscall variants.
 * @mqdes:     descriptor of the queue to send to
 * @u_msg_ptr: user-space buffer holding the payload
 * @msg_len:   payload length; must not exceed the queue's mq_msgsize
 * @msg_prio:  message priority; must be below MQ_PRIO_MAX
 * @ts:        kernel copy of the timeout, or NULL; when set it is converted
 *             to ktime and passed to wq_sleep()
 *
 * The message is copied in before the queue lock is taken. If the queue is
 * full the caller either gets -EAGAIN (O_NONBLOCK) or sleeps in wq_sleep().
 * Otherwise the message goes straight to a waiting receiver, or is inserted
 * into the queue and __do_notify() is called.
 *
 * Returns 0 on success or a negative errno (-EINVAL, -EBADF, -EMSGSIZE,
 * -EAGAIN, or whatever load_msg(), wq_sleep() or msg_insert() report).
 * On any failure the loaded message is freed here.
 */
static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
                size_t msg_len, unsigned int msg_prio,
                struct timespec64 *ts)
{
        struct fd f;
        struct inode *inode;
        struct ext_wait_queue wait;
        struct ext_wait_queue *receiver;
        struct msg_msg *msg_ptr;
        struct mqueue_inode_info *info;
        ktime_t expires, *timeout = NULL;
        struct posix_msg_tree_node *new_leaf = NULL;
        int ret = 0;
        DEFINE_WAKE_Q(wake_q);

        if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
                return -EINVAL;

        if (ts) {
                expires = timespec64_to_ktime(*ts);
                timeout = &expires;
        }

        audit_mq_sendrecv(mqdes, msg_len, msg_prio, ts);

        f = fdget(mqdes);
        if (unlikely(!f.file)) {
                ret = -EBADF;
                goto out;
        }

        inode = file_inode(f.file);
        /* Reject descriptors that are not message queues. */
        if (unlikely(f.file->f_op != &mqueue_file_operations)) {
                ret = -EBADF;
                goto out_fput;
        }
        info = MQUEUE_I(inode);
        audit_file(f.file);

        if (unlikely(!(f.file->f_mode & FMODE_WRITE))) {
                ret = -EBADF;
                goto out_fput;
        }

        if (unlikely(msg_len > info->attr.mq_msgsize)) {
                ret = -EMSGSIZE;
                goto out_fput;
        }

        /* First try to allocate memory, before doing anything with
         * existing queues. */
        msg_ptr = load_msg(u_msg_ptr, msg_len);
        if (IS_ERR(msg_ptr)) {
                ret = PTR_ERR(msg_ptr);
                goto out_fput;
        }
        msg_ptr->m_ts = msg_len;
        msg_ptr->m_type = msg_prio;

        /*
         * msg_insert really wants us to have a valid, spare node struct so
         * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
         * fall back to that if necessary.
         */
        if (!info->node_cache)
                new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);

        spin_lock(&info->lock);

        /*
         * node_cache was first tested without the lock, so test it again:
         * if it is now populated (or the allocation failed) the
         * speculative node is dropped; kfree(NULL) is harmless.
         */
        if (!info->node_cache && new_leaf) {
                /* Save our speculative allocation into the cache */
                INIT_LIST_HEAD(&new_leaf->msg_list);
                info->node_cache = new_leaf;
                new_leaf = NULL;
        } else {
                kfree(new_leaf);
        }

        if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
                /* Queue is full. */
                if (f.file->f_flags & O_NONBLOCK) {
                        ret = -EAGAIN;
                } else {
                        wait.task = current;
                        wait.msg = (void *) msg_ptr;
                        wait.state = STATE_NONE;
                        ret = wq_sleep(info, SEND, timeout, &wait);
                        /*
                         * wq_sleep must be called with info->lock held, and
                         * returns with the lock released
                         */
                        goto out_free;
                }
        } else {
                receiver = wq_get_first_waiter(info, RECV);
                if (receiver) {
                        pipelined_send(&wake_q, info, msg_ptr, receiver);
                } else {
                        /* adds message to the queue */
                        ret = msg_insert(msg_ptr, info);
                        if (ret)
                                goto out_unlock;
                        __do_notify(info);
                }
                inode->i_atime = inode->i_mtime = inode->i_ctime =
                                current_time(inode);
        }
out_unlock:
        spin_unlock(&info->lock);
        /* Wake a receiver queued by pipelined_send(), now lock-free. */
        wake_up_q(&wake_q);
out_free:
        /* On success the message is no longer ours to free. */
        if (ret)
                free_msg(msg_ptr);
out_fput:
        fdput(f);
out:
        return ret;
}
1077
/*
 * do_mq_timedreceive() - common backend of the mq_timedreceive() variants.
 * @mqdes:      descriptor of the queue to read from
 * @u_msg_ptr:  user-space buffer that receives the payload
 * @msg_len:    size of that buffer; must be at least the queue's mq_msgsize
 * @u_msg_prio: optional user-space location for the message priority
 * @ts:         kernel copy of the timeout, or NULL; when set it is converted
 *              to ktime and passed to wq_sleep()
 *
 * If the queue is empty the caller either gets -EAGAIN (O_NONBLOCK) or
 * sleeps in wq_sleep() and picks the message up from wait.msg. Otherwise a
 * message is taken with msg_get() and pipelined_receive() gets the chance
 * to move a waiting sender's message into the freed slot.
 *
 * Returns the message length (m_ts) on success, or a negative errno
 * (-EBADF, -EMSGSIZE, -EAGAIN, -EFAULT, or the result of wq_sleep()).
 * A message that was dequeued is freed even if copying it out fails.
 */
static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
                size_t msg_len, unsigned int __user *u_msg_prio,
                struct timespec64 *ts)
{
        ssize_t ret;
        struct msg_msg *msg_ptr;
        struct fd f;
        struct inode *inode;
        struct mqueue_inode_info *info;
        struct ext_wait_queue wait;
        ktime_t expires, *timeout = NULL;
        struct posix_msg_tree_node *new_leaf = NULL;

        if (ts) {
                expires = timespec64_to_ktime(*ts);
                timeout = &expires;
        }

        audit_mq_sendrecv(mqdes, msg_len, 0, ts);

        f = fdget(mqdes);
        if (unlikely(!f.file)) {
                ret = -EBADF;
                goto out;
        }

        inode = file_inode(f.file);
        /* Reject descriptors that are not message queues. */
        if (unlikely(f.file->f_op != &mqueue_file_operations)) {
                ret = -EBADF;
                goto out_fput;
        }
        info = MQUEUE_I(inode);
        audit_file(f.file);

        if (unlikely(!(f.file->f_mode & FMODE_READ))) {
                ret = -EBADF;
                goto out_fput;
        }

        /* checks if buffer is big enough */
        if (unlikely(msg_len < info->attr.mq_msgsize)) {
                ret = -EMSGSIZE;
                goto out_fput;
        }

        /*
         * msg_insert really wants us to have a valid, spare node struct so
         * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
         * fall back to that if necessary.
         */
        if (!info->node_cache)
                new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);

        spin_lock(&info->lock);

        /*
         * node_cache was first tested without the lock, so test it again;
         * an unneeded speculative node is dropped (kfree(NULL) is harmless).
         */
        if (!info->node_cache && new_leaf) {
                /* Save our speculative allocation into the cache */
                INIT_LIST_HEAD(&new_leaf->msg_list);
                info->node_cache = new_leaf;
        } else {
                kfree(new_leaf);
        }

        /* Every branch below is responsible for dropping info->lock. */
        if (info->attr.mq_curmsgs == 0) {
                if (f.file->f_flags & O_NONBLOCK) {
                        spin_unlock(&info->lock);
                        ret = -EAGAIN;
                } else {
                        wait.task = current;
                        wait.state = STATE_NONE;
                        /* wq_sleep() returns with info->lock released. */
                        ret = wq_sleep(info, RECV, timeout, &wait);
                        /* Only consumed below when ret == 0. */
                        msg_ptr = wait.msg;
                }
        } else {
                DEFINE_WAKE_Q(wake_q);

                msg_ptr = msg_get(info);

                inode->i_atime = inode->i_mtime = inode->i_ctime =
                                current_time(inode);

                /* There is now free space in queue. */
                pipelined_receive(&wake_q, info);
                spin_unlock(&info->lock);
                wake_up_q(&wake_q);
                ret = 0;
        }
        if (ret == 0) {
                ret = msg_ptr->m_ts;

                /* The priority travels in m_type (set by the send path). */
                if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
                        store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
                        ret = -EFAULT;
                }
                free_msg(msg_ptr);
        }
out_fput:
        fdput(f);
out:
        return ret;
}
1179
1180 SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
1181                 size_t, msg_len, unsigned int, msg_prio,
1182                 const struct __kernel_timespec __user *, u_abs_timeout)
1183 {
1184         struct timespec64 ts, *p = NULL;
1185         if (u_abs_timeout) {
1186                 int res = prepare_timeout(u_abs_timeout, &ts);
1187                 if (res)
1188                         return res;
1189                 p = &ts;
1190         }
1191         return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p);
1192 }
1193
1194 SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
1195                 size_t, msg_len, unsigned int __user *, u_msg_prio,
1196                 const struct __kernel_timespec __user *, u_abs_timeout)
1197 {
1198         struct timespec64 ts, *p = NULL;
1199         if (u_abs_timeout) {
1200                 int res = prepare_timeout(u_abs_timeout, &ts);
1201                 if (res)
1202                         return res;
1203                 p = &ts;
1204         }
1205         return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p);
1206 }
1207
/*
 * do_mq_notify() - common backend of the native and compat mq_notify().
 * @mqdes:        descriptor of the queue
 * @notification: kernel copy of the sigevent, or NULL to deregister
 *
 * Note: a deregistration request (NULL pointer) from a caller that is not
 * the current owner of the notification is silently ignored. POSIX does
 * not explicitly define this case.
 *
 * For SIGEV_THREAD the cookie skb is allocated and attached to the netlink
 * socket found via sigev_signo before the queue is looked up; ownership of
 * both moves to the queue only if the registration succeeds.
 *
 * Returns 0 on success or a negative errno (-EINVAL, -ENOMEM, -EFAULT,
 * -EBADF, -EBUSY, or an error from the netlink helpers).
 */
static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
        int ret;
        struct fd f;
        struct sock *sock;
        struct inode *inode;
        struct mqueue_inode_info *info;
        struct sk_buff *nc;

        audit_mq_notify(mqdes, notification);

        nc = NULL;
        sock = NULL;
        if (notification != NULL) {
                if (unlikely(notification->sigev_notify != SIGEV_NONE &&
                             notification->sigev_notify != SIGEV_SIGNAL &&
                             notification->sigev_notify != SIGEV_THREAD))
                        return -EINVAL;
                if (notification->sigev_notify == SIGEV_SIGNAL &&
                        !valid_signal(notification->sigev_signo)) {
                        return -EINVAL;
                }
                if (notification->sigev_notify == SIGEV_THREAD) {
                        long timeo;

                        /* create the notify skb */
                        nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
                        if (!nc) {
                                ret = -ENOMEM;
                                goto out;
                        }
                        /* For SIGEV_THREAD sival_ptr is a user pointer to the cookie. */
                        if (copy_from_user(nc->data,
                                        notification->sigev_value.sival_ptr,
                                        NOTIFY_COOKIE_LEN)) {
                                ret = -EFAULT;
                                goto out;
                        }

                        /* TODO: add a header? */
                        skb_put(nc, NOTIFY_COOKIE_LEN);
                        /* and attach it to the socket */
retry:
                        /* For SIGEV_THREAD sigev_signo carries a file descriptor. */
                        f = fdget(notification->sigev_signo);
                        if (!f.file) {
                                ret = -EBADF;
                                goto out;
                        }
                        sock = netlink_getsockbyfilp(f.file);
                        fdput(f);
                        if (IS_ERR(sock)) {
                                ret = PTR_ERR(sock);
                                sock = NULL;
                                goto out;
                        }

                        timeo = MAX_SCHEDULE_TIMEOUT;
                        ret = netlink_attachskb(sock, nc, &timeo, NULL);
                        /*
                         * A return of 1 means "look the socket up again";
                         * sock is cleared so the cleanup at out: never
                         * sees the stale pointer.
                         */
                        if (ret == 1) {
                                sock = NULL;
                                goto retry;
                        }
                        /*
                         * NOTE(review): both pointers are dropped without a
                         * local release, so netlink_attachskb() presumably
                         * disposed of them on failure - confirm.
                         */
                        if (ret) {
                                sock = NULL;
                                nc = NULL;
                                goto out;
                        }
                }
        }

        f = fdget(mqdes);
        if (!f.file) {
                ret = -EBADF;
                goto out;
        }

        inode = file_inode(f.file);
        /* Reject descriptors that are not message queues. */
        if (unlikely(f.file->f_op != &mqueue_file_operations)) {
                ret = -EBADF;
                goto out_fput;
        }
        info = MQUEUE_I(inode);

        ret = 0;
        spin_lock(&info->lock);
        if (notification == NULL) {
                /* Deregister, but only on behalf of the current owner. */
                if (info->notify_owner == task_tgid(current)) {
                        remove_notification(info);
                        inode->i_atime = inode->i_ctime = current_time(inode);
                }
        } else if (info->notify_owner != NULL) {
                ret = -EBUSY;
        } else {
                switch (notification->sigev_notify) {
                case SIGEV_NONE:
                        info->notify.sigev_notify = SIGEV_NONE;
                        break;
                case SIGEV_THREAD:
                        /*
                         * The queue now owns the socket and the cookie;
                         * clear the locals so out: does not release them.
                         */
                        info->notify_sock = sock;
                        info->notify_cookie = nc;
                        sock = NULL;
                        nc = NULL;
                        info->notify.sigev_notify = SIGEV_THREAD;
                        break;
                case SIGEV_SIGNAL:
                        info->notify.sigev_signo = notification->sigev_signo;
                        info->notify.sigev_value = notification->sigev_value;
                        info->notify.sigev_notify = SIGEV_SIGNAL;
                        break;
                }

                info->notify_owner = get_pid(task_tgid(current));
                info->notify_user_ns = get_user_ns(current_user_ns());
                inode->i_atime = inode->i_ctime = current_time(inode);
        }
        spin_unlock(&info->lock);
out_fput:
        fdput(f);
out:
        /* Release whatever was not handed over to the queue. */
        if (sock)
                netlink_detachskb(sock, nc);
        else if (nc)
                dev_kfree_skb(nc);

        return ret;
}
1338
1339 SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
1340                 const struct sigevent __user *, u_notification)
1341 {
1342         struct sigevent n, *p = NULL;
1343         if (u_notification) {
1344                 if (copy_from_user(&n, u_notification, sizeof(struct sigevent)))
1345                         return -EFAULT;
1346                 p = &n;
1347         }
1348         return do_mq_notify(mqdes, p);
1349 }
1350
1351 static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
1352 {
1353         struct fd f;
1354         struct inode *inode;
1355         struct mqueue_inode_info *info;
1356
1357         if (new && (new->mq_flags & (~O_NONBLOCK)))
1358                 return -EINVAL;
1359
1360         f = fdget(mqdes);
1361         if (!f.file)
1362                 return -EBADF;
1363
1364         if (unlikely(f.file->f_op != &mqueue_file_operations)) {
1365                 fdput(f);
1366                 return -EBADF;
1367         }
1368
1369         inode = file_inode(f.file);
1370         info = MQUEUE_I(inode);
1371
1372         spin_lock(&info->lock);
1373
1374         if (old) {
1375                 *old = info->attr;
1376                 old->mq_flags = f.file->f_flags & O_NONBLOCK;
1377         }
1378         if (new) {
1379                 audit_mq_getsetattr(mqdes, new);
1380                 spin_lock(&f.file->f_lock);
1381                 if (new->mq_flags & O_NONBLOCK)
1382                         f.file->f_flags |= O_NONBLOCK;
1383                 else
1384                         f.file->f_flags &= ~O_NONBLOCK;
1385                 spin_unlock(&f.file->f_lock);
1386
1387                 inode->i_atime = inode->i_ctime = current_time(inode);
1388         }
1389
1390         spin_unlock(&info->lock);
1391         fdput(f);
1392         return 0;
1393 }
1394
1395 SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
1396                 const struct mq_attr __user *, u_mqstat,
1397                 struct mq_attr __user *, u_omqstat)
1398 {
1399         int ret;
1400         struct mq_attr mqstat, omqstat;
1401         struct mq_attr *new = NULL, *old = NULL;
1402
1403         if (u_mqstat) {
1404                 new = &mqstat;
1405                 if (copy_from_user(new, u_mqstat, sizeof(struct mq_attr)))
1406                         return -EFAULT;
1407         }
1408         if (u_omqstat)
1409                 old = &omqstat;
1410
1411         ret = do_mq_getsetattr(mqdes, new, old);
1412         if (ret || !old)
1413                 return ret;
1414
1415         if (copy_to_user(u_omqstat, old, sizeof(struct mq_attr)))
1416                 return -EFAULT;
1417         return 0;
1418 }
1419
1420 #ifdef CONFIG_COMPAT
1421
/*
 * Compat-ABI form of struct mq_attr, built from compat_long_t fields.
 * Converted to and from the native structure by get_compat_mq_attr()
 * and put_compat_mq_attr(); only the four named fields are carried over.
 */
struct compat_mq_attr {
        compat_long_t mq_flags;      /* message queue flags                  */
        compat_long_t mq_maxmsg;     /* maximum number of messages           */
        compat_long_t mq_msgsize;    /* maximum message size                 */
        compat_long_t mq_curmsgs;    /* number of messages currently queued  */
        compat_long_t __reserved[4]; /* ignored for input, zeroed for output */
};
1429
1430 static inline int get_compat_mq_attr(struct mq_attr *attr,
1431                         const struct compat_mq_attr __user *uattr)
1432 {
1433         struct compat_mq_attr v;
1434
1435         if (copy_from_user(&v, uattr, sizeof(*uattr)))
1436                 return -EFAULT;
1437
1438         memset(attr, 0, sizeof(*attr));
1439         attr->mq_flags = v.mq_flags;
1440         attr->mq_maxmsg = v.mq_maxmsg;
1441         attr->mq_msgsize = v.mq_msgsize;
1442         attr->mq_curmsgs = v.mq_curmsgs;
1443         return 0;
1444 }
1445
1446 static inline int put_compat_mq_attr(const struct mq_attr *attr,
1447                         struct compat_mq_attr __user *uattr)
1448 {
1449         struct compat_mq_attr v;
1450
1451         memset(&v, 0, sizeof(v));
1452         v.mq_flags = attr->mq_flags;
1453         v.mq_maxmsg = attr->mq_maxmsg;
1454         v.mq_msgsize = attr->mq_msgsize;
1455         v.mq_curmsgs = attr->mq_curmsgs;
1456         if (copy_to_user(uattr, &v, sizeof(*uattr)))
1457                 return -EFAULT;
1458         return 0;
1459 }
1460
1461 COMPAT_SYSCALL_DEFINE4(mq_open, const char __user *, u_name,
1462                        int, oflag, compat_mode_t, mode,
1463                        struct compat_mq_attr __user *, u_attr)
1464 {
1465         struct mq_attr attr, *p = NULL;
1466         if (u_attr && oflag & O_CREAT) {
1467                 p = &attr;
1468                 if (get_compat_mq_attr(&attr, u_attr))
1469                         return -EFAULT;
1470         }
1471         return do_mq_open(u_name, oflag, mode, p);
1472 }
1473
1474 COMPAT_SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
1475                        const struct compat_sigevent __user *, u_notification)
1476 {
1477         struct sigevent n, *p = NULL;
1478         if (u_notification) {
1479                 if (get_compat_sigevent(&n, u_notification))
1480                         return -EFAULT;
1481                 if (n.sigev_notify == SIGEV_THREAD)
1482                         n.sigev_value.sival_ptr = compat_ptr(n.sigev_value.sival_int);
1483                 p = &n;
1484         }
1485         return do_mq_notify(mqdes, p);
1486 }
1487
1488 COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
1489                        const struct compat_mq_attr __user *, u_mqstat,
1490                        struct compat_mq_attr __user *, u_omqstat)
1491 {
1492         int ret;
1493         struct mq_attr mqstat, omqstat;
1494         struct mq_attr *new = NULL, *old = NULL;
1495
1496         if (u_mqstat) {
1497                 new = &mqstat;
1498                 if (get_compat_mq_attr(new, u_mqstat))
1499                         return -EFAULT;
1500         }
1501         if (u_omqstat)
1502                 old = &omqstat;
1503
1504         ret = do_mq_getsetattr(mqdes, new, old);
1505         if (ret || !old)
1506                 return ret;
1507
1508         if (put_compat_mq_attr(old, u_omqstat))
1509                 return -EFAULT;
1510         return 0;
1511 }
1512 #endif
1513
1514 #ifdef CONFIG_COMPAT_32BIT_TIME
1515 static int compat_prepare_timeout(const struct old_timespec32 __user *p,
1516                                    struct timespec64 *ts)
1517 {
1518         if (get_old_timespec32(ts, p))
1519                 return -EFAULT;
1520         if (!timespec64_valid(ts))
1521                 return -EINVAL;
1522         return 0;
1523 }
1524
1525 SYSCALL_DEFINE5(mq_timedsend_time32, mqd_t, mqdes,
1526                 const char __user *, u_msg_ptr,
1527                 unsigned int, msg_len, unsigned int, msg_prio,
1528                 const struct old_timespec32 __user *, u_abs_timeout)
1529 {
1530         struct timespec64 ts, *p = NULL;
1531         if (u_abs_timeout) {
1532                 int res = compat_prepare_timeout(u_abs_timeout, &ts);
1533                 if (res)
1534                         return res;
1535                 p = &ts;
1536         }
1537         return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p);
1538 }
1539
1540 SYSCALL_DEFINE5(mq_timedreceive_time32, mqd_t, mqdes,
1541                 char __user *, u_msg_ptr,
1542                 unsigned int, msg_len, unsigned int __user *, u_msg_prio,
1543                 const struct old_timespec32 __user *, u_abs_timeout)
1544 {
1545         struct timespec64 ts, *p = NULL;
1546         if (u_abs_timeout) {
1547                 int res = compat_prepare_timeout(u_abs_timeout, &ts);
1548                 if (res)
1549                         return res;
1550                 p = &ts;
1551         }
1552         return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p);
1553 }
1554 #endif
1555
1556 static const struct inode_operations mqueue_dir_inode_operations = {
1557         .lookup = simple_lookup,
1558         .create = mqueue_create,
1559         .unlink = mqueue_unlink,
1560 };
1561
1562 static const struct file_operations mqueue_file_operations = {
1563         .flush = mqueue_flush_file,
1564         .poll = mqueue_poll_file,
1565         .read = mqueue_read_file,
1566         .llseek = default_llseek,
1567 };
1568
1569 static const struct super_operations mqueue_super_ops = {
1570         .alloc_inode = mqueue_alloc_inode,
1571         .destroy_inode = mqueue_destroy_inode,
1572         .evict_inode = mqueue_evict_inode,
1573         .statfs = simple_statfs,
1574 };
1575
1576 static const struct fs_context_operations mqueue_fs_context_ops = {
1577         .free           = mqueue_fs_context_free,
1578         .get_tree       = mqueue_get_tree,
1579 };
1580
1581 static struct file_system_type mqueue_fs_type = {
1582         .name                   = "mqueue",
1583         .init_fs_context        = mqueue_init_fs_context,
1584         .kill_sb                = kill_litter_super,
1585         .fs_flags               = FS_USERNS_MOUNT,
1586 };
1587
1588 int mq_init_ns(struct ipc_namespace *ns)
1589 {
1590         struct vfsmount *m;
1591
1592         ns->mq_queues_count  = 0;
1593         ns->mq_queues_max    = DFLT_QUEUESMAX;
1594         ns->mq_msg_max       = DFLT_MSGMAX;
1595         ns->mq_msgsize_max   = DFLT_MSGSIZEMAX;
1596         ns->mq_msg_default   = DFLT_MSG;
1597         ns->mq_msgsize_default  = DFLT_MSGSIZE;
1598
1599         m = mq_create_mount(ns);
1600         if (IS_ERR(m))
1601                 return PTR_ERR(m);
1602         ns->mq_mnt = m;
1603         return 0;
1604 }
1605
1606 void mq_clear_sbinfo(struct ipc_namespace *ns)
1607 {
1608         ns->mq_mnt->mnt_sb->s_fs_info = NULL;
1609 }
1610
1611 void mq_put_mnt(struct ipc_namespace *ns)
1612 {
1613         kern_unmount(ns->mq_mnt);
1614 }
1615
1616 static int __init init_mqueue_fs(void)
1617 {
1618         int error;
1619
1620         mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
1621                                 sizeof(struct mqueue_inode_info), 0,
1622                                 SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once);
1623         if (mqueue_inode_cachep == NULL)
1624                 return -ENOMEM;
1625
1626         /* ignore failures - they are not fatal */
1627         mq_sysctl_table = mq_register_sysctl_table();
1628
1629         error = register_filesystem(&mqueue_fs_type);
1630         if (error)
1631                 goto out_sysctl;
1632
1633         spin_lock_init(&mq_lock);
1634
1635         error = mq_init_ns(&init_ipc_ns);
1636         if (error)
1637                 goto out_filesystem;
1638
1639         return 0;
1640
1641 out_filesystem:
1642         unregister_filesystem(&mqueue_fs_type);
1643 out_sysctl:
1644         if (mq_sysctl_table)
1645                 unregister_sysctl_table(mq_sysctl_table);
1646         kmem_cache_destroy(mqueue_inode_cachep);
1647         return error;
1648 }
1649
1650 device_initcall(init_mqueue_fs);