git.ipfire.org Git - thirdparty/linux.git/blob

1 /*

2 * POSIX message queues filesystem for Linux.

3 *

5 * Michal Wronski (michal.wronski@gmail.com)

6 *

7 * Spinlocks: Mohamed Abbas (abbas.mohamed@intel.com)

8 * Lockless receive & send, fd based notify:

9 * Manfred Spraul (manfred@colorfullife.com)

10 *

11 * Audit: George Wilson (ltcgcw@us.ibm.com)

12 *

13 * This file is released under the GPL.

14 */

16 #include <linux/capability.h>

17 #include <linux/init.h>

18 #include <linux/pagemap.h>

19 #include <linux/file.h>

20 #include <linux/mount.h>

21 #include <linux/fs_context.h>

22 #include <linux/namei.h>

23 #include <linux/sysctl.h>

24 #include <linux/poll.h>

25 #include <linux/mqueue.h>

26 #include <linux/msg.h>

27 #include <linux/skbuff.h>

28 #include <linux/vmalloc.h>

29 #include <linux/netlink.h>

30 #include <linux/syscalls.h>

31 #include <linux/audit.h>

32 #include <linux/signal.h>

33 #include <linux/mutex.h>

34 #include <linux/nsproxy.h>

35 #include <linux/pid.h>

36 #include <linux/ipc_namespace.h>

37 #include <linux/user_namespace.h>

38 #include <linux/slab.h>

39 #include <linux/sched/wake_q.h>

40 #include <linux/sched/signal.h>

41 #include <linux/sched/user.h>

43 #include <net/sock.h>

44 #include "util.h"

/*
 * Private data hung off fs_context->fs_private while an mqueue superblock
 * is being set up. It is allocated in mqueue_init_fs_context() and released
 * (together with the ipc_ns reference) in mqueue_fs_context_free().
 */
46 struct mqueue_fs_context {

/* ipc namespace the mount is created for; holds a reference */
47 struct ipc_namespace *ipc_ns;

48 bool newns; /* Set if newly created ipc namespace */

49 };

/* Value stored in sb->s_magic by mqueue_fill_super(). */
51 #define MQUEUE_MAGIC 0x19800202

/* Amount added to / removed from the directory i_size per queue entry. */
52 #define DIRENT_SIZE 20

/* i_size of a queue file; also the size of the text buffer used on read. */
53 #define FILENT_SIZE 80

/* Indices into mqueue_inode_info::e_wait_q[]: waiting senders / receivers. */
55 #define SEND 0

56 #define RECV 1

/* Values of ext_wait_queue::state; see the MQ_BARRIER notes below. */
58 #define STATE_NONE 0

59 #define STATE_READY 1

/*
 * One node of the per-queue rbtree (mqueue_inode_info::msg_tree). Each node
 * represents a single priority value and carries the list of messages that
 * were queued with that priority.
 */
61 struct posix_msg_tree_node {

/* linkage into info->msg_tree, ordered by priority */
62 struct rb_node rb_node;

/* messages with this priority, appended at the tail by msg_insert() */
63 struct list_head msg_list;

/* priority shared by all messages on msg_list (msg->m_type) */
64 int priority;

65 };

67 /*

68 * Locking:

69 *

70 * Accesses to a message queue are synchronized by acquiring info->lock.

71 *

72 * There are two notable exceptions:

73 * - The actual wakeup of a sleeping task is performed using the wake_q

74 * framework. info->lock is already released when wake_up_q is called.

75 * - The exit codepaths after sleeping check ext_wait_queue->state without

76 * any locks. If it is STATE_READY, then the syscall is completed without

77 * acquiring info->lock.

78 *

79 * MQ_BARRIER:

80 * To achieve proper release/acquire memory barrier pairing, the state is set to

81 * STATE_READY with smp_store_release(), and it is read with READ_ONCE followed

82 * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used.

83 *

84 * This prevents the following races:

85 *

86 * 1) With the simple wake_q_add(), the task could be gone already before

87 * the increase of the reference happens

88 *    Thread A

89 *                            Thread B

90 *    WRITE_ONCE(wait.state, STATE_NONE);

91 *    schedule_hrtimeout()

92 *                            wake_q_add(A)

93 *                            if (cmpxchg()) // success

94 *                               ->state = STATE_READY (reordered)

95 *    <timeout returns>

96 *    if (wait.state == STATE_READY) return;

97 *    sysret to user space

98 *    sys_exit()

99 *                            get_task_struct() // UaF

100 *

101 * Solution: Use wake_q_add_safe() and perform the get_task_struct() before

102 * the smp_store_release() that does ->state = STATE_READY.

103 *

104 * 2) Without proper _release/_acquire barriers, the woken up task

105 * could read stale data

106 *

107 *    Thread A

108 *                            Thread B

109 *    do_mq_timedreceive

110 *    WRITE_ONCE(wait.state, STATE_NONE);

111 *    schedule_hrtimeout()

112 *                            state = STATE_READY;

113 *    <timeout returns>

114 *    if (wait.state == STATE_READY) return;

115 *    msg_ptr = wait.msg;     // Access to stale data!

116 *                            receiver->msg = message; (reordered)

117 *

118 * Solution: use _release and _acquire barriers.

119 *

120 * 3) There is intentionally no barrier when setting current->state

121 * to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the

122 * release memory barrier, and the wakeup is triggered when holding

123 * info->lock, i.e. spin_lock(&info->lock) provided a pairing

124 * acquire memory barrier.

125 */

126

/*
 * Entry describing one task sleeping on a queue. Entries are linked into
 * mqueue_inode_info::e_wait_q[SEND] or e_wait_q[RECV] by wq_add() and are
 * removed either by the sleeper itself (wq_sleep()) or by the task that
 * completes the operation on its behalf (__pipelined_op()).
 */
127 struct ext_wait_queue { /* queue of sleeping tasks */

/* the sleeping task; its prio is used for ordering in wq_add() */
128 struct task_struct *task;

129 struct list_head list;

130 struct msg_msg *msg; /* ptr of loaded message */

131 int state; /* one of STATE_* values */

132 };

133

/*
 * Per-queue state. The VFS inode is embedded, so MQUEUE_I() can recover this
 * structure from an inode pointer with container_of().
 */
134 struct mqueue_inode_info {

/* serializes accesses to the queue, see the "Locking" notes above */
135 spinlock_t lock;

136 struct inode vfs_inode;

/* wait queue used by poll and woken when the queue state changes */
137 wait_queue_head_t wait_q;

138

/* rbtree of posix_msg_tree_node, one node per priority in use */
139 struct rb_root msg_tree;

/* cached rightmost (highest priority) node, NULL when the tree is empty */
140 struct rb_node *msg_tree_rightmost;

/* at most one spare tree node kept around to avoid atomic allocations */
141 struct posix_msg_tree_node *node_cache;

/* mq_maxmsg / mq_msgsize limits and the current message count */
142 struct mq_attr attr;

143

/* registered notification and the process (tgid) that registered it */
144 struct sigevent notify;

145 struct pid *notify_owner;

/* compared against task->self_exec_id before a signal is delivered */
146 u32 notify_self_exec_id;

147 struct user_namespace *notify_user_ns;

148 struct ucounts *ucounts; /* user who created, for accounting */

/* socket and pre-built skb used for SIGEV_THREAD notifications */
149 struct sock *notify_sock;

150 struct sk_buff *notify_cookie;

151

152 /* for tasks waiting for free space and messages, respectively */

153 struct ext_wait_queue e_wait_q[2];

154

155 unsigned long qsize; /* size of queue in memory (sum of all msgs) */

156 };

157

/*
 * Forward declarations: these objects and remove_notification() are
 * referenced by the code below before this chunk of the file defines them.
 */
158 static struct file_system_type mqueue_fs_type;

159 static const struct inode_operations mqueue_dir_inode_operations;

160 static const struct file_operations mqueue_file_operations;

161 static const struct super_operations mqueue_super_ops;

162 static const struct fs_context_operations mqueue_fs_context_ops;

163 static void remove_notification(struct mqueue_inode_info *info);

164

/* slab cache that mqueue_alloc_inode()/mqueue_free_inode() allocate from */
165 static struct kmem_cache *mqueue_inode_cachep;

166

167 static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)

168 {

169 return container_of(inode, struct mqueue_inode_info, vfs_inode);

170 }

171

172 /*

173 * This routine should be called with the mq_lock held.

174 */

175 static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)

176 {

177 return get_ipc_ns(inode->i_sb->s_fs_info);

178 }

179

180 static struct ipc_namespace *get_ns_from_inode(struct inode *inode)

181 {

182 struct ipc_namespace *ns;

183

184 spin_lock(&mq_lock);

185 ns = __get_ns_from_inode(inode);

186 spin_unlock(&mq_lock);

187 return ns;

188 }

189

190 /* Auxiliary functions to manipulate messages' list */

191 static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)

192 {

193 struct rb_node **p, *parent = NULL;

194 struct posix_msg_tree_node *leaf;

195 bool rightmost = true;

196

197 p = &info->msg_tree.rb_node;

198 while (*p) {

199 parent = *p;

200 leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);

201

202 if (likely(leaf->priority == msg->m_type))

203 goto insert_msg;

204 else if (msg->m_type < leaf->priority) {

205 p = &(*p)->rb_left;

206 rightmost = false;

207 } else

208 p = &(*p)->rb_right;

209 }

210 if (info->node_cache) {

211 leaf = info->node_cache;

212 info->node_cache = NULL;

213 } else {

214 leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC);

215 if (!leaf)

216 return -ENOMEM;

217 INIT_LIST_HEAD(&leaf->msg_list);

218 }

219 leaf->priority = msg->m_type;

220

221 if (rightmost)

222 info->msg_tree_rightmost = &leaf->rb_node;

223

224 rb_link_node(&leaf->rb_node, parent, p);

225 rb_insert_color(&leaf->rb_node, &info->msg_tree);

226 insert_msg:

227 info->attr.mq_curmsgs++;

228 info->qsize += msg->m_ts;

229 list_add_tail(&msg->m_list, &leaf->msg_list);

230 return 0;

231 }

232

233 static inline void msg_tree_erase(struct posix_msg_tree_node *leaf,

234 struct mqueue_inode_info *info)

235 {

236 struct rb_node *node = &leaf->rb_node;

237

238 if (info->msg_tree_rightmost == node)

239 info->msg_tree_rightmost = rb_prev(node);

240

241 rb_erase(node, &info->msg_tree);

242 if (info->node_cache)

243 kfree(leaf);

244 else

245 info->node_cache = leaf;

246 }

247

248 static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)

249 {

250 struct rb_node *parent = NULL;

251 struct posix_msg_tree_node *leaf;

252 struct msg_msg *msg;

253

254 try_again:

255 /*

256 * During insert, low priorities go to the left and high to the

257 * right. On receive, we want the highest priorities first, so

258 * walk all the way to the right.

259 */

260 parent = info->msg_tree_rightmost;

261 if (!parent) {

262 if (info->attr.mq_curmsgs) {

263 pr_warn_once("Inconsistency in POSIX message queue, "

264 "no tree element, but supposedly messages "

265 "should exist!\n");

266 info->attr.mq_curmsgs = 0;

267 }

268 return NULL;

269 }

270 leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);

271 if (unlikely(list_empty(&leaf->msg_list))) {

272 pr_warn_once("Inconsistency in POSIX message queue, "

273 "empty leaf node but we haven't implemented "

274 "lazy leaf delete!\n");

275 msg_tree_erase(leaf, info);

276 goto try_again;

277 } else {

278 msg = list_first_entry(&leaf->msg_list,

279 struct msg_msg, m_list);

280 list_del(&msg->m_list);

281 if (list_empty(&leaf->msg_list)) {

282 msg_tree_erase(leaf, info);

283 }

284 }

285 info->attr.mq_curmsgs--;

286 info->qsize -= msg->m_ts;

287 return msg;

288 }

289

290 static struct inode *mqueue_get_inode(struct super_block *sb,

291 struct ipc_namespace *ipc_ns, umode_t mode,

292 struct mq_attr *attr)

293 {

294 struct inode *inode;

295 int ret = -ENOMEM;

296

297 inode = new_inode(sb);

298 if (!inode)

299 goto err;

300

301 inode->i_ino = get_next_ino();

302 inode->i_mode = mode;

303 inode->i_uid = current_fsuid();

304 inode->i_gid = current_fsgid();

305 simple_inode_init_ts(inode);

306

307 if (S_ISREG(mode)) {

308 struct mqueue_inode_info *info;

309 unsigned long mq_bytes, mq_treesize;

310

311 inode->i_fop = &mqueue_file_operations;

312 inode->i_size = FILENT_SIZE;

313 /* mqueue specific info */

314 info = MQUEUE_I(inode);

315 spin_lock_init(&info->lock);

316 init_waitqueue_head(&info->wait_q);

317 INIT_LIST_HEAD(&info->e_wait_q[0].list);

318 INIT_LIST_HEAD(&info->e_wait_q[1].list);

319 info->notify_owner = NULL;

320 info->notify_user_ns = NULL;

321 info->qsize = 0;

322 info->ucounts = NULL; /* set when all is ok */

323 info->msg_tree = RB_ROOT;

324 info->msg_tree_rightmost = NULL;

325 info->node_cache = NULL;

326 memset(&info->attr, 0, sizeof(info->attr));

327 info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,

328 ipc_ns->mq_msg_default);

329 info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,

330 ipc_ns->mq_msgsize_default);

331 if (attr) {

332 info->attr.mq_maxmsg = attr->mq_maxmsg;

333 info->attr.mq_msgsize = attr->mq_msgsize;

334 }

335 /*

336 * We used to allocate a static array of pointers and account

337 * the size of that array as well as one msg_msg struct per

338 * possible message into the queue size. That's no longer

339 * accurate as the queue is now an rbtree and will grow and

340 * shrink depending on usage patterns. We can, however, still

341 * account one msg_msg struct per message, but the nodes are

342 * allocated depending on priority usage, and most programs

343 * only use one, or a handful, of priorities. However, since

344 * this is pinned memory, we need to assume worst case, so

345 * that means the min(mq_maxmsg, max_priorities) * struct

346 * posix_msg_tree_node.

347 */

348

349 ret = -EINVAL;

350 if (info->attr.mq_maxmsg <= 0 || info->attr.mq_msgsize <= 0)

351 goto out_inode;

352 if (capable(CAP_SYS_RESOURCE)) {

353 if (info->attr.mq_maxmsg > HARD_MSGMAX ||

354 info->attr.mq_msgsize > HARD_MSGSIZEMAX)

355 goto out_inode;

356 } else {

357 if (info->attr.mq_maxmsg > ipc_ns->mq_msg_max ||

358 info->attr.mq_msgsize > ipc_ns->mq_msgsize_max)

359 goto out_inode;

360 }

361 ret = -EOVERFLOW;

362 /* check for overflow */

363 if (info->attr.mq_msgsize > ULONG_MAX/info->attr.mq_maxmsg)

364 goto out_inode;

365 mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +

366 min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *

367 sizeof(struct posix_msg_tree_node);

368 mq_bytes = info->attr.mq_maxmsg * info->attr.mq_msgsize;

369 if (mq_bytes + mq_treesize < mq_bytes)

370 goto out_inode;

371 mq_bytes += mq_treesize;

372 info->ucounts = get_ucounts(current_ucounts());

373 if (info->ucounts) {

374 long msgqueue;

375

376 spin_lock(&mq_lock);

377 msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);

378 if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) {

379 dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);

380 spin_unlock(&mq_lock);

381 put_ucounts(info->ucounts);

382 info->ucounts = NULL;

383 /* mqueue_evict_inode() releases info->messages */

384 ret = -EMFILE;

385 goto out_inode;

386 }

387 spin_unlock(&mq_lock);

388 }

389 } else if (S_ISDIR(mode)) {

390 inc_nlink(inode);

391 /* Some things misbehave if size == 0 on a directory */

392 inode->i_size = 2 * DIRENT_SIZE;

393 inode->i_op = &mqueue_dir_inode_operations;

394 inode->i_fop = &simple_dir_operations;

395 }

396

397 return inode;

398 out_inode:

399 iput(inode);

400 err:

401 return ERR_PTR(ret);

402 }

403

404 static int mqueue_fill_super(struct super_block *sb, struct fs_context *fc)

405 {

406 struct inode *inode;

407 struct ipc_namespace *ns = sb->s_fs_info;

408

409 sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;

410 sb->s_blocksize = PAGE_SIZE;

411 sb->s_blocksize_bits = PAGE_SHIFT;

412 sb->s_magic = MQUEUE_MAGIC;

413 sb->s_op = &mqueue_super_ops;

414 sb->s_d_flags = DCACHE_DONTCACHE;

415

416 inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);

417 if (IS_ERR(inode))

418 return PTR_ERR(inode);

419

420 sb->s_root = d_make_root(inode);

421 if (!sb->s_root)

422 return -ENOMEM;

423 return 0;

424 }

425

426 static int mqueue_get_tree(struct fs_context *fc)

427 {

428 struct mqueue_fs_context *ctx = fc->fs_private;

429

430 /*

431 * With a newly created ipc namespace, we don't need to do a search

432 * for an ipc namespace match, but we still need to set s_fs_info.

433 */

434 if (ctx->newns) {

435 fc->s_fs_info = ctx->ipc_ns;

436 return get_tree_nodev(fc, mqueue_fill_super);

437 }

438 return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns);

439 }

440

441 static void mqueue_fs_context_free(struct fs_context *fc)

442 {

443 struct mqueue_fs_context *ctx = fc->fs_private;

444

445 put_ipc_ns(ctx->ipc_ns);

446 kfree(ctx);

447 }

448

449 static int mqueue_init_fs_context(struct fs_context *fc)

450 {

451 struct mqueue_fs_context *ctx;

452

453 ctx = kzalloc(sizeof(struct mqueue_fs_context), GFP_KERNEL);

454 if (!ctx)

455 return -ENOMEM;

456

457 ctx->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns);

458 put_user_ns(fc->user_ns);

459 fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns);

460 fc->fs_private = ctx;

461 fc->ops = &mqueue_fs_context_ops;

462 return 0;

463 }

464

465 /*

466 * mq_init_ns() is currently the only caller of mq_create_mount().

467 * So the ns parameter is always a newly created ipc namespace.

468 */

469 static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)

470 {

471 struct mqueue_fs_context *ctx;

472 struct fs_context *fc;

473 struct vfsmount *mnt;

474

475 fc = fs_context_for_mount(&mqueue_fs_type, SB_KERNMOUNT);

476 if (IS_ERR(fc))

477 return ERR_CAST(fc);

478

479 ctx = fc->fs_private;

480 ctx->newns = true;

481 put_ipc_ns(ctx->ipc_ns);

482 ctx->ipc_ns = get_ipc_ns(ns);

483 put_user_ns(fc->user_ns);

484 fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns);

485

486 mnt = fc_mount_longterm(fc);

487 put_fs_context(fc);

488 return mnt;

489 }

490

491 static void init_once(void *foo)

492 {

493 struct mqueue_inode_info *p = foo;

494

495 inode_init_once(&p->vfs_inode);

496 }

497

498 static struct inode *mqueue_alloc_inode(struct super_block *sb)

499 {

500 struct mqueue_inode_info *ei;

501

502 ei = alloc_inode_sb(sb, mqueue_inode_cachep, GFP_KERNEL);

503 if (!ei)

504 return NULL;

505 return &ei->vfs_inode;

506 }

507

508 static void mqueue_free_inode(struct inode *inode)

509 {

510 kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));

511 }

512

513 static void mqueue_evict_inode(struct inode *inode)

514 {

515 struct mqueue_inode_info *info;

516 struct ipc_namespace *ipc_ns;

517 struct msg_msg *msg, *nmsg;

518 LIST_HEAD(tmp_msg);

519

520 clear_inode(inode);

521

522 if (S_ISDIR(inode->i_mode))

523 return;

524

525 ipc_ns = get_ns_from_inode(inode);

526 info = MQUEUE_I(inode);

527 spin_lock(&info->lock);

528 while ((msg = msg_get(info)) != NULL)

529 list_add_tail(&msg->m_list, &tmp_msg);

530 kfree(info->node_cache);

531 spin_unlock(&info->lock);

532

533 list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {

534 list_del(&msg->m_list);

535 free_msg(msg);

536 }

537

538 if (info->ucounts) {

539 unsigned long mq_bytes, mq_treesize;

540

541 /* Total amount of bytes accounted for the mqueue */

542 mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +

543 min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *

544 sizeof(struct posix_msg_tree_node);

545

546 mq_bytes = mq_treesize + (info->attr.mq_maxmsg *

547 info->attr.mq_msgsize);

548

549 spin_lock(&mq_lock);

550 dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);

551 /*

552 * get_ns_from_inode() ensures that the

553 * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns

554 * to which we now hold a reference, or it is NULL.

555 * We can't put it here under mq_lock, though.

556 */

557 if (ipc_ns)

558 ipc_ns->mq_queues_count--;

559 spin_unlock(&mq_lock);

560 put_ucounts(info->ucounts);

561 info->ucounts = NULL;

562 }

563 if (ipc_ns)

564 put_ipc_ns(ipc_ns);

565 }

566

567 static int mqueue_create_attr(struct dentry *dentry, umode_t mode, void *arg)

568 {

569 struct inode *dir = dentry->d_parent->d_inode;

570 struct inode *inode;

571 struct mq_attr *attr = arg;

572 int error;

573 struct ipc_namespace *ipc_ns;

574

575 spin_lock(&mq_lock);

576 ipc_ns = __get_ns_from_inode(dir);

577 if (!ipc_ns) {

578 error = -EACCES;

579 goto out_unlock;

580 }

581

582 if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&

583 !capable(CAP_SYS_RESOURCE)) {

584 error = -ENOSPC;

585 goto out_unlock;

586 }

587 ipc_ns->mq_queues_count++;

588 spin_unlock(&mq_lock);

589

590 inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);

591 if (IS_ERR(inode)) {

592 error = PTR_ERR(inode);

593 spin_lock(&mq_lock);

594 ipc_ns->mq_queues_count--;

595 goto out_unlock;

596 }

597

598 put_ipc_ns(ipc_ns);

599 dir->i_size += DIRENT_SIZE;

600 simple_inode_init_ts(dir);

601

602 d_make_persistent(dentry, inode);

603 return 0;

604 out_unlock:

605 spin_unlock(&mq_lock);

606 if (ipc_ns)

607 put_ipc_ns(ipc_ns);

608 return error;

609 }

610

611 static int mqueue_create(struct mnt_idmap *idmap, struct inode *dir,

612 struct dentry *dentry, umode_t mode, bool excl)

613 {

614 return mqueue_create_attr(dentry, mode, NULL);

615 }

616

617 static int mqueue_unlink(struct inode *dir, struct dentry *dentry)

618 {

619 dir->i_size -= DIRENT_SIZE;

620 return simple_unlink(dir, dentry);

621 }

622

623 /*

624 * This is routine for system read from queue file.

625 * To avoid mess with doing here some sort of mq_receive we allow

626 * to read only queue size & notification info (the only values

627 * that are interesting from user point of view and aren't accessible

628 * through std routines)

629 */

630 static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,

631 size_t count, loff_t *off)

632 {

633 struct inode *inode = file_inode(filp);

634 struct mqueue_inode_info *info = MQUEUE_I(inode);

635 char buffer[FILENT_SIZE];

636 ssize_t ret;

637

638 spin_lock(&info->lock);

639 snprintf(buffer, sizeof(buffer),

640 "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",

641 info->qsize,

642 info->notify_owner ? info->notify.sigev_notify : 0,

643 (info->notify_owner &&

644 info->notify.sigev_notify == SIGEV_SIGNAL) ?

645 info->notify.sigev_signo : 0,

646 pid_vnr(info->notify_owner));

647 spin_unlock(&info->lock);

648 buffer[sizeof(buffer)-1] = '\0';

649

650 ret = simple_read_from_buffer(u_data, count, off, buffer,

651 strlen(buffer));

652 if (ret <= 0)

653 return ret;

654

655 inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));

656 return ret;

657 }

658

659 static int mqueue_flush_file(struct file *filp, fl_owner_t id)

660 {

661 struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));

662

663 spin_lock(&info->lock);

664 if (task_tgid(current) == info->notify_owner)

665 remove_notification(info);

666

667 spin_unlock(&info->lock);

668 return 0;

669 }

670

671 static __poll_t mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab)

672 {

673 struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));

674 __poll_t retval = 0;

675

676 poll_wait(filp, &info->wait_q, poll_tab);

677

678 spin_lock(&info->lock);

679 if (info->attr.mq_curmsgs)

680 retval = EPOLLIN | EPOLLRDNORM;

681

682 if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)

683 retval |= EPOLLOUT | EPOLLWRNORM;

684 spin_unlock(&info->lock);

685

686 return retval;

687 }

688

689 /* Adds current to info->e_wait_q[sr] before element with smaller prio */

690 static void wq_add(struct mqueue_inode_info *info, int sr,

691 struct ext_wait_queue *ewp)

692 {

693 struct ext_wait_queue *walk;

694

695 list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {

696 if (walk->task->prio <= current->prio) {

697 list_add_tail(&ewp->list, &walk->list);

698 return;

699 }

700 }

701 list_add_tail(&ewp->list, &info->e_wait_q[sr].list);

702 }

703

704 /*

705 * Puts current task to sleep. Caller must hold queue lock. After return

706 * lock isn't held.

707 * sr: SEND or RECV

708 */

709 static int wq_sleep(struct mqueue_inode_info *info, int sr,

710 ktime_t *timeout, struct ext_wait_queue *ewp)

711 __releases(&info->lock)

712 {

713 int retval;

714 signed long time;

715

716 wq_add(info, sr, ewp);

717

718 for (;;) {

719 /* memory barrier not required, we hold info->lock */

720 __set_current_state(TASK_INTERRUPTIBLE);

721

722 spin_unlock(&info->lock);

723 time = schedule_hrtimeout_range_clock(timeout, 0,

724 HRTIMER_MODE_ABS, CLOCK_REALTIME);

725

726 if (READ_ONCE(ewp->state) == STATE_READY) {

727 /* see MQ_BARRIER for purpose/pairing */

728 smp_acquire__after_ctrl_dep();

729 retval = 0;

730 goto out;

731 }

732 spin_lock(&info->lock);

733

734 /* we hold info->lock, so no memory barrier required */

735 if (READ_ONCE(ewp->state) == STATE_READY) {

736 retval = 0;

737 goto out_unlock;

738 }

739 if (signal_pending(current)) {

740 retval = -ERESTARTSYS;

741 break;

742 }

743 if (time == 0) {

744 retval = -ETIMEDOUT;

745 break;

746 }

747 }

748 list_del(&ewp->list);

749 out_unlock:

750 spin_unlock(&info->lock);

751 out:

752 return retval;

753 }

754

755 /*

756 * Returns waiting task that should be serviced first or NULL if none exists

757 */

758 static struct ext_wait_queue *wq_get_first_waiter(

759 struct mqueue_inode_info *info, int sr)

760 {

761 struct list_head *ptr;

762

763 ptr = info->e_wait_q[sr].list.prev;

764 if (ptr == &info->e_wait_q[sr].list)

765 return NULL;

766 return list_entry(ptr, struct ext_wait_queue, list);

767 }

768

769

770 static inline void set_cookie(struct sk_buff *skb, char code)

771 {

772 ((char *)skb->data)[NOTIFY_COOKIE_LEN-1] = code;

773 }

774

775 /*

776 * The next function is only to split too long sys_mq_timedsend

777 */

778 static void __do_notify(struct mqueue_inode_info *info)

779 {

780 /* notification

781 * invoked when there is registered process and there isn't process

782 * waiting synchronously for message AND state of queue changed from

783 * empty to not empty. Here we are sure that no one is waiting

784 * synchronously. */

785 if (info->notify_owner &&

786 info->attr.mq_curmsgs == 1) {

787 switch (info->notify.sigev_notify) {

788 case SIGEV_NONE:

789 break;

790 case SIGEV_SIGNAL: {

791 struct kernel_siginfo sig_i;

792 struct task_struct *task;

793

794 /* do_mq_notify() accepts sigev_signo == 0, why?? */

795 if (!info->notify.sigev_signo)

796 break;

797

798 clear_siginfo(&sig_i);

799 sig_i.si_signo = info->notify.sigev_signo;

800 sig_i.si_errno = 0;

801 sig_i.si_code = SI_MESGQ;

802 sig_i.si_value = info->notify.sigev_value;

803 rcu_read_lock();

804 /* map current pid/uid into info->owner's namespaces */

805 sig_i.si_pid = task_tgid_nr_ns(current,

806 ns_of_pid(info->notify_owner));

807 sig_i.si_uid = from_kuid_munged(info->notify_user_ns,

808 current_uid());

809 /*

810 * We can't use kill_pid_info(), this signal should

811 * bypass check_kill_permission(). It is from kernel

812 * but si_fromuser() can't know this.

813 * We do check the self_exec_id, to avoid sending

814 * signals to programs that don't expect them.

815 */

816 task = pid_task(info->notify_owner, PIDTYPE_TGID);

817 if (task && task->self_exec_id ==

818 info->notify_self_exec_id) {

819 do_send_sig_info(info->notify.sigev_signo,

820 &sig_i, task, PIDTYPE_TGID);

821 }

822 rcu_read_unlock();

823 break;

824 }

825 case SIGEV_THREAD:

826 set_cookie(info->notify_cookie, NOTIFY_WOKENUP);

827 netlink_sendskb(info->notify_sock, info->notify_cookie);

828 break;

829 }

830 /* after notification unregisters process */

831 put_pid(info->notify_owner);

832 put_user_ns(info->notify_user_ns);

833 info->notify_owner = NULL;

834 info->notify_user_ns = NULL;

835 }

836 wake_up(&info->wait_q);

837 }

838

839 static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout,

840 struct timespec64 *ts)

841 {

842 if (get_timespec64(ts, u_abs_timeout))

843 return -EFAULT;

844 if (!timespec64_valid(ts))

845 return -EINVAL;

846 return 0;

847 }

848

849 static void remove_notification(struct mqueue_inode_info *info)

850 {

851 if (info->notify_owner != NULL &&

852 info->notify.sigev_notify == SIGEV_THREAD) {

853 set_cookie(info->notify_cookie, NOTIFY_REMOVED);

854 netlink_sendskb(info->notify_sock, info->notify_cookie);

855 }

856 put_pid(info->notify_owner);

857 put_user_ns(info->notify_user_ns);

858 info->notify_owner = NULL;

859 info->notify_user_ns = NULL;

860 }

861

862 static int prepare_open(struct dentry *dentry, int oflag, int ro,

863 umode_t mode, struct filename *name,

864 struct mq_attr *attr)

865 {

866 static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,

867 MAY_READ | MAY_WRITE };

868 int acc;

869

870 if (d_really_is_negative(dentry)) {

871 if (!(oflag & O_CREAT))

872 return -ENOENT;

873 if (ro)

874 return ro;

875 audit_inode_parent_hidden(name, dentry->d_parent);

876 return vfs_mkobj(dentry, mode & ~current_umask(),

877 mqueue_create_attr, attr);

878 }

879 /* it already existed */

880 audit_inode(name, dentry, 0);

881 if ((oflag & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))

882 return -EEXIST;

883 if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))

884 return -EINVAL;

885 acc = oflag2acc[oflag & O_ACCMODE];

886 return inode_permission(&nop_mnt_idmap, d_inode(dentry), acc);

887 }

888

889 static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,

890 struct mq_attr *attr)

891 {

892 struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt;

893 struct dentry *root = mnt->mnt_root;

894 struct filename *name;

895 struct path path;

896 int fd, error;

897 int ro;

898

899 audit_mq_open(oflag, mode, attr);

900

901 name = getname(u_name);

902 if (IS_ERR(name))

903 return PTR_ERR(name);

904

905 fd = get_unused_fd_flags(O_CLOEXEC);

906 if (fd < 0)

907 goto out_putname;

908

909 ro = mnt_want_write(mnt); /* we'll drop it in any case */

910 inode_lock(d_inode(root));

911 path.dentry = lookup_noperm(&QSTR(name->name), root);

912 if (IS_ERR(path.dentry)) {

913 error = PTR_ERR(path.dentry);

914 goto out_putfd;

915 }

916 path.mnt = mntget(mnt);

917 error = prepare_open(path.dentry, oflag, ro, mode, name, attr);

918 if (!error) {

919 struct file *file = dentry_open(&path, oflag, current_cred());

920 if (!IS_ERR(file))

921 fd_install(fd, file);

922 else

923 error = PTR_ERR(file);

924 }

925 path_put(&path);

926 out_putfd:

927 if (error) {

928 put_unused_fd(fd);

929 fd = error;

930 }

931 inode_unlock(d_inode(root));

932 if (!ro)

933 mnt_drop_write(mnt);

934 out_putname:

935 putname(name);

936 return fd;

937 }

938

939 SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,

940 struct mq_attr __user *, u_attr)

941 {

942 struct mq_attr attr;

943 if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))

944 return -EFAULT;

945

946 return do_mq_open(u_name, oflag, mode, u_attr ? &attr : NULL);

947 }

948

949 SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)

950 {

951 int err;

952 struct filename *name;

953 struct dentry *dentry;

954 struct inode *inode = NULL;

955 struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;

956 struct vfsmount *mnt = ipc_ns->mq_mnt;

957

958 name = getname(u_name);

959 if (IS_ERR(name))

960 return PTR_ERR(name);

961

962 audit_inode_parent_hidden(name, mnt->mnt_root);

963 err = mnt_want_write(mnt);

964 if (err)

965 goto out_name;

966 inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);

967 dentry = lookup_noperm(&QSTR(name->name), mnt->mnt_root);

968 if (IS_ERR(dentry)) {

969 err = PTR_ERR(dentry);

970 goto out_unlock;

971 }

972

973 inode = d_inode(dentry);

974 if (!inode) {

975 err = -ENOENT;

976 } else {

977 ihold(inode);

978 err = vfs_unlink(&nop_mnt_idmap, d_inode(dentry->d_parent),

979 dentry, NULL);

980 }

981 dput(dentry);

982

983 out_unlock:

984 inode_unlock(d_inode(mnt->mnt_root));

985 iput(inode);

986 mnt_drop_write(mnt);

987 out_name:

988 putname(name);

989

990 return err;

991 }

992

993 /* Pipelined send and receive functions.

994 *

995 * If a receiver finds no waiting message, then it registers itself in the

996 * list of waiting receivers. A sender checks that list before adding the new

997 * message into the message array. If there is a waiting receiver, then it

998 * bypasses the message array and directly hands the message over to the

999 * receiver. The receiver accepts the message and returns without grabbing the

1000 * queue spinlock:

1001 *

1002 * - Set pointer to message.

1003 * - Queue the receiver task for later wakeup (without the info->lock).

1004 * - Update its state to STATE_READY. Now the receiver can continue.

1005 * - Wake up the process after the lock is dropped. Should the process wake up

1006 * before this wakeup (due to a timeout or a signal) it will either see

1007 * STATE_READY and continue or acquire the lock to check the state again.

1008 *

1009 * The same algorithm is used for senders.

1010 */

1011

1012 static inline void __pipelined_op(struct wake_q_head *wake_q,

1013 struct mqueue_inode_info *info,

1014 struct ext_wait_queue *this)

1015 {

1016 struct task_struct *task;

1017

1018 list_del(&this->list);

1019 task = get_task_struct(this->task);

1020

1021 /* see MQ_BARRIER for purpose/pairing */

1022 smp_store_release(&this->state, STATE_READY);

1023 wake_q_add_safe(wake_q, task);

1024 }

1025

1026 /* pipelined_send() - send a message directly to the task waiting in

1027 * sys_mq_timedreceive() (without inserting message into a queue).

1028 */

1029 static inline void pipelined_send(struct wake_q_head *wake_q,

1030 struct mqueue_inode_info *info,

1031 struct msg_msg *message,

1032 struct ext_wait_queue *receiver)

1033 {

1034 receiver->msg = message;

1035 __pipelined_op(wake_q, info, receiver);

1036 }

1037

1038 /* pipelined_receive() - if there is task waiting in sys_mq_timedsend()

1039 * gets its message and put to the queue (we have one free place for sure). */

1040 static inline void pipelined_receive(struct wake_q_head *wake_q,

1041 struct mqueue_inode_info *info)

1042 {

1043 struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);

1044

1045 if (!sender) {

1046 /* for poll */

1047 wake_up_interruptible(&info->wait_q);

1048 return;

1049 }

1050 if (msg_insert(sender->msg, info))

1051 return;

1052

1053 __pipelined_op(wake_q, info, sender);

1054 }

1055

/*
 * do_mq_timedsend() - common implementation behind the mq_timedsend syscalls.
 * @mqdes:     message queue descriptor
 * @u_msg_ptr: user buffer holding the message body
 * @msg_len:   length of the message body in bytes
 * @msg_prio:  message priority; must be below MQ_PRIO_MAX
 * @ts:        timeout already copied into kernel memory, or NULL for none
 *
 * Return: 0 on success; -EINVAL if @msg_prio is out of range; -EBADF if
 * @mqdes is not a message queue opened for writing; -EMSGSIZE if @msg_len
 * exceeds the queue's mq_msgsize; -EAGAIN if the queue is full and the
 * descriptor is O_NONBLOCK; otherwise the error reported by load_msg(),
 * wq_sleep() or msg_insert().
 */
1056 static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,

1057 size_t msg_len, unsigned int msg_prio,

1058 struct timespec64 *ts)

1059 {

1060 struct inode *inode;

1061 struct ext_wait_queue wait;

1062 struct ext_wait_queue *receiver;

1063 struct msg_msg *msg_ptr;

1064 struct mqueue_inode_info *info;

1065 ktime_t expires, *timeout = NULL;

1066 struct posix_msg_tree_node *new_leaf = NULL;

1067 int ret = 0;

1068 DEFINE_WAKE_Q(wake_q);

1069

1070 if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))

1071 return -EINVAL;

1072

1073 if (ts) {

1074 expires = timespec64_to_ktime(*ts);

1075 timeout = &expires;

1076 }

1077

1078 audit_mq_sendrecv(mqdes, msg_len, msg_prio, ts);

1079

1080 CLASS(fd, f)(mqdes);

1081 if (fd_empty(f))

1082 return -EBADF;

1083

1084 inode = file_inode(fd_file(f));

/* Reject descriptors that do not refer to a message queue file. */
1085 if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))

1086 return -EBADF;

1087 info = MQUEUE_I(inode);

1088 audit_file(fd_file(f));

1089

1090 if (unlikely(!(fd_file(f)->f_mode & FMODE_WRITE)))

1091 return -EBADF;

1092

1093 if (unlikely(msg_len > info->attr.mq_msgsize))

1094 return -EMSGSIZE;

1095

1096 /* First try to allocate memory, before doing anything with

1097 * existing queues. */

1098 msg_ptr = load_msg(u_msg_ptr, msg_len);

1099 if (IS_ERR(msg_ptr))

1100 return PTR_ERR(msg_ptr);

/* The priority travels in m_type; the receive side reports it from there. */
1101 msg_ptr->m_ts = msg_len;

1102 msg_ptr->m_type = msg_prio;

1103

1104 /*

1105 * msg_insert really wants us to have a valid, spare node struct so

1106 * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will

1107 * fall back to that if necessary.

1108 */

1109 if (!info->node_cache)

1110 new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);

1111

1112 spin_lock(&info->lock);

1113

/*
 * The test above ran without the lock, so node_cache is checked again
 * here; an allocation that is not needed (or NULL) is handed to kfree().
 */
1114 if (!info->node_cache && new_leaf) {

1115 /* Save our speculative allocation into the cache */

1116 INIT_LIST_HEAD(&new_leaf->msg_list);

1117 info->node_cache = new_leaf;

1118 new_leaf = NULL;

1119 } else {

1120 kfree(new_leaf);

1121 }

1122

/* Queue full: fail with -EAGAIN for O_NONBLOCK, otherwise sleep as a sender. */
1123 if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {

1124 if (fd_file(f)->f_flags & O_NONBLOCK) {

1125 ret = -EAGAIN;

1126 } else {

1127 wait.task = current;

1128 wait.msg = (void *) msg_ptr;

1129

1130 /* memory barrier not required, we hold info->lock */

1131 WRITE_ONCE(wait.state, STATE_NONE);

1132 ret = wq_sleep(info, SEND, timeout, &wait);

1133 /*

1134 * wq_sleep must be called with info->lock held, and

1135 * returns with the lock released

1136 */

1137 goto out_free;

1138 }

1139 } else {

/* Room available: hand over to a sleeping receiver if any, else enqueue. */
1140 receiver = wq_get_first_waiter(info, RECV);

1141 if (receiver) {

1142 pipelined_send(&wake_q, info, msg_ptr, receiver);

1143 } else {

1144 /* adds message to the queue */

1145 ret = msg_insert(msg_ptr, info);

1146 if (ret)

1147 goto out_unlock;

1148 __do_notify(info);

1149 }

1150 simple_inode_init_ts(inode);

1151 }

1152 out_unlock:

1153 spin_unlock(&info->lock);

/* A receiver queued by pipelined_send() is woken only after the unlock. */
1154 wake_up_q(&wake_q);

1155 out_free:

/* The message is freed here only on failure. */
1156 if (ret)

1157 free_msg(msg_ptr);

1158 return ret;

1159 }

1160

/*
 * do_mq_timedreceive() - common implementation behind the mq_timedreceive
 * syscalls.
 * @mqdes:      message queue descriptor
 * @u_msg_ptr:  user buffer that receives the message body
 * @msg_len:    size of @u_msg_ptr; must be at least the queue's mq_msgsize
 * @u_msg_prio: if non-NULL, user location that receives the message's m_type
 *              (the priority stored by do_mq_timedsend())
 * @ts:         timeout already copied into kernel memory, or NULL for none
 *
 * Return: the size of the received message (its m_ts) on success; -EBADF if
 * @mqdes is not a message queue opened for reading; -EMSGSIZE if @msg_len is
 * smaller than mq_msgsize; -EAGAIN if the queue is empty and the descriptor
 * is O_NONBLOCK; -EFAULT if copying the priority or the body to user space
 * fails; otherwise the error reported by wq_sleep().
 */
1161 static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,

1162 size_t msg_len, unsigned int __user *u_msg_prio,

1163 struct timespec64 *ts)

1164 {

1165 ssize_t ret;

1166 struct msg_msg *msg_ptr;

1167 struct inode *inode;

1168 struct mqueue_inode_info *info;

1169 struct ext_wait_queue wait;

1170 ktime_t expires, *timeout = NULL;

1171 struct posix_msg_tree_node *new_leaf = NULL;

1172

1173 if (ts) {

1174 expires = timespec64_to_ktime(*ts);

1175 timeout = &expires;

1176 }

1177

1178 audit_mq_sendrecv(mqdes, msg_len, 0, ts);

1179

1180 CLASS(fd, f)(mqdes);

1181 if (fd_empty(f))

1182 return -EBADF;

1183

1184 inode = file_inode(fd_file(f));

/* Reject descriptors that do not refer to a message queue file. */
1185 if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))

1186 return -EBADF;

1187 info = MQUEUE_I(inode);

1188 audit_file(fd_file(f));

1189

1190 if (unlikely(!(fd_file(f)->f_mode & FMODE_READ)))

1191 return -EBADF;

1192

1193 /* checks if buffer is big enough */

1194 if (unlikely(msg_len < info->attr.mq_msgsize))

1195 return -EMSGSIZE;

1196

1197 /*

1198 * msg_insert really wants us to have a valid, spare node struct so

1199 * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will

1200 * fall back to that if necessary.

1201 */

1202 if (!info->node_cache)

1203 new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);

1204

1205 spin_lock(&info->lock);

1206

/*
 * The test above ran without the lock, so node_cache is checked again
 * here; an allocation that is not needed (or NULL) is handed to kfree().
 */
1207 if (!info->node_cache && new_leaf) {

1208 /* Save our speculative allocation into the cache */

1209 INIT_LIST_HEAD(&new_leaf->msg_list);

1210 info->node_cache = new_leaf;

1211 } else {

1212 kfree(new_leaf);

1213 }

1214

/* Queue empty: fail with -EAGAIN for O_NONBLOCK, otherwise sleep as a receiver. */
1215 if (info->attr.mq_curmsgs == 0) {

1216 if (fd_file(f)->f_flags & O_NONBLOCK) {

1217 spin_unlock(&info->lock);

1218 ret = -EAGAIN;

1219 } else {

1220 wait.task = current;

1221

1222 /* memory barrier not required, we hold info->lock */

1223 WRITE_ONCE(wait.state, STATE_NONE);

/*
 * No unlock follows on this path: do_mq_timedsend() documents that
 * wq_sleep() returns with info->lock released.  msg_ptr is only
 * consumed below when ret == 0.
 */
1224 ret = wq_sleep(info, RECV, timeout, &wait);

1225 msg_ptr = wait.msg;

1226 }

1227 } else {

1228 DEFINE_WAKE_Q(wake_q);

1229

1230 msg_ptr = msg_get(info);

1231

1232 simple_inode_init_ts(inode);

1233

1234 /* There is now free space in queue. */

1235 pipelined_receive(&wake_q, info);

1236 spin_unlock(&info->lock);

/* A sender queued by pipelined_receive() is woken only after the unlock. */
1237 wake_up_q(&wake_q);

1238 ret = 0;

1239 }

1240 if (ret == 0) {

1241 ret = msg_ptr->m_ts;

1242

1243 if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||

1244 store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {

1245 ret = -EFAULT;

1246 }

/* The message is freed whether or not the copy to user space succeeded. */
1247 free_msg(msg_ptr);

1248 }

1249 return ret;

1250 }

1251

1252 SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,

1253 size_t, msg_len, unsigned int, msg_prio,

1254 const struct __kernel_timespec __user *, u_abs_timeout)

1255 {

1256 struct timespec64 ts, *p = NULL;

1257 if (u_abs_timeout) {

1258 int res = prepare_timeout(u_abs_timeout, &ts);

1259 if (res)

1260 return res;

1261 p = &ts;

1262 }

1263 return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p);

1264 }

1265

1266 SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,

1267 size_t, msg_len, unsigned int __user *, u_msg_prio,

1268 const struct __kernel_timespec __user *, u_abs_timeout)

1269 {

1270 struct timespec64 ts, *p = NULL;

1271 if (u_abs_timeout) {

1272 int res = prepare_timeout(u_abs_timeout, &ts);

1273 if (res)

1274 return res;

1275 p = &ts;

1276 }

1277 return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p);

1278 }

1279

1280 /*

1281 * Note: a deregistration request (NULL notification pointer) from a caller

1282 * that is not the current notification owner is silently ignored.

1283 * POSIX does not explicitly define this case.

1284 */

/*
 * do_mq_notify() - register or remove the notification of a message queue.
 * @mqdes:        message queue descriptor
 * @notification: kernel copy of the request, or NULL to deregister
 *
 * For SIGEV_THREAD, sigev_value.sival_ptr is a user pointer to a cookie of
 * NOTIFY_COOKIE_LEN bytes, and sigev_signo is passed to
 * netlink_getsockbyfd() as the descriptor of the socket that the cookie skb
 * is attached to.
 *
 * Return: 0 on success; -EINVAL for an unsupported sigev_notify or an invalid
 * signal number; -ENOMEM or -EFAULT if the cookie skb cannot be built;
 * -EBADF if @mqdes is not a message queue; -EBUSY if a notification is
 * already registered; otherwise the error from netlink_getsockbyfd() or
 * netlink_attachskb().
 */
1285 static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)

1286 {

1287 int ret;

1288 struct sock *sock;

1289 struct inode *inode;

1290 struct mqueue_inode_info *info;

1291 struct sk_buff *nc;

1292

1293 audit_mq_notify(mqdes, notification);

1294

1295 nc = NULL;

1296 sock = NULL;

1297 if (notification != NULL) {

1298 if (unlikely(notification->sigev_notify != SIGEV_NONE &&

1299 notification->sigev_notify != SIGEV_SIGNAL &&

1300 notification->sigev_notify != SIGEV_THREAD))

1301 return -EINVAL;

1302 if (notification->sigev_notify == SIGEV_SIGNAL &&

1303 !valid_signal(notification->sigev_signo)) {

1304 return -EINVAL;

1305 }

1306 if (notification->sigev_notify == SIGEV_THREAD) {

1307 long timeo;

1308

1309 /* create the notify skb */

1310 nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);

1311 if (!nc)

1312 return -ENOMEM;

1313

1314 if (copy_from_user(nc->data,

1315 notification->sigev_value.sival_ptr,

1316 NOTIFY_COOKIE_LEN)) {

1317 kfree_skb(nc);

1318 return -EFAULT;

1319 }

1320

1321 /* TODO: add a header? */

1322 skb_put(nc, NOTIFY_COOKIE_LEN);

1323 /* and attach it to the socket */

1324 retry:

1325 sock = netlink_getsockbyfd(notification->sigev_signo);

1326 if (IS_ERR(sock)) {

1327 kfree_skb(nc);

1328 return PTR_ERR(sock);

1329 }

1330

1331 timeo = MAX_SCHEDULE_TIMEOUT;

1332 ret = netlink_attachskb(sock, nc, &timeo, NULL);

/* A return of 1 means "try again": the socket is looked up afresh. */
1333 if (ret == 1)

1334 goto retry;

/*
 * NOTE(review): nc and sock are not released on this error return;
 * presumably netlink_attachskb() drops both on failure - confirm.
 */
1335 if (ret)

1336 return ret;

1337 }

1338 }

1339

1340 CLASS(fd, f)(mqdes);

1341 if (fd_empty(f)) {

1342 ret = -EBADF;

1343 goto out;

1344 }

1345

1346 inode = file_inode(fd_file(f));

/* Reject descriptors that do not refer to a message queue file. */
1347 if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {

1348 ret = -EBADF;

1349 goto out;

1350 }

1351 info = MQUEUE_I(inode);

1352

1353 ret = 0;

1354 spin_lock(&info->lock);

1355 if (notification == NULL) {

/* Deregistration is honoured only for the owning thread group. */
1356 if (info->notify_owner == task_tgid(current)) {

1357 remove_notification(info);

1358 inode_set_atime_to_ts(inode,

1359 inode_set_ctime_current(inode));

1360 }

1361 } else if (info->notify_owner != NULL) {

1362 ret = -EBUSY;

1363 } else {

1364 switch (notification->sigev_notify) {

1365 case SIGEV_NONE:

1366 info->notify.sigev_notify = SIGEV_NONE;

1367 break;

1368 case SIGEV_THREAD:

/*
 * Hand socket and cookie over to the queue; clearing the locals
 * keeps the cleanup at "out" from detaching them.
 */
1369 info->notify_sock = sock;

1370 info->notify_cookie = nc;

1371 sock = NULL;

1372 nc = NULL;

1373 info->notify.sigev_notify = SIGEV_THREAD;

1374 break;

1375 case SIGEV_SIGNAL:

1376 info->notify.sigev_signo = notification->sigev_signo;

1377 info->notify.sigev_value = notification->sigev_value;

1378 info->notify.sigev_notify = SIGEV_SIGNAL;

1379 info->notify_self_exec_id = current->self_exec_id;

1380 break;

1381 }

1382

1383 info->notify_owner = get_pid(task_tgid(current));

1384 info->notify_user_ns = get_user_ns(current_user_ns());

1385 inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));

1386 }

1387 spin_unlock(&info->lock);

1388 out:

/* sock is still set only if the attached skb was not handed to the queue. */
1389 if (sock)

1390 netlink_detachskb(sock, nc);

1391 return ret;

1392 }

1393

1394 SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,

1395 const struct sigevent __user *, u_notification)

1396 {

1397 struct sigevent n, *p = NULL;

1398 if (u_notification) {

1399 if (copy_from_user(&n, u_notification, sizeof(struct sigevent)))

1400 return -EFAULT;

1401 p = &n;

1402 }

1403 return do_mq_notify(mqdes, p);

1404 }

1405

1406 static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)

1407 {

1408 struct inode *inode;

1409 struct mqueue_inode_info *info;

1410

1411 if (new && (new->mq_flags & (~O_NONBLOCK)))

1412 return -EINVAL;

1413

1414 CLASS(fd, f)(mqdes);

1415 if (fd_empty(f))

1416 return -EBADF;

1417

1418 if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))

1419 return -EBADF;

1420

1421 inode = file_inode(fd_file(f));

1422 info = MQUEUE_I(inode);

1423

1424 spin_lock(&info->lock);

1425

1426 if (old) {

1427 *old = info->attr;

1428 old->mq_flags = fd_file(f)->f_flags & O_NONBLOCK;

1429 }

1430 if (new) {

1431 audit_mq_getsetattr(mqdes, new);

1432 spin_lock(&fd_file(f)->f_lock);

1433 if (new->mq_flags & O_NONBLOCK)

1434 fd_file(f)->f_flags |= O_NONBLOCK;

1435 else

1436 fd_file(f)->f_flags &= ~O_NONBLOCK;

1437 spin_unlock(&fd_file(f)->f_lock);

1438

1439 inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));

1440 }

1441

1442 spin_unlock(&info->lock);

1443 return 0;

1444 }

1445

1446 SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,

1447 const struct mq_attr __user *, u_mqstat,

1448 struct mq_attr __user *, u_omqstat)

1449 {

1450 int ret;

1451 struct mq_attr mqstat, omqstat;

1452 struct mq_attr *new = NULL, *old = NULL;

1453

1454 if (u_mqstat) {

1455 new = &mqstat;

1456 if (copy_from_user(new, u_mqstat, sizeof(struct mq_attr)))

1457 return -EFAULT;

1458 }

1459 if (u_omqstat)

1460 old = &omqstat;

1461

1462 ret = do_mq_getsetattr(mqdes, new, old);

1463 if (ret || !old)

1464 return ret;

1465

1466 if (copy_to_user(u_omqstat, old, sizeof(struct mq_attr)))

1467 return -EFAULT;

1468 return 0;

1469 }

1470

1471 #ifdef CONFIG_COMPAT

1472

/*
 * Compat counterpart of struct mq_attr, built from compat_long_t fields.
 * get_compat_mq_attr() and put_compat_mq_attr() convert between the two
 * representations field by field.
 */
1473 struct compat_mq_attr {

1474 compat_long_t mq_flags; /* message queue flags */

1475 compat_long_t mq_maxmsg; /* maximum number of messages */

1476 compat_long_t mq_msgsize; /* maximum message size */

1477 compat_long_t mq_curmsgs; /* number of messages currently queued */

1478 compat_long_t __reserved[4]; /* ignored for input, zeroed for output */

1479 };

1480

1481 static inline int get_compat_mq_attr(struct mq_attr *attr,

1482 const struct compat_mq_attr __user *uattr)

1483 {

1484 struct compat_mq_attr v;

1485

1486 if (copy_from_user(&v, uattr, sizeof(*uattr)))

1487 return -EFAULT;

1488

1489 memset(attr, 0, sizeof(*attr));

1490 attr->mq_flags = v.mq_flags;

1491 attr->mq_maxmsg = v.mq_maxmsg;

1492 attr->mq_msgsize = v.mq_msgsize;

1493 attr->mq_curmsgs = v.mq_curmsgs;

1494 return 0;

1495 }

1496

1497 static inline int put_compat_mq_attr(const struct mq_attr *attr,

1498 struct compat_mq_attr __user *uattr)

1499 {

1500 struct compat_mq_attr v;

1501

1502 memset(&v, 0, sizeof(v));

1503 v.mq_flags = attr->mq_flags;

1504 v.mq_maxmsg = attr->mq_maxmsg;

1505 v.mq_msgsize = attr->mq_msgsize;

1506 v.mq_curmsgs = attr->mq_curmsgs;

1507 if (copy_to_user(uattr, &v, sizeof(*uattr)))

1508 return -EFAULT;

1509 return 0;

1510 }

1511

1512 COMPAT_SYSCALL_DEFINE4(mq_open, const char __user *, u_name,

1513 int, oflag, compat_mode_t, mode,

1514 struct compat_mq_attr __user *, u_attr)

1515 {

1516 struct mq_attr attr, *p = NULL;

1517 if (u_attr && oflag & O_CREAT) {

1518 p = &attr;

1519 if (get_compat_mq_attr(&attr, u_attr))

1520 return -EFAULT;

1521 }

1522 return do_mq_open(u_name, oflag, mode, p);

1523 }

1524

1525 COMPAT_SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,

1526 const struct compat_sigevent __user *, u_notification)

1527 {

1528 struct sigevent n, *p = NULL;

1529 if (u_notification) {

1530 if (get_compat_sigevent(&n, u_notification))

1531 return -EFAULT;

1532 if (n.sigev_notify == SIGEV_THREAD)

1533 n.sigev_value.sival_ptr = compat_ptr(n.sigev_value.sival_int);

1534 p = &n;

1535 }

1536 return do_mq_notify(mqdes, p);

1537 }

1538

1539 COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,

1540 const struct compat_mq_attr __user *, u_mqstat,

1541 struct compat_mq_attr __user *, u_omqstat)

1542 {

1543 int ret;

1544 struct mq_attr mqstat, omqstat;

1545 struct mq_attr *new = NULL, *old = NULL;

1546

1547 if (u_mqstat) {

1548 new = &mqstat;

1549 if (get_compat_mq_attr(new, u_mqstat))

1550 return -EFAULT;

1551 }

1552 if (u_omqstat)

1553 old = &omqstat;

1554

1555 ret = do_mq_getsetattr(mqdes, new, old);

1556 if (ret || !old)

1557 return ret;

1558

1559 if (put_compat_mq_attr(old, u_omqstat))

1560 return -EFAULT;

1561 return 0;

1562 }

1563 #endif

1564

1565 #ifdef CONFIG_COMPAT_32BIT_TIME

1566 static int compat_prepare_timeout(const struct old_timespec32 __user *p,

1567 struct timespec64 *ts)

1568 {

1569 if (get_old_timespec32(ts, p))

1570 return -EFAULT;

1571 if (!timespec64_valid(ts))

1572 return -EINVAL;

1573 return 0;

1574 }

1575

1576 SYSCALL_DEFINE5(mq_timedsend_time32, mqd_t, mqdes,

1577 const char __user *, u_msg_ptr,

1578 unsigned int, msg_len, unsigned int, msg_prio,

1579 const struct old_timespec32 __user *, u_abs_timeout)

1580 {

1581 struct timespec64 ts, *p = NULL;

1582 if (u_abs_timeout) {

1583 int res = compat_prepare_timeout(u_abs_timeout, &ts);

1584 if (res)

1585 return res;

1586 p = &ts;

1587 }

1588 return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p);

1589 }

1590

1591 SYSCALL_DEFINE5(mq_timedreceive_time32, mqd_t, mqdes,

1592 char __user *, u_msg_ptr,

1593 unsigned int, msg_len, unsigned int __user *, u_msg_prio,

1594 const struct old_timespec32 __user *, u_abs_timeout)

1595 {

1596 struct timespec64 ts, *p = NULL;

1597 if (u_abs_timeout) {

1598 int res = compat_prepare_timeout(u_abs_timeout, &ts);

1599 if (res)

1600 return res;

1601 p = &ts;

1602 }

1603 return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p);

1604 }

1605 #endif

1606

1607 static const struct inode_operations mqueue_dir_inode_operations = {

1608 .lookup = simple_lookup,

1609 .create = mqueue_create,

1610 .unlink = mqueue_unlink,

1611 };

1612

1613 static const struct file_operations mqueue_file_operations = {

1614 .flush = mqueue_flush_file,

1615 .poll = mqueue_poll_file,

1616 .read = mqueue_read_file,

1617 .llseek = default_llseek,

1618 };

1619

1620 static const struct super_operations mqueue_super_ops = {

1621 .alloc_inode = mqueue_alloc_inode,

1622 .free_inode = mqueue_free_inode,

1623 .evict_inode = mqueue_evict_inode,

1624 .statfs = simple_statfs,

1625 };

1626

1627 static const struct fs_context_operations mqueue_fs_context_ops = {

1628 .free = mqueue_fs_context_free,

1629 .get_tree = mqueue_get_tree,

1630 };

1631

1632 static struct file_system_type mqueue_fs_type = {

1633 .name = "mqueue",

1634 .init_fs_context = mqueue_init_fs_context,

1635 .kill_sb = kill_anon_super,

1636 .fs_flags = FS_USERNS_MOUNT,

1637 };

1638

1639 int mq_init_ns(struct ipc_namespace *ns)

1640 {

1641 struct vfsmount *m;

1642

1643 ns->mq_queues_count = 0;

1644 ns->mq_queues_max = DFLT_QUEUESMAX;

1645 ns->mq_msg_max = DFLT_MSGMAX;

1646 ns->mq_msgsize_max = DFLT_MSGSIZEMAX;

1647 ns->mq_msg_default = DFLT_MSG;

1648 ns->mq_msgsize_default = DFLT_MSGSIZE;

1649

1650 m = mq_create_mount(ns);

1651 if (IS_ERR(m))

1652 return PTR_ERR(m);

1653 ns->mq_mnt = m;

1654 return 0;

1655 }

1656

1657 void mq_clear_sbinfo(struct ipc_namespace *ns)

1658 {

1659 ns->mq_mnt->mnt_sb->s_fs_info = NULL;

1660 }

1661

1662 static int __init init_mqueue_fs(void)

1663 {

1664 int error;

1665

1666 mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",

1667 sizeof(struct mqueue_inode_info), 0,

1668 SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once);

1669 if (mqueue_inode_cachep == NULL)

1670 return -ENOMEM;

1671

1672 if (!setup_mq_sysctls(&init_ipc_ns)) {

1673 pr_warn("sysctl registration failed\n");

1674 error = -ENOMEM;

1675 goto out_kmem;

1676 }

1677

1678 error = register_filesystem(&mqueue_fs_type);

1679 if (error)

1680 goto out_sysctl;

1681

1682 spin_lock_init(&mq_lock);

1683

1684 error = mq_init_ns(&init_ipc_ns);

1685 if (error)

1686 goto out_filesystem;

1687

1688 return 0;

1689

1690 out_filesystem:

1691 unregister_filesystem(&mqueue_fs_type);

1692 out_sysctl:

1693 retire_mq_sysctls(&init_ipc_ns);

1694 out_kmem:

1695 kmem_cache_destroy(mqueue_inode_cachep);

1696 return error;

1697 }

1698

1699 device_initcall(init_mqueue_fs);