/*
 * Performance events ring-buffer code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/circ_buf.h>
#include <linux/poll.h>
#include <linux/nospec.h>

#include "internal.h"
static void perf_output_wakeup(struct perf_output_handle *handle)
{
	atomic_set(&handle->rb->poll, EPOLLIN);

	handle->event->pending_wakeup = 1;
	irq_work_queue(&handle->event->pending);
}
/*
 * We need to ensure a later event_id doesn't publish a head when a former
 * event isn't done writing. However since we need to deal with NMIs we
 * cannot fully serialize things.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
 * event completes.
 */
static void perf_output_get_handle(struct perf_output_handle *handle)
{
	struct ring_buffer *rb = handle->rb;

	preempt_disable();
	local_inc(&rb->nest);
	handle->wakeup = local_read(&rb->wakeup);
}
static void perf_output_put_handle(struct perf_output_handle *handle)
{
	struct ring_buffer *rb = handle->rb;
	unsigned long head;

again:
	head = local_read(&rb->head);

	/*
	 * IRQ/NMI can happen here and advance @rb->head, causing our
	 * load above to be stale.
	 */

	/*
	 * If this isn't the outermost nesting, we don't have to update
	 * @rb->user_page->data_head.
	 */
	if (local_read(&rb->nest) > 1) {
		local_dec(&rb->nest);
		goto out;
	}

	/*
	 * Since the mmap() consumer (userspace) can run on a different CPU:
	 *
	 *   kernel				user
	 *
	 *   if (LOAD ->data_tail) {		LOAD ->data_head
	 *			(A)		smp_rmb()	(C)
	 *	STORE $data			LOAD $data
	 *	smp_wmb()	(B)		smp_mb()	(D)
	 *	STORE ->data_head		STORE ->data_tail
	 *   }
	 *
	 * Where A pairs with D, and B pairs with C.
	 *
	 * In our case (A) is a control dependency that separates the load of
	 * the ->data_tail and the stores of $data. In case ->data_tail
	 * indicates there is no room in the buffer to store $data we do not.
	 *
	 * D needs to be a full barrier since it separates the data READ
	 * from the tail WRITE.
	 *
	 * For B a WMB is sufficient since it separates two WRITEs, and for C
	 * an RMB is sufficient since it separates two READs.
	 *
	 * See perf_output_begin().
	 */
	smp_wmb(); /* B, matches C */
	rb->user_page->data_head = head;

	/*
	 * We must publish the head before decrementing the nest count,
	 * otherwise an IRQ/NMI can publish a more recent head value and our
	 * write will (temporarily) publish a stale value.
	 */
	barrier();
	local_set(&rb->nest, 0);

	/*
	 * Ensure we decrement @rb->nest before we validate the @rb->head.
	 * Otherwise we cannot be sure we caught the 'last' nested update.
	 */
	barrier();
	if (unlikely(head != local_read(&rb->head))) {
		local_inc(&rb->nest);
		goto again;
	}

	if (handle->wakeup != local_read(&rb->wakeup))
		perf_output_wakeup(handle);

out:
	preempt_enable();
}
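/*
 * Illustrative sketch (not part of the kernel source): the matching
 * user-space consumer side of the ordering diagram above, i.e. the (C)
 * and (D) barriers. The names 'up', 'base', 'mask' and consume_record()
 * are hypothetical, and wrap-around handling is omitted; only the barrier
 * placement is the point:
 *
 *	struct perf_event_mmap_page *up = mapped_control_page;
 *	__u64 tail = up->data_tail;
 *	__u64 head = READ_ONCE(up->data_head);
 *
 *	smp_rmb();					(C), pairs with the kernel's smp_wmb() (B)
 *	while (tail < head) {
 *		struct perf_event_header *hdr = base + (tail & mask);
 *
 *		consume_record(hdr);			LOAD $data
 *		tail += hdr->size;
 *	}
 *	smp_mb();					(D), full barrier before publishing the tail
 *	WRITE_ONCE(up->data_tail, tail);		STORE ->data_tail, pairs with (A)
 */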
static __always_inline bool
ring_buffer_has_space(unsigned long head, unsigned long tail,
		      unsigned long data_size, unsigned int size,
		      bool backward)
{
	if (!backward)
		return CIRC_SPACE(head, tail, data_size) >= size;
	else
		return CIRC_SPACE(tail, head, data_size) >= size;
}
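/*
 * Worked example (illustrative only): CIRC_SPACE(head, tail, size) from
 * <linux/circ_buf.h> expands to ((tail) - ((head) + 1)) & ((size) - 1),
 * i.e. the number of bytes the writer can add without catching up with the
 * reader, always keeping one byte free. With data_size = 4096, head = 100
 * and tail = 50:
 *
 *	CIRC_SPACE(100, 50, 4096) == (50 - 101) & 4095 == 4045
 *
 * so a forward writer may reserve up to 4045 bytes. The backward case
 * simply swaps the roles of head and tail.
 */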
static __always_inline int
__perf_output_begin(struct perf_output_handle *handle,
		    struct perf_event *event, unsigned int size,
		    bool backward)
{
	struct ring_buffer *rb;
	unsigned long tail, offset, head;
	int have_lost, page_shift;
	struct {
		struct perf_event_header header;
		u64			 id;
		u64			 lost;
	} lost_event;

	rcu_read_lock();
	/*
	 * For inherited events we send all the output towards the parent.
	 */
	if (event->parent)
		event = event->parent;

	rb = rcu_dereference(event->rb);
	if (unlikely(!rb))
		goto out;

	if (unlikely(rb->paused)) {
		if (rb->nr_pages)
			local_inc(&rb->lost);
		goto out;
	}

	handle->rb    = rb;
	handle->event = event;

	have_lost = local_read(&rb->lost);
	if (unlikely(have_lost)) {
		size += sizeof(lost_event);
		if (event->attr.sample_id_all)
			size += event->id_header_size;
	}

	perf_output_get_handle(handle);

	do {
		tail = READ_ONCE(rb->user_page->data_tail);
		offset = head = local_read(&rb->head);
		if (!rb->overwrite) {
			if (unlikely(!ring_buffer_has_space(head, tail,
							    perf_data_size(rb),
							    size, backward)))
				goto fail;
		}

		/*
		 * The above forms a control dependency barrier separating the
		 * @tail load above from the data stores below. Since the @tail
		 * load is required to compute the branch to fail below.
		 *
		 * A, matches D; the full memory barrier userspace SHOULD issue
		 * after reading the data and before storing the new tail
		 * position.
		 *
		 * See perf_output_put_handle().
		 */

		if (!backward)
			head += size;
		else
			head -= size;
	} while (local_cmpxchg(&rb->head, offset, head) != offset);

	if (backward) {
		offset = head;
		head = (u64)(-head);
	}

	/*
	 * We rely on the implied barrier() by local_cmpxchg() to ensure
	 * none of the data stores below can be lifted up by the compiler.
	 */

	if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
		local_add(rb->watermark, &rb->wakeup);

	page_shift = PAGE_SHIFT + page_order(rb);

	handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
	offset &= (1UL << page_shift) - 1;
	handle->addr = rb->data_pages[handle->page] + offset;
	handle->size = (1UL << page_shift) - offset;

	if (unlikely(have_lost)) {
		struct perf_sample_data sample_data;

		lost_event.header.size = sizeof(lost_event);
		lost_event.header.type = PERF_RECORD_LOST;
		lost_event.header.misc = 0;
		lost_event.id          = event->id;
		lost_event.lost        = local_xchg(&rb->lost, 0);

		perf_event_header__init_id(&lost_event.header,
					   &sample_data, event);
		perf_output_put(handle, lost_event);
		perf_event__output_id_sample(event, handle, &sample_data);
	}

	return 0;

fail:
	local_inc(&rb->lost);
	perf_output_put_handle(handle);
out:
	rcu_read_unlock();

	return -ENOSPC;
}
int perf_output_begin_forward(struct perf_output_handle *handle,
			      struct perf_event *event, unsigned int size)
{
	return __perf_output_begin(handle, event, size, false);
}

int perf_output_begin_backward(struct perf_output_handle *handle,
			       struct perf_event *event, unsigned int size)
{
	return __perf_output_begin(handle, event, size, true);
}

int perf_output_begin(struct perf_output_handle *handle,
		      struct perf_event *event, unsigned int size)
{
	return __perf_output_begin(handle, event, size,
				   unlikely(is_write_backward(event)));
}

unsigned int perf_output_copy(struct perf_output_handle *handle,
			      const void *buf, unsigned int len)
{
	return __output_copy(handle, buf, len);
}

unsigned int perf_output_skip(struct perf_output_handle *handle,
			      unsigned int len)
{
	return __output_skip(handle, NULL, len);
}

void perf_output_end(struct perf_output_handle *handle)
{
	perf_output_put_handle(handle);
	rcu_read_unlock();
}
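/*
 * Illustrative sketch (not part of this file): the calling convention the
 * functions above implement for in-kernel record writers such as
 * perf_event_output() and perf_log_throttle(). The record layout below is
 * hypothetical; only the begin/put/end sequence is the point:
 *
 *	struct perf_output_handle handle;
 *	struct {
 *		struct perf_event_header header;
 *		u64 value;
 *	} rec;
 *
 *	rec.header.type = PERF_RECORD_SAMPLE;	or whichever record type applies
 *	rec.header.misc = 0;
 *	rec.header.size = sizeof(rec);
 *	rec.value = 42;
 *
 *	if (perf_output_begin(&handle, event, rec.header.size))
 *		return;				no space: rb->lost was bumped
 *
 *	perf_output_put(&handle, rec);		perf_output_copy(&handle, &rec, sizeof(rec))
 *	perf_output_end(&handle);		publishes data_head via perf_output_put_handle()
 */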
static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
	long max_size = perf_data_size(rb);

	if (watermark)
		rb->watermark = min(max_size, watermark);

	if (!rb->watermark)
		rb->watermark = max_size / 2;

	if (flags & RING_BUFFER_WRITABLE)
		rb->overwrite = 0;
	else
		rb->overwrite = 1;

	atomic_set(&rb->refcount, 1);

	INIT_LIST_HEAD(&rb->event_list);
	spin_lock_init(&rb->event_lock);

	/*
	 * perf_output_begin() only checks rb->paused, therefore
	 * rb->paused must be true if we have no pages for output.
	 */
	if (!rb->nr_pages)
		rb->paused = 1;
}
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
{
	/*
	 * OVERWRITE is determined by perf_aux_output_end() and can't
	 * be passed in directly.
	 */
	if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
		return;

	handle->aux_flags |= flags;
}
EXPORT_SYMBOL_GPL(perf_aux_output_flag);
/*
 * This is called before hardware starts writing to the AUX area to
 * obtain an output handle and make sure there's room in the buffer.
 * When the capture completes, call perf_aux_output_end() to commit
 * the recorded data to the buffer.
 *
 * The ordering is similar to that of perf_output_{begin,end}, with
 * the exception of (B), which should be taken care of by the pmu
 * driver, since ordering rules will differ depending on hardware.
 *
 * Call this from pmu::start(); see the comment in perf_aux_output_end()
 * about its use in pmu callbacks. Both can also be called from the PMI
 * handler if needed.
 */
void *perf_aux_output_begin(struct perf_output_handle *handle,
			    struct perf_event *event)
{
	struct perf_event *output_event = event;
	unsigned long aux_head, aux_tail;
	struct ring_buffer *rb;

	if (output_event->parent)
		output_event = output_event->parent;

	/*
	 * Since this will typically be open across pmu::add/pmu::del, we
	 * grab ring_buffer's refcount instead of holding rcu read lock
	 * to make sure it doesn't disappear under us.
	 */
	rb = ring_buffer_get(output_event);
	if (!rb)
		return NULL;

	if (!rb_has_aux(rb))
		goto err;

	/*
	 * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
	 * about to get freed, so we leave immediately.
	 *
	 * Checking rb::aux_mmap_count and rb::refcount has to be done in
	 * the same order, see perf_mmap_close. Otherwise we end up freeing
	 * aux pages in this path, which is a bug, because in_atomic().
	 */
	if (!atomic_read(&rb->aux_mmap_count))
		goto err;

	if (!atomic_inc_not_zero(&rb->aux_refcount))
		goto err;

	/*
	 * Nesting is not supported for AUX area, make sure nested
	 * writers are caught early
	 */
	if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
		goto err_put;

	aux_head = rb->aux_head;

	handle->rb = rb;
	handle->event = event;
	handle->head = aux_head;
	handle->size = 0;
	handle->aux_flags = 0;

	/*
	 * In overwrite mode, AUX data stores do not depend on aux_tail,
	 * therefore (A) control dependency barrier does not exist. The
	 * (B) <-> (C) ordering is still observed by the pmu driver.
	 */
	if (!rb->aux_overwrite) {
		aux_tail = READ_ONCE(rb->user_page->aux_tail);
		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
		if (aux_head - aux_tail < perf_aux_size(rb))
			handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));

		/*
		 * handle->size computation depends on aux_tail load; this forms a
		 * control dependency barrier separating aux_tail load from aux data
		 * store that will be enabled on successful return
		 */
		if (!handle->size) { /* A, matches D */
			event->pending_disable = smp_processor_id();
			perf_output_wakeup(handle);
			local_set(&rb->aux_nest, 0);
			goto err_put;
		}
	}

	return handle->rb->aux_priv;

err_put:
	/* can't be last */
	rb_free_aux(rb);

err:
	ring_buffer_put(rb);
	handle->event = NULL;

	return NULL;
}
EXPORT_SYMBOL_GPL(perf_aux_output_begin);
static __always_inline bool rb_need_aux_wakeup(struct ring_buffer *rb)
{
	if (rb->aux_overwrite)
		return false;

	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
		return true;
	}

	return false;
}
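/*
 * Worked example (illustrative only): with aux_watermark = 16384 and
 * aux_wakeup = 0, a hardware write that advances aux_head to 40000 gives
 * 40000 - 0 >= 16384, so a wakeup is due and aux_wakeup becomes
 * rounddown(40000, 16384) == 32768; the next wakeup will not fire before
 * aux_head reaches 32768 + 16384 == 49152.
 */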
/*
 * Commit the data written by hardware into the ring buffer by adjusting
 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
 * pmu driver's responsibility to observe ordering rules of the hardware,
 * so that all the data is externally visible before this is called.
 *
 * Note: this has to be called from pmu::stop() callback, as the assumption
 * of the AUX buffer management code is that after pmu::stop(), the AUX
 * transaction must be stopped and therefore drop the AUX reference count.
 */
void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
{
	bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
	struct ring_buffer *rb = handle->rb;
	unsigned long aux_head;

	/* in overwrite mode, driver provides aux_head via handle */
	if (rb->aux_overwrite) {
		handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;

		aux_head = handle->head;
		rb->aux_head = aux_head;
	} else {
		handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;

		aux_head = rb->aux_head;
		rb->aux_head += size;
	}

	if (size || handle->aux_flags) {
		/*
		 * Only send RECORD_AUX if we have something useful to communicate
		 */
		perf_event_aux_event(handle->event, aux_head, size,
				     handle->aux_flags);
	}

	rb->user_page->aux_head = rb->aux_head;
	if (rb_need_aux_wakeup(rb))
		wakeup = true;

	if (wakeup) {
		if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
			handle->event->pending_disable = smp_processor_id();
		perf_output_wakeup(handle);
	}

	handle->event = NULL;

	local_set(&rb->aux_nest, 0);
	/* can't be last */
	rb_free_aux(rb);
	ring_buffer_put(rb);
}
EXPORT_SYMBOL_GPL(perf_aux_output_end);
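/*
 * Illustrative sketch (not part of this file): the shape of a PMU driver
 * using the AUX handle API above, as the comments before
 * perf_aux_output_begin() describe. The per-cpu handle 'my_handle' and the
 * hw_start_trace()/hw_stop_trace()/hw_buffer_full() helpers are
 * hypothetical; real users include the Intel PT and ARM SPE drivers:
 *
 *	static void my_pmu_start(struct perf_event *event, int flags)
 *	{
 *		struct perf_output_handle *h = this_cpu_ptr(&my_handle);
 *		void *buf;
 *
 *		buf = perf_aux_output_begin(h, event);	reserves h->size bytes at h->head
 *		if (!buf)
 *			return;				no AUX buffer, or no room left
 *
 *		hw_start_trace(buf, h->head, h->size);
 *	}
 *
 *	static void my_pmu_stop(struct perf_event *event, int flags)
 *	{
 *		struct perf_output_handle *h = this_cpu_ptr(&my_handle);
 *		unsigned long written = hw_stop_trace();	data now globally visible (B)
 *
 *		if (hw_buffer_full())
 *			perf_aux_output_flag(h, PERF_AUX_FLAG_TRUNCATED);
 *		perf_aux_output_end(h, written);	posts PERF_RECORD_AUX, drops the aux ref
 *	}
 */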
/*
 * Skip over a given number of bytes in the AUX buffer, due to, for example,
 * hardware's alignment constraints.
 */
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
{
	struct ring_buffer *rb = handle->rb;

	if (size > handle->size)
		return -ENOSPC;

	rb->aux_head += size;

	rb->user_page->aux_head = rb->aux_head;
	if (rb_need_aux_wakeup(rb)) {
		perf_output_wakeup(handle);
		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
	}

	handle->head = rb->aux_head;
	handle->size -= size;

	return 0;
}
EXPORT_SYMBOL_GPL(perf_aux_output_skip);
void *perf_get_aux(struct perf_output_handle *handle)
{
	/* this is only valid between perf_aux_output_begin and *_end */
	if (!handle->event)
		return NULL;

	return handle->rb->aux_priv;
}
EXPORT_SYMBOL_GPL(perf_get_aux);
#define PERF_AUX_GFP	(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)

static struct page *rb_alloc_aux_page(int node, int order)
{
	struct page *page;

	if (order > MAX_ORDER)
		order = MAX_ORDER;

	do {
		page = alloc_pages_node(node, PERF_AUX_GFP, order);
	} while (!page && order--);

	if (page && order) {
		/*
		 * Communicate the allocation size to the driver:
		 * if we managed to secure a high-order allocation,
		 * set its first page's private to this order;
		 * !PagePrivate(page) means it's just a normal page.
		 */
		split_page(page, order);
		SetPagePrivate(page);
		set_page_private(page, order);
	}

	return page;
}
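/*
 * Illustrative sketch (not part of this file): how an AUX consumer, such as
 * a pmu::setup_aux() callback, can recover the chunk sizes recorded by
 * rb_alloc_aux_page() above. The function name and the add_hw_sg_entry()
 * helper are hypothetical:
 *
 *	static void *my_setup_aux(struct perf_event *event, void **pages,
 *				  int nr_pages, bool overwrite)
 *	{
 *		struct page *page;
 *		int pg, order;
 *
 *		for (pg = 0; pg < nr_pages; pg += 1 << order) {
 *			page = virt_to_page(pages[pg]);
 *			order = PagePrivate(page) ? page_private(page) : 0;
 *			add_hw_sg_entry(page, PAGE_SIZE << order);
 *		}
 *
 *		return my_private_data;
 *	}
 */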
static void rb_free_aux_page(struct ring_buffer *rb, int idx)
{
	struct page *page = virt_to_page(rb->aux_pages[idx]);

	ClearPagePrivate(page);
	page->mapping = NULL;
	__free_page(page);
}
static void __rb_free_aux(struct ring_buffer *rb)
{
	int pg;

	/*
	 * Should never happen, the last reference should be dropped from
	 * perf_mmap_close() path, which first stops aux transactions (which
	 * in turn are the atomic holders of aux_refcount) and then does the
	 * last rb_free_aux().
	 */
	WARN_ON_ONCE(in_atomic());

	if (rb->aux_priv) {
		rb->free_aux(rb->aux_priv);
		rb->free_aux = NULL;
		rb->aux_priv = NULL;
	}

	if (rb->aux_nr_pages) {
		for (pg = 0; pg < rb->aux_nr_pages; pg++)
			rb_free_aux_page(rb, pg);

		kfree(rb->aux_pages);
		rb->aux_nr_pages = 0;
	}
}
int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
		 pgoff_t pgoff, int nr_pages, long watermark, int flags)
{
	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
	int ret = -ENOMEM, max_order = 0;

	if (!has_aux(event))
		return -EOPNOTSUPP;

	if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
		/*
		 * We need to start with the max_order that fits in nr_pages,
		 * not the other way around, hence ilog2() and not get_order.
		 */
		max_order = ilog2(nr_pages);

		/*
		 * PMU requests more than one contiguous chunks of memory
		 * for SW double buffering
		 */
		if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
		    !overwrite) {
			if (!max_order)
				return -EINVAL;

			max_order--;
		}
	}

	rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
				     node);
	if (!rb->aux_pages)
		return -ENOMEM;

	rb->free_aux = event->pmu->free_aux;
	for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
		struct page *page;
		int last, order;

		order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
		page = rb_alloc_aux_page(node, order);
		if (!page)
			goto out;

		for (last = rb->aux_nr_pages + (1 << page_private(page));
		     last > rb->aux_nr_pages; rb->aux_nr_pages++)
			rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
	}

	/*
	 * In overwrite mode, PMUs that don't support SG may not handle more
	 * than one contiguous allocation, since they rely on PMI to do double
	 * buffering. In this case, the entire buffer has to be one contiguous
	 * chunk.
	 */
	if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
	    overwrite) {
		struct page *page = virt_to_page(rb->aux_pages[0]);

		if (page_private(page) != max_order)
			goto out;
	}

	rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
					     overwrite);
	if (!rb->aux_priv)
		goto out;

	ret = 0;

	/*
	 * aux_pages (and pmu driver's private data, aux_priv) will be
	 * referenced in both producer's and consumer's contexts, thus
	 * we keep a refcount here to make sure either of the two can
	 * reference them safely.
	 */
	atomic_set(&rb->aux_refcount, 1);

	rb->aux_overwrite = overwrite;
	rb->aux_watermark = watermark;

	if (!rb->aux_watermark && !rb->aux_overwrite)
		rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);

out:
	if (!ret)
		rb->aux_pgoff = pgoff;
	else
		__rb_free_aux(rb);

	return ret;
}
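/*
 * Worked example (illustrative only): for a PERF_PMU_CAP_AUX_NO_SG PMU with
 * nr_pages = 64, max_order starts at ilog2(64) == 6, i.e. a single 64-page
 * chunk would be acceptable. If the PMU also sets
 * PERF_PMU_CAP_AUX_SW_DOUBLEBUF and the buffer is not in overwrite mode,
 * max_order drops to 5, so the allocation loop in rb_alloc_aux() produces
 * at least two 32-page chunks for software to ping-pong between.
 */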
void rb_free_aux(struct ring_buffer *rb)
{
	if (atomic_dec_and_test(&rb->aux_refcount))
		__rb_free_aux(rb);
}
#ifndef CONFIG_PERF_USE_VMALLOC

/*
 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
 */

static struct page *
__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
	if (pgoff > rb->nr_pages)
		return NULL;

	if (pgoff == 0)
		return virt_to_page(rb->user_page);

	return virt_to_page(rb->data_pages[pgoff - 1]);
}
static void *perf_mmap_alloc_page(int cpu)
{
	struct page *page;
	int node;

	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
	if (!page)
		return NULL;

	return page_address(page);
}
struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
	struct ring_buffer *rb;
	unsigned long size;
	int i;

	size = sizeof(struct ring_buffer);
	size += nr_pages * sizeof(void *);

	if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
		goto fail;

	rb = kzalloc(size, GFP_KERNEL);
	if (!rb)
		goto fail;

	rb->user_page = perf_mmap_alloc_page(cpu);
	if (!rb->user_page)
		goto fail_user_page;

	for (i = 0; i < nr_pages; i++) {
		rb->data_pages[i] = perf_mmap_alloc_page(cpu);
		if (!rb->data_pages[i])
			goto fail_data_pages;
	}

	rb->nr_pages = nr_pages;

	ring_buffer_init(rb, watermark, flags);

	return rb;

fail_data_pages:
	for (i--; i >= 0; i--)
		free_page((unsigned long)rb->data_pages[i]);

	free_page((unsigned long)rb->user_page);

fail_user_page:
	kfree(rb);

fail:
	return NULL;
}
static void perf_mmap_free_page(unsigned long addr)
{
	struct page *page = virt_to_page((void *)addr);

	page->mapping = NULL;
	__free_page(page);
}
void rb_free(struct ring_buffer *rb)
{
	int i;

	perf_mmap_free_page((unsigned long)rb->user_page);
	for (i = 0; i < rb->nr_pages; i++)
		perf_mmap_free_page((unsigned long)rb->data_pages[i]);
	kfree(rb);
}
#else
static int data_page_nr(struct ring_buffer *rb)
{
	return rb->nr_pages << page_order(rb);
}
static struct page *
__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
	/* The '>' counts in the user page. */
	if (pgoff > data_page_nr(rb))
		return NULL;

	return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
}
static void perf_mmap_unmark_page(void *addr)
{
	struct page *page = vmalloc_to_page(addr);

	page->mapping = NULL;
}
static void rb_free_work(struct work_struct *work)
{
	struct ring_buffer *rb;
	void *base;
	int i, nr;

	rb = container_of(work, struct ring_buffer, work);
	nr = data_page_nr(rb);

	base = rb->user_page;
	/* The '<=' counts in the user page. */
	for (i = 0; i <= nr; i++)
		perf_mmap_unmark_page(base + (i * PAGE_SIZE));

	vfree(base);
	kfree(rb);
}
void rb_free(struct ring_buffer *rb)
{
	schedule_work(&rb->work);
}
struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
	struct ring_buffer *rb;
	unsigned long size;
	void *all_buf;

	size = sizeof(struct ring_buffer);
	size += sizeof(void *);

	rb = kzalloc(size, GFP_KERNEL);
	if (!rb)
		goto fail;

	INIT_WORK(&rb->work, rb_free_work);

	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
	if (!all_buf)
		goto fail_all_buf;

	rb->user_page = all_buf;
	rb->data_pages[0] = all_buf + PAGE_SIZE;
	if (nr_pages) {
		rb->nr_pages = 1;
		rb->page_order = ilog2(nr_pages);
	}

	ring_buffer_init(rb, watermark, flags);

	return rb;

fail_all_buf:
	kfree(rb);

fail:
	return NULL;
}

#endif
struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
	if (rb->aux_nr_pages) {
		/* above AUX space */
		if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
			return NULL;

		/* AUX space */
		if (pgoff >= rb->aux_pgoff) {
			int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
			return virt_to_page(rb->aux_pages[aux_pgoff]);
		}
	}

	return __perf_mmap_to_page(rb, pgoff);
}