arch/powerpc/lib/qspinlock.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 #include <linux/bug.h>
3 #include <linux/compiler.h>
4 #include <linux/export.h>
5 #include <linux/percpu.h>
6 #include <linux/processor.h>
7 #include <linux/smp.h>
8 #include <linux/topology.h>
9 #include <linux/sched/clock.h>
10 #include <asm/qspinlock.h>
11 #include <asm/paravirt.h>
12
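/*
 * Four nodes should cover each context that can take a spinlock on a CPU
 * (task, soft IRQ, hard IRQ, NMI), mirroring the generic qspinlock's
 * nesting assumption.
 */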
13 #define MAX_NODES 4
14
15 struct qnode {
16 struct qnode *next;
17 struct qspinlock *lock;
18 int cpu;
19 int yield_cpu;
20 u8 locked; /* 1 if lock acquired */
21 };
22
23 struct qnodes {
24 int count;
25 struct qnode nodes[MAX_NODES];
26 };
27
28 /* Tuning parameters */
29 static int steal_spins __read_mostly = (1 << 5);
30 static int remote_steal_spins __read_mostly = (1 << 2);
31 #if _Q_SPIN_TRY_LOCK_STEAL == 1
32 static const bool maybe_stealers = true;
33 #else
34 static bool maybe_stealers __read_mostly = true;
35 #endif
36 static int head_spins __read_mostly = (1 << 8);
37
38 static bool pv_yield_owner __read_mostly = true;
39 static bool pv_yield_allow_steal __read_mostly = false;
40 static bool pv_spin_on_preempted_owner __read_mostly = false;
41 static bool pv_sleepy_lock __read_mostly = true;
42 static bool pv_sleepy_lock_sticky __read_mostly = false;
43 static u64 pv_sleepy_lock_interval_ns __read_mostly = 0;
44 static int pv_sleepy_lock_factor __read_mostly = 256;
45 static bool pv_yield_prev __read_mostly = true;
46 static bool pv_yield_propagate_owner __read_mostly = true;
47 static bool pv_prod_head __read_mostly = false;
48
49 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
50 static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock);
51
52 #if _Q_SPIN_SPEC_BARRIER == 1
53 #define spec_barrier() do { asm volatile("ori 31,31,0" ::: "memory"); } while (0)
54 #else
55 #define spec_barrier() do { } while (0)
56 #endif
57
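/*
 * "Sleepy" heuristic (sketch): a lock recently seen held by a preempted
 * owner is flagged with _Q_SLEEPY_VAL, and for pv_sleepy_lock_interval_ns
 * afterwards this CPU treats locks as sleepy; the spin limits above are
 * then multiplied by pv_sleepy_lock_factor so waiters spin longer before
 * queueing or giving up on stealing.
 */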
58 static __always_inline bool recently_sleepy(void)
59 {
60 /* pv_sleepy_lock is true when this is called */
61 if (pv_sleepy_lock_interval_ns) {
62 u64 seen = this_cpu_read(sleepy_lock_seen_clock);
63
64 if (seen) {
65 u64 delta = sched_clock() - seen;
66 if (delta < pv_sleepy_lock_interval_ns)
67 return true;
68 this_cpu_write(sleepy_lock_seen_clock, 0);
69 }
70 }
71
72 return false;
73 }
74
75 static __always_inline int get_steal_spins(bool paravirt, bool sleepy)
76 {
77 if (paravirt && sleepy)
78 return steal_spins * pv_sleepy_lock_factor;
79 else
80 return steal_spins;
81 }
82
83 static __always_inline int get_remote_steal_spins(bool paravirt, bool sleepy)
84 {
85 if (paravirt && sleepy)
86 return remote_steal_spins * pv_sleepy_lock_factor;
87 else
88 return remote_steal_spins;
89 }
90
91 static __always_inline int get_head_spins(bool paravirt, bool sleepy)
92 {
93 if (paravirt && sleepy)
94 return head_spins * pv_sleepy_lock_factor;
95 else
96 return head_spins;
97 }
98
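/*
 * The tail field holds the queueing CPU biased by +1, so a tail of 0 can
 * mean "no CPU queued". For example, CPU 2 is published as
 * (2 + 1) << _Q_TAIL_CPU_OFFSET and decodes back to 2.
 */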
99 static inline u32 encode_tail_cpu(int cpu)
100 {
101 return (cpu + 1) << _Q_TAIL_CPU_OFFSET;
102 }
103
104 static inline int decode_tail_cpu(u32 val)
105 {
106 return (val >> _Q_TAIL_CPU_OFFSET) - 1;
107 }
108
109 static inline int get_owner_cpu(u32 val)
110 {
111 return (val & _Q_OWNER_CPU_MASK) >> _Q_OWNER_CPU_OFFSET;
112 }
113
114 /*
115 * Try to acquire the lock if it was not already locked. If the tail matches
116 * mytail then clear it, otherwise leave it unchanged. Return previous value.
117 *
118 * This is used by the head of the queue to acquire the lock and clean up
119 * its tail if it was the last one queued.
120 */
121 static __always_inline u32 trylock_clean_tail(struct qspinlock *lock, u32 tail)
122 {
123 u32 newval = queued_spin_encode_locked_val();
124 u32 prev, tmp;
125
126 asm volatile(
127 "1: lwarx %0,0,%2,%7 # trylock_clean_tail \n"
128 /* This test is necessary if there could be stealers */
129 " andi. %1,%0,%5 \n"
130 " bne 3f \n"
131 /* Test whether the lock tail == mytail */
132 " and %1,%0,%6 \n"
133 " cmpw 0,%1,%3 \n"
134 /* Merge the new locked value */
135 " or %1,%1,%4 \n"
136 " bne 2f \n"
137 /* If the lock tail matched, then clear it, otherwise leave it. */
138 " andc %1,%1,%6 \n"
139 "2: stwcx. %1,0,%2 \n"
140 " bne- 1b \n"
141 "\t" PPC_ACQUIRE_BARRIER " \n"
142 "3: \n"
143 : "=&r" (prev), "=&r" (tmp)
144 : "r" (&lock->val), "r"(tail), "r" (newval),
145 "i" (_Q_LOCKED_VAL),
146 "r" (_Q_TAIL_CPU_MASK),
147 "i" (_Q_SPIN_EH_HINT)
148 : "cr0", "memory");
149
150 return prev;
151 }
152
153 /*
154 * Publish our tail, replacing previous tail. Return previous value.
155 *
156 * This provides a release barrier for publishing the node; it pairs with the
157 * acquire barrier in get_tail_qnode() when the next CPU finds this tail
158 * value.
159 */
160 static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail)
161 {
162 u32 prev, tmp;
163
164 kcsan_release();
165
166 asm volatile(
167 "\t" PPC_RELEASE_BARRIER " \n"
168 "1: lwarx %0,0,%2 # publish_tail_cpu \n"
169 " andc %1,%0,%4 \n"
170 " or %1,%1,%3 \n"
171 " stwcx. %1,0,%2 \n"
172 " bne- 1b \n"
173 : "=&r" (prev), "=&r"(tmp)
174 : "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK)
175 : "cr0", "memory");
176
177 return prev;
178 }
179
180 static __always_inline u32 set_mustq(struct qspinlock *lock)
181 {
182 u32 prev;
183
184 asm volatile(
185 "1: lwarx %0,0,%1 # set_mustq \n"
186 " or %0,%0,%2 \n"
187 " stwcx. %0,0,%1 \n"
188 " bne- 1b \n"
189 : "=&r" (prev)
190 : "r" (&lock->val), "r" (_Q_MUST_Q_VAL)
191 : "cr0", "memory");
192
193 return prev;
194 }
195
196 static __always_inline u32 clear_mustq(struct qspinlock *lock)
197 {
198 u32 prev;
199
200 asm volatile(
201 "1: lwarx %0,0,%1 # clear_mustq \n"
202 " andc %0,%0,%2 \n"
203 " stwcx. %0,0,%1 \n"
204 " bne- 1b \n"
205 : "=&r" (prev)
206 : "r" (&lock->val), "r" (_Q_MUST_Q_VAL)
207 : "cr0", "memory");
208
209 return prev;
210 }
211
212 static __always_inline bool try_set_sleepy(struct qspinlock *lock, u32 old)
213 {
214 u32 prev;
215 u32 new = old | _Q_SLEEPY_VAL;
216
217 BUG_ON(!(old & _Q_LOCKED_VAL));
218 BUG_ON(old & _Q_SLEEPY_VAL);
219
220 asm volatile(
221 "1: lwarx %0,0,%1 # try_set_sleepy \n"
222 " cmpw 0,%0,%2 \n"
223 " bne- 2f \n"
224 " stwcx. %3,0,%1 \n"
225 " bne- 1b \n"
226 "2: \n"
227 : "=&r" (prev)
228 : "r" (&lock->val), "r"(old), "r" (new)
229 : "cr0", "memory");
230
231 return likely(prev == old);
232 }
233
234 static __always_inline void seen_sleepy_owner(struct qspinlock *lock, u32 val)
235 {
236 if (pv_sleepy_lock) {
237 if (pv_sleepy_lock_interval_ns)
238 this_cpu_write(sleepy_lock_seen_clock, sched_clock());
239 if (!(val & _Q_SLEEPY_VAL))
240 try_set_sleepy(lock, val);
241 }
242 }
243
244 static __always_inline void seen_sleepy_lock(void)
245 {
246 if (pv_sleepy_lock && pv_sleepy_lock_interval_ns)
247 this_cpu_write(sleepy_lock_seen_clock, sched_clock());
248 }
249
250 static __always_inline void seen_sleepy_node(struct qspinlock *lock, u32 val)
251 {
252 if (pv_sleepy_lock) {
253 if (pv_sleepy_lock_interval_ns)
254 this_cpu_write(sleepy_lock_seen_clock, sched_clock());
255 if (val & _Q_LOCKED_VAL) {
256 if (!(val & _Q_SLEEPY_VAL))
257 try_set_sleepy(lock, val);
258 }
259 }
260 }
261
262 static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
263 {
264 int cpu = decode_tail_cpu(val);
265 struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu);
266 int idx;
267
268 /*
269 * After publishing the new tail and finding a previous tail in the
270 * previous val (which is the control dependency), this barrier
271 * orders the release barrier in publish_tail_cpu performed by the
272 * last CPU, with subsequently looking at its qnode structures
273 * after the barrier.
274 */
275 smp_acquire__after_ctrl_dep();
276
277 for (idx = 0; idx < MAX_NODES; idx++) {
278 struct qnode *qnode = &qnodesp->nodes[idx];
279 if (qnode->lock == lock)
280 return qnode;
281 }
282
283 BUG();
284 }
285
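/*
 * Yield heuristic (sketch): sample the owner's yield count, which is odd
 * while the owner vCPU is preempted; re-check that the lock word has not
 * changed, then ask the hypervisor to run the owner instead of us. Passing
 * the sampled count lets the hypervisor ignore the request if the owner
 * has already run again in the meantime.
 */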
286 /* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */
287 static __always_inline bool __yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt, bool mustq)
288 {
289 int owner;
290 u32 yield_count;
291 bool preempted = false;
292
293 BUG_ON(!(val & _Q_LOCKED_VAL));
294
295 if (!paravirt)
296 goto relax;
297
298 if (!pv_yield_owner)
299 goto relax;
300
301 owner = get_owner_cpu(val);
302 yield_count = yield_count_of(owner);
303
304 if ((yield_count & 1) == 0)
305 goto relax; /* owner vcpu is running */
306
307 spin_end();
308
309 seen_sleepy_owner(lock, val);
310 preempted = true;
311
312 /*
313 * Read the lock word after sampling the yield count. On the other side
314 * there may not be a wmb, because the yield count update is done by
315 * hypervisor preemption and the value update by the OS; however, this
316 * ordering might still reduce the chance of out-of-order accesses and
317 * improve the heuristic.
318 */
319 smp_rmb();
320
321 if (READ_ONCE(lock->val) == val) {
322 if (mustq)
323 clear_mustq(lock);
324 yield_to_preempted(owner, yield_count);
325 if (mustq)
326 set_mustq(lock);
327 spin_begin();
328
329 /* Don't relax if we yielded. Maybe we should? */
330 return preempted;
331 }
332 spin_begin();
333 relax:
334 spin_cpu_relax();
335
336 return preempted;
337 }
338
339 /* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */
340 static __always_inline bool yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt)
341 {
342 return __yield_to_locked_owner(lock, val, paravirt, false);
343 }
344
345 /* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */
346 static __always_inline bool yield_head_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt)
347 {
348 bool mustq = false;
349
350 if ((val & _Q_MUST_Q_VAL) && pv_yield_allow_steal)
351 mustq = true;
352
353 return __yield_to_locked_owner(lock, val, paravirt, mustq);
354 }
355
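/*
 * Owner propagation (sketch): when the queue head finds the lock owner
 * preempted, it records the owner's CPU in its successor's yield_cpu so
 * that waiters further back in the queue can direct their yields at the
 * lock owner rather than only at their (possibly running) predecessor.
 */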
356 static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt)
357 {
358 struct qnode *next;
359 int owner;
360
361 if (!paravirt)
362 return;
363 if (!pv_yield_propagate_owner)
364 return;
365
366 owner = get_owner_cpu(val);
367 if (*set_yield_cpu == owner)
368 return;
369
370 next = READ_ONCE(node->next);
371 if (!next)
372 return;
373
374 if (vcpu_is_preempted(owner)) {
375 next->yield_cpu = owner;
376 *set_yield_cpu = owner;
377 } else if (*set_yield_cpu != -1) {
378 next->yield_cpu = owner;
379 *set_yield_cpu = owner;
380 }
381 }
382
383 /* Called inside spin_begin() */
384 static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt)
385 {
386 int prev_cpu = decode_tail_cpu(val);
387 u32 yield_count;
388 int yield_cpu;
389 bool preempted = false;
390
391 if (!paravirt)
392 goto relax;
393
394 if (!pv_yield_propagate_owner)
395 goto yield_prev;
396
397 yield_cpu = READ_ONCE(node->yield_cpu);
398 if (yield_cpu == -1) {
399 /* Propagate back the -1 CPU */
400 if (node->next && node->next->yield_cpu != -1)
401 node->next->yield_cpu = yield_cpu;
402 goto yield_prev;
403 }
404
405 yield_count = yield_count_of(yield_cpu);
406 if ((yield_count & 1) == 0)
407 goto yield_prev; /* owner vcpu is running */
408
409 spin_end();
410
411 preempted = true;
412 seen_sleepy_node(lock, val);
413
414 smp_rmb();
415
416 if (yield_cpu == node->yield_cpu) {
417 if (node->next && node->next->yield_cpu != yield_cpu)
418 node->next->yield_cpu = yield_cpu;
419 yield_to_preempted(yield_cpu, yield_count);
420 spin_begin();
421 return preempted;
422 }
423 spin_begin();
424
425 yield_prev:
426 if (!pv_yield_prev)
427 goto relax;
428
429 yield_count = yield_count_of(prev_cpu);
430 if ((yield_count & 1) == 0)
431 goto relax; /* owner vcpu is running */
432
433 spin_end();
434
435 preempted = true;
436 seen_sleepy_node(lock, val);
437
438 smp_rmb(); /* See __yield_to_locked_owner comment */
439
440 if (!READ_ONCE(node->locked)) {
441 yield_to_preempted(prev_cpu, yield_count);
442 spin_begin();
443 return preempted;
444 }
445 spin_begin();
446
447 relax:
448 spin_cpu_relax();
449
450 return preempted;
451 }
452
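/*
 * Worked example with the default tunables above: steal_spins = 1 << 5 and
 * remote_steal_spins = 1 << 2, so a would-be stealer gives up after 32
 * iterations, or after only 4 if the current owner sits on a different
 * NUMA node (sleepy paravirt locks scale both limits by
 * pv_sleepy_lock_factor).
 */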
453 static __always_inline bool steal_break(u32 val, int iters, bool paravirt, bool sleepy)
454 {
455 if (iters >= get_steal_spins(paravirt, sleepy))
456 return true;
457
458 if (IS_ENABLED(CONFIG_NUMA) &&
459 (iters >= get_remote_steal_spins(paravirt, sleepy))) {
460 int cpu = get_owner_cpu(val);
461 if (numa_node_id() != cpu_to_node(cpu))
462 return true;
463 }
464 return false;
465 }
466
467 static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt)
468 {
469 bool seen_preempted = false;
470 bool sleepy = false;
471 int iters = 0;
472 u32 val;
473
474 if (!steal_spins) {
475 /* XXX: should spin_on_preempted_owner do anything here? */
476 return false;
477 }
478
479 /* Attempt to steal the lock */
480 spin_begin();
481 do {
482 bool preempted = false;
483
484 val = READ_ONCE(lock->val);
485 if (val & _Q_MUST_Q_VAL)
486 break;
487 spec_barrier();
488
489 if (unlikely(!(val & _Q_LOCKED_VAL))) {
490 spin_end();
491 if (__queued_spin_trylock_steal(lock))
492 return true;
493 spin_begin();
494 } else {
495 preempted = yield_to_locked_owner(lock, val, paravirt);
496 }
497
498 if (paravirt && pv_sleepy_lock) {
499 if (!sleepy) {
500 if (val & _Q_SLEEPY_VAL) {
501 seen_sleepy_lock();
502 sleepy = true;
503 } else if (recently_sleepy()) {
504 sleepy = true;
505 }
506 }
507 if (pv_sleepy_lock_sticky && seen_preempted &&
508 !(val & _Q_SLEEPY_VAL)) {
509 if (try_set_sleepy(lock, val))
510 val |= _Q_SLEEPY_VAL;
511 }
512 }
513
514 if (preempted) {
515 seen_preempted = true;
516 sleepy = true;
517 if (!pv_spin_on_preempted_owner)
518 iters++;
519 /*
520 * With pv_spin_on_preempted_owner, don't increase iters
521 * while the owner is preempted -- by definition we won't
522 * interfere with it. This could introduce some latency if
523 * we continually observe preempted owners, but hopefully
524 * that's a rare corner case of a badly oversubscribed
525 * system.
526 */
527 } else {
528 iters++;
529 }
530 } while (!steal_break(val, iters, paravirt, sleepy));
531
532 spin_end();
533
534 return false;
535 }
536
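/*
 * MCS queueing (sketch of the flow below): grab a per-CPU qnode, publish
 * its tail in the lock word, and if another CPU was already tail, link in
 * behind it and spin on our own node->locked. Once at the head, spin on
 * the lock word itself until it is unlocked, acquire it with
 * trylock_clean_tail(), and finally hand node->locked = 1 to the next
 * waiter (if any) before releasing the qnode.
 */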
537 static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, bool paravirt)
538 {
539 struct qnodes *qnodesp;
540 struct qnode *next, *node;
541 u32 val, old, tail;
542 bool seen_preempted = false;
543 bool sleepy = false;
544 bool mustq = false;
545 int idx;
546 int set_yield_cpu = -1;
547 int iters = 0;
548
549 BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
550
551 qnodesp = this_cpu_ptr(&qnodes);
552 if (unlikely(qnodesp->count >= MAX_NODES)) {
553 spec_barrier();
554 while (!queued_spin_trylock(lock))
555 cpu_relax();
556 return;
557 }
558
559 idx = qnodesp->count++;
560 /*
561 * Ensure that we increment the head node->count before initialising
562 * the actual node. If the compiler is kind enough to reorder these
563 * stores, then an IRQ could overwrite our assignments.
564 */
565 barrier();
566 node = &qnodesp->nodes[idx];
567 node->next = NULL;
568 node->lock = lock;
569 node->cpu = smp_processor_id();
570 node->yield_cpu = -1;
571 node->locked = 0;
572
573 tail = encode_tail_cpu(node->cpu);
574
575 /*
576 * Assign all attributes of a node before it can be published.
577 * Issues an lwsync, serving as a release barrier, as well as a
578 * compiler barrier.
579 */
580 old = publish_tail_cpu(lock, tail);
581
582 /*
583 * If there was a previous node; link it and wait until reaching the
584 * head of the waitqueue.
585 */
586 if (old & _Q_TAIL_CPU_MASK) {
587 struct qnode *prev = get_tail_qnode(lock, old);
588
589 /* Link @node into the waitqueue. */
590 WRITE_ONCE(prev->next, node);
591
592 /* Wait for mcs node lock to be released */
593 spin_begin();
594 while (!READ_ONCE(node->locked)) {
595 spec_barrier();
596
597 if (yield_to_prev(lock, node, old, paravirt))
598 seen_preempted = true;
599 }
600 spec_barrier();
601 spin_end();
602
603 /* Clear out stale propagated yield_cpu */
604 if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1)
605 node->yield_cpu = -1;
606
607 smp_rmb(); /* acquire barrier for the mcs lock */
608
609 /*
610 * Generic qspinlocks have this prefetch here, but it seems
611 * like it could cause additional line transitions because
612 * the waiter will keep loading from it.
613 */
614 if (_Q_SPIN_PREFETCH_NEXT) {
615 next = READ_ONCE(node->next);
616 if (next)
617 prefetchw(next);
618 }
619 }
620
621 /* We're at the head of the waitqueue, wait for the lock. */
622 again:
623 spin_begin();
624 for (;;) {
625 bool preempted;
626
627 val = READ_ONCE(lock->val);
628 if (!(val & _Q_LOCKED_VAL))
629 break;
630 spec_barrier();
631
632 if (paravirt && pv_sleepy_lock && maybe_stealers) {
633 if (!sleepy) {
634 if (val & _Q_SLEEPY_VAL) {
635 seen_sleepy_lock();
636 sleepy = true;
637 } else if (recently_sleepy()) {
638 sleepy = true;
639 }
640 }
641 if (pv_sleepy_lock_sticky && seen_preempted &&
642 !(val & _Q_SLEEPY_VAL)) {
643 if (try_set_sleepy(lock, val))
644 val |= _Q_SLEEPY_VAL;
645 }
646 }
647
648 propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
649 preempted = yield_head_to_locked_owner(lock, val, paravirt);
650 if (!maybe_stealers)
651 continue;
652
653 if (preempted)
654 seen_preempted = true;
655
656 if (paravirt && preempted) {
657 sleepy = true;
658
659 if (!pv_spin_on_preempted_owner)
660 iters++;
661 } else {
662 iters++;
663 }
664
665 if (!mustq && iters >= get_head_spins(paravirt, sleepy)) {
666 mustq = true;
667 set_mustq(lock);
668 val |= _Q_MUST_Q_VAL;
669 }
670 }
671 spec_barrier();
672 spin_end();
673
674 /* If we're the last queued, must clean up the tail. */
675 old = trylock_clean_tail(lock, tail);
676 if (unlikely(old & _Q_LOCKED_VAL)) {
677 BUG_ON(!maybe_stealers);
678 goto again; /* Can only be true if maybe_stealers. */
679 }
680
681 if ((old & _Q_TAIL_CPU_MASK) == tail)
682 goto release; /* We were the tail, no next. */
683
684 /* There is a next, must wait for node->next != NULL (MCS protocol) */
685 next = READ_ONCE(node->next);
686 if (!next) {
687 spin_begin();
688 while (!(next = READ_ONCE(node->next)))
689 cpu_relax();
690 spin_end();
691 }
692 spec_barrier();
693
694 /*
695 * Unlock the next mcs waiter node. Release barrier is not required
696 * here because the acquirer is only accessing the lock word, and
697 * the acquire barrier we took the lock with orders that update vs
698 * this store to locked. The corresponding barrier is the smp_rmb()
699 * acquire barrier for mcs lock, above.
700 */
701 if (paravirt && pv_prod_head) {
702 int next_cpu = next->cpu;
703 WRITE_ONCE(next->locked, 1);
704 if (_Q_SPIN_MISO)
705 asm volatile("miso" ::: "memory");
706 if (vcpu_is_preempted(next_cpu))
707 prod_cpu(next_cpu);
708 } else {
709 WRITE_ONCE(next->locked, 1);
710 if (_Q_SPIN_MISO)
711 asm volatile("miso" ::: "memory");
712 }
713
714 release:
715 qnodesp->count--; /* release the node */
716 }
717
718 void queued_spin_lock_slowpath(struct qspinlock *lock)
719 {
720 /*
721 * This looks funny, but it induces the compiler to inline both
722 * sides of the branch rather than share code as when the condition
723 * is passed as the paravirt argument to the functions.
724 */
725 if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) {
726 if (try_to_steal_lock(lock, true)) {
727 spec_barrier();
728 return;
729 }
730 queued_spin_lock_mcs_queue(lock, true);
731 } else {
732 if (try_to_steal_lock(lock, false)) {
733 spec_barrier();
734 return;
735 }
736 queued_spin_lock_mcs_queue(lock, false);
737 }
738 }
739 EXPORT_SYMBOL(queued_spin_lock_slowpath);
740
741 #ifdef CONFIG_PARAVIRT_SPINLOCKS
742 void pv_spinlocks_init(void)
743 {
744 }
745 #endif
746
747 #include <linux/debugfs.h>
748 static int steal_spins_set(void *data, u64 val)
749 {
750 #if _Q_SPIN_TRY_LOCK_STEAL == 1
751 /* MAYBE_STEAL remains true */
752 steal_spins = val;
753 #else
754 static DEFINE_MUTEX(lock);
755
756 /*
757 * The lock slow path has a !maybe_stealers case in which the head
758 * of the queue can assume the lock will not be taken by concurrent
759 * stealers. That assumption is unsafe in the presence of stealers,
760 * so we must keep the two modes away from one another.
761 */
762
763 mutex_lock(&lock);
764 if (val && !steal_spins) {
765 maybe_stealers = true;
766 /* wait for queue head waiter to go away */
767 synchronize_rcu();
768 steal_spins = val;
769 } else if (!val && steal_spins) {
770 steal_spins = val;
771 /* wait for all possible stealers to go away */
772 synchronize_rcu();
773 maybe_stealers = false;
774 } else {
775 steal_spins = val;
776 }
777 mutex_unlock(&lock);
778 #endif
779
780 return 0;
781 }
782
783 static int steal_spins_get(void *data, u64 *val)
784 {
785 *val = steal_spins;
786
787 return 0;
788 }
789
790 DEFINE_SIMPLE_ATTRIBUTE(fops_steal_spins, steal_spins_get, steal_spins_set, "%llu\n");
791
792 static int remote_steal_spins_set(void *data, u64 val)
793 {
794 remote_steal_spins = val;
795
796 return 0;
797 }
798
799 static int remote_steal_spins_get(void *data, u64 *val)
800 {
801 *val = remote_steal_spins;
802
803 return 0;
804 }
805
806 DEFINE_SIMPLE_ATTRIBUTE(fops_remote_steal_spins, remote_steal_spins_get, remote_steal_spins_set, "%llu\n");
807
808 static int head_spins_set(void *data, u64 val)
809 {
810 head_spins = val;
811
812 return 0;
813 }
814
815 static int head_spins_get(void *data, u64 *val)
816 {
817 *val = head_spins;
818
819 return 0;
820 }
821
822 DEFINE_SIMPLE_ATTRIBUTE(fops_head_spins, head_spins_get, head_spins_set, "%llu\n");
823
824 static int pv_yield_owner_set(void *data, u64 val)
825 {
826 pv_yield_owner = !!val;
827
828 return 0;
829 }
830
831 static int pv_yield_owner_get(void *data, u64 *val)
832 {
833 *val = pv_yield_owner;
834
835 return 0;
836 }
837
838 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_owner, pv_yield_owner_get, pv_yield_owner_set, "%llu\n");
839
840 static int pv_yield_allow_steal_set(void *data, u64 val)
841 {
842 pv_yield_allow_steal = !!val;
843
844 return 0;
845 }
846
847 static int pv_yield_allow_steal_get(void *data, u64 *val)
848 {
849 *val = pv_yield_allow_steal;
850
851 return 0;
852 }
853
854 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_allow_steal, pv_yield_allow_steal_get, pv_yield_allow_steal_set, "%llu\n");
855
856 static int pv_spin_on_preempted_owner_set(void *data, u64 val)
857 {
858 pv_spin_on_preempted_owner = !!val;
859
860 return 0;
861 }
862
863 static int pv_spin_on_preempted_owner_get(void *data, u64 *val)
864 {
865 *val = pv_spin_on_preempted_owner;
866
867 return 0;
868 }
869
870 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_spin_on_preempted_owner, pv_spin_on_preempted_owner_get, pv_spin_on_preempted_owner_set, "%llu\n");
871
872 static int pv_sleepy_lock_set(void *data, u64 val)
873 {
874 pv_sleepy_lock = !!val;
875
876 return 0;
877 }
878
879 static int pv_sleepy_lock_get(void *data, u64 *val)
880 {
881 *val = pv_sleepy_lock;
882
883 return 0;
884 }
885
886 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock, pv_sleepy_lock_get, pv_sleepy_lock_set, "%llu\n");
887
888 static int pv_sleepy_lock_sticky_set(void *data, u64 val)
889 {
890 pv_sleepy_lock_sticky = !!val;
891
892 return 0;
893 }
894
895 static int pv_sleepy_lock_sticky_get(void *data, u64 *val)
896 {
897 *val = pv_sleepy_lock_sticky;
898
899 return 0;
900 }
901
902 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_sticky, pv_sleepy_lock_sticky_get, pv_sleepy_lock_sticky_set, "%llu\n");
903
904 static int pv_sleepy_lock_interval_ns_set(void *data, u64 val)
905 {
906 pv_sleepy_lock_interval_ns = val;
907
908 return 0;
909 }
910
911 static int pv_sleepy_lock_interval_ns_get(void *data, u64 *val)
912 {
913 *val = pv_sleepy_lock_interval_ns;
914
915 return 0;
916 }
917
918 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_interval_ns, pv_sleepy_lock_interval_ns_get, pv_sleepy_lock_interval_ns_set, "%llu\n");
919
920 static int pv_sleepy_lock_factor_set(void *data, u64 val)
921 {
922 pv_sleepy_lock_factor = val;
923
924 return 0;
925 }
926
927 static int pv_sleepy_lock_factor_get(void *data, u64 *val)
928 {
929 *val = pv_sleepy_lock_factor;
930
931 return 0;
932 }
933
934 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_factor, pv_sleepy_lock_factor_get, pv_sleepy_lock_factor_set, "%llu\n");
935
936 static int pv_yield_prev_set(void *data, u64 val)
937 {
938 pv_yield_prev = !!val;
939
940 return 0;
941 }
942
943 static int pv_yield_prev_get(void *data, u64 *val)
944 {
945 *val = pv_yield_prev;
946
947 return 0;
948 }
949
950 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n");
951
952 static int pv_yield_propagate_owner_set(void *data, u64 val)
953 {
954 pv_yield_propagate_owner = !!val;
955
956 return 0;
957 }
958
959 static int pv_yield_propagate_owner_get(void *data, u64 *val)
960 {
961 *val = pv_yield_propagate_owner;
962
963 return 0;
964 }
965
966 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n");
967
968 static int pv_prod_head_set(void *data, u64 val)
969 {
970 pv_prod_head = !!val;
971
972 return 0;
973 }
974
975 static int pv_prod_head_get(void *data, u64 *val)
976 {
977 *val = pv_prod_head;
978
979 return 0;
980 }
981
982 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_prod_head, pv_prod_head_get, pv_prod_head_set, "%llu\n");
983
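/*
 * The tunables below land in arch_debugfs_dir, i.e. (assuming debugfs is
 * mounted in the usual place) /sys/kernel/debug/powerpc/qspl_*. For
 * example, "echo 0 > /sys/kernel/debug/powerpc/qspl_steal_spins" should
 * disable lock stealing at run time.
 */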
984 static __init int spinlock_debugfs_init(void)
985 {
986 debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins);
987 debugfs_create_file("qspl_remote_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_remote_steal_spins);
988 debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, &fops_head_spins);
989 if (is_shared_processor()) {
990 debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner);
991 debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
992 debugfs_create_file("qspl_pv_spin_on_preempted_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_spin_on_preempted_owner);
993 debugfs_create_file("qspl_pv_sleepy_lock", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock);
994 debugfs_create_file("qspl_pv_sleepy_lock_sticky", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_sticky);
995 debugfs_create_file("qspl_pv_sleepy_lock_interval_ns", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_interval_ns);
996 debugfs_create_file("qspl_pv_sleepy_lock_factor", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_factor);
997 debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
998 debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
999 debugfs_create_file("qspl_pv_prod_head", 0600, arch_debugfs_dir, NULL, &fops_pv_prod_head);
1000 }
1001
1002 return 0;
1003 }
1004 device_initcall(spinlock_debugfs_init);