1 .\" Copyright 2015-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3 .\" %%%LICENSE_START(VERBATIM)
4 .\" Permission is granted to make and distribute verbatim copies of this
5 .\" manual provided the copyright notice and this permission notice are
6 .\" preserved on all copies.
8 .\" Permission is granted to copy and distribute modified versions of this
9 .\" manual under the conditions for verbatim copying, provided that the
10 .\" entire resulting derived work is distributed under the terms of a
11 .\" permission notice identical to this one.
13 .\" Since the Linux kernel and libraries are constantly changing, this
14 .\" manual page may be incorrect or out-of-date. The author(s) assume no
15 .\" responsibility for errors or omissions, or for damages resulting from
16 .\" the use of the information contained herein. The author(s) may not
17 .\" have taken the same level of care in the production of this manual,
18 .\" which is licensed free of charge, as they might when working
21 .\" Formatted or processed versions of this manual, if unaccompanied by
22 .\" the source, must acknowledge the copyright and authors of this work.
.TH MEMBARRIER 2 2017-11-15 "Linux" "Linux Programmer's Manual"
.SH NAME
membarrier \- issue memory barriers on a set of threads
.SH SYNOPSIS
.B #include <linux/membarrier.h>
.PP
.BI "int membarrier(int " cmd ", int " flags ");"
.SH DESCRIPTION
The
.BR membarrier ()
system call helps reduce the overhead of the memory-barrier
instructions required to order memory accesses on multi-core systems.
However, this system call is heavier than a memory barrier, so using it
effectively is
.I not
as simple as replacing memory barriers with this
system call; it requires an understanding of the details below.
.PP
When using memory barriers, keep in mind that a memory barrier always
needs to be either matched with a counterpart barrier, or the
architecture's memory model must not require the matching barrier.
.PP
There are cases where one side of the matching barriers (which we will
refer to as the "fast side") is executed much more often than the other
(which we will refer to as the "slow side").
This is a prime target for the use of
.BR membarrier ().
The key idea is to replace, for these matching barriers, the fast-side
memory barriers by simple compiler barriers, for example:
.PP
    asm volatile ("" : : : "memory")
.PP
and to replace the slow-side memory barriers by calls to
.BR membarrier ().
.PP
This will add overhead to the slow side and remove overhead from the
fast side, thus resulting in an overall performance increase as long as
the slow side is infrequent enough that the overhead of the
.BR membarrier ()
calls does not outweigh the performance gain on the fast side.
The
.I cmd
argument is one of the following:
.TP
.BR MEMBARRIER_CMD_QUERY " (since Linux 4.3)"
Query the set of supported commands.
The return value of the call is a bit mask of supported
commands.
.BR MEMBARRIER_CMD_QUERY ,
which has the value 0,
is not itself included in this bit mask.
This command is always supported (on kernels where
.BR membarrier ()
is provided).
.TP
.BR MEMBARRIER_CMD_GLOBAL " (since Linux 4.16)"
Ensure that all threads from all processes on the system pass through a
state where all memory accesses to user-space addresses match program
order between entry to and return from the
.BR membarrier ()
system call.
All threads on the system are targeted by this command.
.TP
.BR MEMBARRIER_CMD_GLOBAL_EXPEDITED " (since Linux 4.16)"
Execute a memory barrier on all running threads of all processes that
previously registered with
.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED .
Upon return from the system call, the calling thread is ensured that all
running threads have passed through a state where all memory accesses to
user-space addresses match program order between entry to and return
from the system call (non-running threads are de facto in such a state).
This covers only threads from processes which registered with
.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED .
Given that registration is about the intent to receive the barriers, it
is invalid to invoke
.BR MEMBARRIER_CMD_GLOBAL_EXPEDITED
from a non-registered process.
.IP
The "expedited" commands complete faster than the non-expedited ones;
they never block, but have the downside of causing extra overhead.
.TP
.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED " (since Linux 4.16)"
Register the process's intent to receive
.BR MEMBARRIER_CMD_GLOBAL_EXPEDITED
memory barriers.
.TP
.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED " (since Linux 4.14)"
Execute a memory barrier on each running thread belonging to the same
process as the calling thread.
Upon return from the system call, the calling thread is ensured that all
of its running thread siblings have passed through a state where all
memory accesses to user-space addresses match program order between
entry to and return from the system call (non-running threads are de
facto in such a state).
This covers only threads from the same process as the calling thread.
.IP
The "expedited" commands complete faster than the non-expedited ones;
they never block, but have the downside of causing extra overhead.
A process must register its intent to use the private
expedited command prior to using it.
.TP
.BR MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED " (since Linux 4.14)"
Register the process's intent to use
.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED .
.TP
.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE " (since Linux 4.16)"
In addition to providing the memory ordering guarantees described in
.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED ,
ensure that, upon return from the system call, all of the calling
thread's running thread siblings have executed a core serializing
instruction.
This covers only threads from the same process as the calling thread.
.IP
The "expedited" commands complete faster than the non-expedited ones;
they never block, but have the downside of causing extra overhead.
A process must register its intent to use the private expedited sync
core command prior to using it.
.TP
.BR MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE " (since Linux 4.16)"
Register the process's intent to use
.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE .
.TP
.BR MEMBARRIER_CMD_SHARED " (since Linux 4.3)"
This is an alias for
.BR MEMBARRIER_CMD_GLOBAL ,
provided for header backward compatibility.
.PP
The
.I flags
argument is currently unused and must be specified as 0.
.PP
All memory accesses performed in program order from each targeted thread
are guaranteed to be ordered with respect to
.BR membarrier ().
.PP
If we use the semantic
.I barrier()
to represent a compiler barrier forcing memory
accesses to be performed in program order across the barrier, and
.I smp_mb()
to represent explicit memory barriers forcing full memory
ordering across the barrier, we have the following ordering table for
each pairing of
.IR barrier() ,
.IR smp_mb() ,
and
.BR membarrier ().
The pair ordering is detailed as (O: ordered, X: not ordered):
.PP
                       barrier()  smp_mb()  membarrier()
barrier()                  X         X           O
smp_mb()                   X         O           O
membarrier()               O         O           O
.SH RETURN VALUE
On success, the
.B MEMBARRIER_CMD_QUERY
operation returns a bit mask of supported commands, and the
.BR MEMBARRIER_CMD_GLOBAL ,
.BR MEMBARRIER_CMD_GLOBAL_EXPEDITED ,
.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED ,
.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED ,
.BR MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED ,
.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE ,
and
.B MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE
operations return zero.
On error, \-1 is returned,
and
.I errno
is set appropriately.
.PP
For a given command, with
.I flags
set to 0, this system call is
guaranteed to always return the same value until reboot.
Further calls with the same arguments will lead to the same result.
Therefore, with
.I flags
set to 0, error handling is required only for the first call to
.BR membarrier ().
.SH ERRORS
.TP
.B EINVAL
.I cmd
is invalid, or
.I flags
is nonzero, or the
.B MEMBARRIER_CMD_GLOBAL
command is disabled because the
.I nohz_full
CPU parameter has been set, or the
.B MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE
and
.B MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE
commands are not implemented by the architecture.
.TP
.B ENOSYS
The
.BR membarrier ()
system call is not implemented by this kernel.
.TP
.B EPERM
The current process was not registered prior to using private expedited
commands.
.SH VERSIONS
The
.BR membarrier ()
system call was added in Linux 4.3.
.SH CONFORMING TO
.BR membarrier ()
is Linux-specific.
239 .\" FIXME See if the following syscalls make it into Linux 4.15 or later
.SH NOTES
A memory barrier instruction is part of the instruction set of
architectures with weakly-ordered memory models.
It orders memory
accesses prior to the barrier and after the barrier with respect to
matching barriers on other cores.
For instance, a load fence can order
loads prior to and following that fence with respect to stores ordered
by matching store fences.
.PP
Program order is the order in which instructions are ordered in the
program assembly code.
.PP
Examples where
.BR membarrier ()
can be useful include implementations
of Read-Copy-Update libraries and garbage collectors.
.SH EXAMPLE
Assuming a multithreaded application where "fast_path()" is executed
very frequently, and where "slow_path()" is executed infrequently, the
following code (x86) can be transformed using
.BR membarrier ():
.PP
.in +4n
.EX
#include <stdlib.h>

static volatile int a, b;

static void
fast_path(int *read_b)
{
    a = 1;
    asm volatile ("mfence" : : : "memory");
    *read_b = b;
}

static void
slow_path(int *read_a)
{
    b = 1;
    asm volatile ("mfence" : : : "memory");
    *read_a = a;
}

int
main(int argc, char **argv)
{
    int read_a, read_b;

    /*
     * Real applications would call fast_path() and slow_path()
     * from different threads. Call those from main() to keep
     * this example short.
     */

    slow_path(&read_a);
    fast_path(&read_b);

    /*
     * read_b == 0 implies read_a == 1 and
     * read_a == 0 implies read_b == 1.
     */

    if (read_b == 0 && read_a == 0)
        abort();

    exit(EXIT_SUCCESS);
}
.EE
.in
.PP
The code above transformed to use
.BR membarrier ()
becomes:
.PP
.in +4n
.EX
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>

static volatile int a, b;

static int
membarrier(int cmd, int flags)
{
    return syscall(__NR_membarrier, cmd, flags);
}

static int
init_membarrier(void)
{
    int ret;

    /* Check that membarrier() is supported. */

    ret = membarrier(MEMBARRIER_CMD_QUERY, 0);
    if (ret < 0) {
        perror("membarrier");
        return \-1;
    }

    if (!(ret & MEMBARRIER_CMD_GLOBAL)) {
        fprintf(stderr,
            "membarrier does not support MEMBARRIER_CMD_GLOBAL\\n");
        return \-1;
    }

    return 0;
}

static void
fast_path(int *read_b)
{
    a = 1;
    asm volatile ("" : : : "memory");
    *read_b = b;
}

static void
slow_path(int *read_a)
{
    b = 1;
    membarrier(MEMBARRIER_CMD_GLOBAL, 0);
    *read_a = a;
}

int
main(int argc, char **argv)
{
    int read_a, read_b;

    if (init_membarrier())
        exit(EXIT_FAILURE);

    /*
     * Real applications would call fast_path() and slow_path()
     * from different threads. Call those from main() to keep
     * this example short.
     */

    slow_path(&read_a);
    fast_path(&read_b);

    /*
     * read_b == 0 implies read_a == 1 and
     * read_a == 0 implies read_b == 1.
     */

    if (read_b == 0 && read_a == 0)
        abort();

    exit(EXIT_SUCCESS);
}
.EE
.in