/* os-unix.c                  -*-C-*-
 *
 *************************************************************************
 *
 *  @copyright
 *  Copyright (C) 2009-2013, Intel Corporation
 *  All rights reserved.
 *
 *  @copyright
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  @copyright
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 **************************************************************************/

#ifdef __linux__
// define _GNU_SOURCE before *any* #include.
// Even <stdint.h> will break later #includes if this macro is not
// already defined when it is #included.
#   define _GNU_SOURCE
#endif

#include "os.h"
#include "bug.h"
#include "cilk_malloc.h"
#include <internal/abi.h>

#if defined __linux__
#   include <sys/sysinfo.h>
#   include <sys/syscall.h>
#elif defined __APPLE__
#   include <sys/sysctl.h>
    // Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output
#elif defined __DragonFly__
    // No additional include files
#elif defined __FreeBSD__
    // No additional include files
#elif defined __CYGWIN__
    // Cygwin on Windows - no additional include files
#elif defined __VXWORKS__
#   include <vxWorks.h>
#   include <vxCpuLib.h>
#   include <taskLib.h>
// Solaris
#elif defined __sun__ && defined __svr4__
#   include <sched.h>
#else
#   error "Unsupported OS"
#endif

#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/types.h>
#include <assert.h>     // assert() is used in the __APPLE__ cpu-count path below


// /* Thread-local storage */
// #ifdef _WIN32
// typedef unsigned cilkos_tls_key_t;
// #else
// typedef pthread_key_t cilkos_tls_key_t;
// #endif
// cilkos_tls_key_t cilkos_allocate_tls_key();
// void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr);
// void* cilkos_get_tls_pointer(cilkos_tls_key_t key);

#if !defined CILK_WORKER_TLS
static int cilk_keys_defined;
static pthread_key_t worker_key, pedigree_leaf_key, tbb_interop_key;

#if SUPPORT_GET_CURRENT_FIBER > 0
static pthread_key_t fiber_key;
#endif

static void *serial_worker;

// This destructor is called when a pthread dies to deallocate the
// pedigree node.
static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr)
{
    __cilkrts_pedigree* pedigree_tls
        = (__cilkrts_pedigree*)pedigree_tls_ptr;
    if (pedigree_tls) {
        // Assert that we have either one or two nodes
        // left in the pedigree chain.
        // If we have more, then something has gone wrong.
        CILK_ASSERT(!pedigree_tls->parent || !pedigree_tls->parent->parent);
        __cilkrts_free(pedigree_tls);
    }
}

void __cilkrts_init_tls_variables(void)
{
    int status;
    /* This will be called once in serial execution before any
       Cilk parallelism so we do not need to worry about races
       on cilk_keys_defined. */
    if (cilk_keys_defined)
        return;
    status = pthread_key_create(&worker_key, NULL);
    CILK_ASSERT (status == 0);
    status = pthread_key_create(&pedigree_leaf_key,
                                __cilkrts_pedigree_leaf_destructor);
    CILK_ASSERT (status == 0);
    status = pthread_key_create(&tbb_interop_key, NULL);
    CILK_ASSERT (status == 0);

#if SUPPORT_GET_CURRENT_FIBER > 0
    status = pthread_key_create(&fiber_key, NULL);
    CILK_ASSERT (status == 0);
#endif
    cilk_keys_defined = 1;
    return;
}
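
/*
 * Illustrative sketch (kept out of the build with #if 0): how a thread that
 * enters the runtime typically uses the keys created above.  The helper name
 * example_bind_thread is hypothetical; the functions it calls are the real
 * ones defined later in this file.
 */
#if 0
static void example_bind_thread(__cilkrts_worker *w)
{
    // Publish the worker pointer for this pthread.
    __cilkrts_set_tls_worker(w);

    // Allocate (on first use) the two-node pedigree chain for this thread.
    __cilkrts_pedigree *leaf = __cilkrts_get_tls_pedigree_leaf(1);
    (void)leaf;

    // When the pthread exits, __cilkrts_pedigree_leaf_destructor() runs via
    // pedigree_leaf_key and frees that chain.
}
#endif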

COMMON_SYSDEP
void* cilkos_get_current_thread_id(void)
{
    return (void*)pthread_self();
}


CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker()
{
    if (__builtin_expect(cilk_keys_defined, 1))
        return (__cilkrts_worker *)pthread_getspecific(worker_key);
    else
        return serial_worker;
}

CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker_fast()
{
    return (__cilkrts_worker *)pthread_getspecific(worker_key);
}

COMMON_SYSDEP
__cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void)
{
    if (__builtin_expect(cilk_keys_defined, 1))
        return (__cilk_tbb_stack_op_thunk *)
            pthread_getspecific(tbb_interop_key);
    else
        return 0;
}

// This counter should be updated atomically.
static int __cilkrts_global_pedigree_tls_counter = -1;

COMMON_SYSDEP
__cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new)
{
    __cilkrts_pedigree *pedigree_tls;
    if (__builtin_expect(cilk_keys_defined, 1)) {
        pedigree_tls =
            (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key);
    }
    else {
        return 0;
    }

    if (!pedigree_tls && create_new) {
        // This call creates two nodes, X and Y.
        // X == pedigree_tls[0] is the leaf node, which gets copied
        // in and out of a user worker w when w binds and unbinds.
        // Y == pedigree_tls[1] is the root node,
        // which is a constant node that represents the user worker
        // thread w.
        pedigree_tls = (__cilkrts_pedigree*)
            __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree));

        // This call sets the TLS pointer to the new node.
        __cilkrts_set_tls_pedigree_leaf(pedigree_tls);

        pedigree_tls[0].rank = 0;
        pedigree_tls[0].parent = &pedigree_tls[1];

        // Create Y, whose rank begins as the global counter value.
        pedigree_tls[1].rank =
            __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1);

        pedigree_tls[1].parent = NULL;
        CILK_ASSERT(pedigree_tls[1].rank != -1);
    }
    return pedigree_tls;
}
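
/*
 * For reference, the per-thread layout produced by the create_new path above:
 *
 *     pedigree_tls[0]  "X", the leaf:  rank = 0,          parent = &pedigree_tls[1]
 *     pedigree_tls[1]  "Y", the root:  rank = unique id,  parent = NULL
 *
 * The destructor registered on pedigree_leaf_key asserts that at most this
 * two-node chain remains when the thread exits.
 */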

#if SUPPORT_GET_CURRENT_FIBER > 0
COMMON_SYSDEP
cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void)
{
    if (__builtin_expect(cilk_keys_defined, 1))
        return (cilk_fiber_sysdep *)pthread_getspecific(fiber_key);
    else
        return NULL;
}
#endif

COMMON_SYSDEP
void __cilkrts_set_tls_worker(__cilkrts_worker *w)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(worker_key, w);
        CILK_ASSERT (status == 0);
        return;
    }
    else
    {
        serial_worker = w;
    }
}

COMMON_SYSDEP
void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(tbb_interop_key, t);
        CILK_ASSERT (status == 0);
        return;
    }
    abort();
}

COMMON_SYSDEP
void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(pedigree_leaf_key, pedigree_leaf);
        CILK_ASSERT (status == 0);
        return;
    }
    abort();
}

#if SUPPORT_GET_CURRENT_FIBER > 0
COMMON_SYSDEP
void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(fiber_key, fiber);
        CILK_ASSERT (status == 0);
        return;
    }
    abort();
}
#endif

#else
void __cilkrts_init_tls_variables(void)
{
}
#endif

#if defined (__linux__) && ! defined(__ANDROID__)
/*
 * Get the thread id, rather than the pid.  In the case of MIC offload, it's
 * possible that we have multiple threads entering Cilk, and each has a
 * different affinity.
 */
static pid_t linux_gettid(void)
{
    return syscall(SYS_gettid);
}
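
// Note: older glibc versions do not provide a gettid() wrapper, which is why
// the raw syscall(SYS_gettid) is used above.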

/*
 * On Linux we look at the thread affinity mask and restrict ourselves to one
 * thread for each of the hardware contexts to which we are bound.
 * Therefore if the user does
 *    % taskset 0-1 cilkProgram
 *    # restrict execution to hardware contexts zero and one
 * the Cilk program will only use two threads even if it is running on a
 * machine that has 32 hardware contexts.
 * This is the right thing to do, because the threads are restricted to two
 * hardware contexts by the affinity mask set by taskset, and if we were to
 * create extra threads they would simply oversubscribe the hardware resources
 * we can use.
 * This is particularly important on MIC in offload mode, where the affinity
 * mask is set by the offload library to force the offload code away from
 * cores that have offload support threads running on them.
 */
static int linux_get_affinity_count (int tid)
{
#if !defined HAVE_PTHREAD_AFFINITY_NP
    return 0;
#else

    cpu_set_t process_mask;

    // Extract the thread affinity mask
    int err = sched_getaffinity (tid, sizeof(process_mask), &process_mask);

    if (0 != err)
    {
        return 0;
    }

    // We have extracted the mask OK, so now we can count the number of threads
    // in it.  This is linear in the maximum number of CPUs available.  We
    // could do a logarithmic version, if we assume the format of the mask,
    // but it's not really worth it.  We only call this at thread startup
    // anyway.
    int available_procs = 0;
    int i;
    for (i = 0; i < CPU_SETSIZE; i++)
    {
        if (CPU_ISSET(i, &process_mask))
        {
            available_procs++;
        }
    }

    return available_procs;
#endif
}
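
/*
 * Sketch of an equivalent count using glibc's CPU_COUNT() macro, kept out of
 * the build: the explicit loop above is retained because CPU_COUNT is a GNU
 * extension and may not exist on every libc.  The helper name below is
 * hypothetical.
 */
#if 0
static int example_affinity_count(pid_t tid)
{
    cpu_set_t mask;
    if (sched_getaffinity(tid, sizeof(mask), &mask) != 0)
        return 0;                 // treat failure as "unknown", as above
    return CPU_COUNT(&mask);      // number of CPUs set in the mask
}
#endif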
#endif // defined (__linux__) && ! defined(__ANDROID__)

/*
 * __cilkrts_hardware_cpu_count
 *
 * Returns the number of available CPUs on this hardware.  This is
 * architecture-specific.
 */

COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void)
{
#if defined __ANDROID__ || (defined(__sun__) && defined(__svr4__))
    return sysconf (_SC_NPROCESSORS_ONLN);
#elif defined __MIC__
    /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial
    /// on KNC.  Also, ignore the last core.
    int P = sysconf (_SC_NPROCESSORS_ONLN);
    return P/2 - 2;
#elif defined __linux__
    int affinity_count = linux_get_affinity_count(linux_gettid());

    return (0 != affinity_count) ? affinity_count : sysconf (_SC_NPROCESSORS_ONLN);
#elif defined __APPLE__
    int count = 0;
    int cmd[2] = { CTL_HW, HW_NCPU };
    size_t len = sizeof count;
    int status = sysctl(cmd, 2, &count, &len, 0, 0);
    assert(status >= 0);
    assert((unsigned)count == count);

    return count;
#elif defined __FreeBSD__ || defined __CYGWIN__ || defined __DragonFly__
    int ncores = sysconf(_SC_NPROCESSORS_ONLN);

    return ncores;
    // Just get the number of processors
    //    return sysconf(_SC_NPROCESSORS_ONLN);
#elif defined __VXWORKS__
    return __builtin_popcount( vxCpuEnabledGet() );
#else
#error "Unknown architecture"
#endif
}

COMMON_SYSDEP void __cilkrts_sleep(void)
{
#ifdef __VXWORKS__
    taskDelay(1);
#else
    usleep(1);
#endif
}

COMMON_SYSDEP void __cilkrts_yield(void)
{
#if __APPLE__ || __FreeBSD__ || __VXWORKS__
    // On MacOS, call sched_yield to yield quantum.  I'm not sure why we
    // don't do this on Linux also.
    sched_yield();
#elif defined(__DragonFly__)
    // On DragonFly BSD, call sched_yield to yield quantum.
    sched_yield();
#elif defined(__MIC__)
    // On MIC, pthread_yield() really trashes things.  Arch's measurements
    // showed that calling _mm_delay_32() (or doing nothing) was a better
    // option.  Delaying 1024 clock cycles is a reasonable compromise between
    // giving up the processor and latency starting up when work becomes
    // available.
    _mm_delay_32(1024);
#elif defined(__ANDROID__) || (defined(__sun__) && defined(__svr4__))
    // On Android and Solaris, call sched_yield to yield quantum.  I'm not
    // sure why we don't do this on Linux also.
    sched_yield();
#else
    // On Linux, call pthread_yield (which in turn will call sched_yield)
    // to yield quantum.
    pthread_yield();
#endif
}

COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen,
                                           const char* varname)
{
    CILK_ASSERT(value);
    CILK_ASSERT(varname);

    const char* envstr = getenv(varname);
    if (envstr)
    {
        size_t len = strlen(envstr);
        if (len > vallen - 1)
            return len + 1;

        strcpy(value, envstr);
        return len;
    }
    else
    {
        value[0] = '\0';
        return 0;
    }
}
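
/*
 * Illustrative use, kept out of the build: a return value larger than the
 * buffer size means the variable did not fit, and the value returned is the
 * buffer size needed (including the terminating NUL).  The helper name and
 * environment variable below are only examples.
 */
#if 0
static void example_read_env(void)
{
    char buf[64];
    size_t n = cilkos_getenv(buf, sizeof(buf), "CILK_NWORKERS");
    if (n > sizeof(buf)) {
        // buf was too small; n bytes (including the NUL) are required
    } else if (n == 0) {
        // variable not set; buf holds the empty string
    } else {
        // buf holds the value; n is its length
    }
}
#endif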

/*
 * Unrecoverable error: Print an error message and abort execution.
 */
COMMON_SYSDEP void cilkos_error(const char *fmt, ...)
{
    va_list l;
    fflush(NULL);
    fprintf(stderr, "Cilk error: ");
    va_start(l, fmt);
    vfprintf(stderr, fmt, l);
    va_end(l);
    fprintf(stderr, "Exiting.\n");
    fflush(stderr);

    abort();
}

/*
 * Print a warning message and return.
 */
COMMON_SYSDEP void cilkos_warning(const char *fmt, ...)
{
    va_list l;
    fflush(NULL);
    fprintf(stderr, "Cilk warning: ");
    va_start(l, fmt);
    vfprintf(stderr, fmt, l);
    va_end(l);
    fflush(stderr);
}

static void __attribute__((constructor)) init_once()
{
    /*__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED);*/
    __cilkrts_init_tls_variables();
}


#define PAGE 4096
#define CILK_MIN_STACK_SIZE (4*PAGE)
// Default size for the stacks that we create in Cilk for Unix.
#define CILK_DEFAULT_STACK_SIZE 0x100000

/*
 * Convert the user's specified stack size into a "reasonable" value
 * for this OS.
 */
size_t cilkos_validate_stack_size(size_t specified_stack_size) {
    // Convert an unspecified (zero) size to the default.
    if (specified_stack_size == 0) {
        CILK_ASSERT((CILK_DEFAULT_STACK_SIZE % PAGE) == 0);
        return CILK_DEFAULT_STACK_SIZE;
    }
    // Round values in between 0 and CILK_MIN_STACK_SIZE up to
    // CILK_MIN_STACK_SIZE.
    if (specified_stack_size <= CILK_MIN_STACK_SIZE) {
        return CILK_MIN_STACK_SIZE;
    }
    if ((specified_stack_size % PAGE) > 0) {
        // Round the user's stack size value up to nearest page boundary.
        return (PAGE * (1 + specified_stack_size / PAGE));
    }
    return specified_stack_size;
}
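
/*
 * Worked examples (PAGE == 4096, so CILK_MIN_STACK_SIZE == 16384):
 *     0       -> 0x100000 (CILK_DEFAULT_STACK_SIZE, 1 MiB)
 *     10000   -> 16384    (rounded up to CILK_MIN_STACK_SIZE)
 *     100000  -> 102400   (rounded up to the next multiple of PAGE)
 *     131072  -> 131072   (already a multiple of PAGE)
 */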

long cilkos_atomic_add(volatile long* p, long x)
{
    return __sync_add_and_fetch(p, x);
}

/* End os-unix.c */