/* os-unix.c                  -*-C-*-
 *
 *************************************************************************
 *
 *  @copyright
 *  Copyright (C) 2009-2013, Intel Corporation
 *  All rights reserved.
 *
 *  @copyright
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  @copyright
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 **************************************************************************/

#ifdef __linux__
// define _GNU_SOURCE before *any* #include.
// Even <stdint.h> will break later #includes if this macro is not
// already defined when it is #included.
#   define _GNU_SOURCE
#endif

#include "os.h"
#include "bug.h"
#include "cilk_malloc.h"
#include <internal/abi.h>

#if defined __linux__
#   include <sys/sysinfo.h>
#   include <sys/syscall.h>
#elif defined __APPLE__
#   include <sys/sysctl.h>
    // Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output
#elif defined __DragonFly__
    // No additional include files
#elif defined __FreeBSD__
    // No additional include files
#elif defined __CYGWIN__
    // Cygwin on Windows - no additional include files
#elif defined __VXWORKS__
#   include <vxWorks.h>
#   include <vxCpuLib.h>
#   include <taskLib.h>
// Solaris
#elif defined __sun__ && defined __svr4__
#   include <sched.h>
#else
#   error "Unsupported OS"
#endif

#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/types.h>
#include <assert.h>     // assert() is used in the __APPLE__ cpu-count path below


// /* Thread-local storage */
// #ifdef _WIN32
// typedef unsigned cilkos_tls_key_t;
// #else
// typedef pthread_key_t cilkos_tls_key_t;
// #endif
// cilkos_tls_key_t cilkos_allocate_tls_key();
// void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr);
// void* cilkos_get_tls_pointer(cilkos_tls_key_t key);

#if !defined CILK_WORKER_TLS
static int cilk_keys_defined;
static pthread_key_t worker_key, pedigree_leaf_key, tbb_interop_key;

#if SUPPORT_GET_CURRENT_FIBER > 0
static pthread_key_t fiber_key;
#endif

static void *serial_worker;

// This destructor is called when a pthread dies to deallocate the
// pedigree node.
static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr)
{
    __cilkrts_pedigree* pedigree_tls
        = (__cilkrts_pedigree*)pedigree_tls_ptr;
    if (pedigree_tls) {
        // Assert that we have either one or two nodes
        // left in the pedigree chain.
        // If we have more, then something has gone wrong.
        CILK_ASSERT(!pedigree_tls->parent || !pedigree_tls->parent->parent);
        __cilkrts_free(pedigree_tls);
    }
}

void __cilkrts_init_tls_variables(void)
{
    int status;
    /* This will be called once in serial execution before any
       Cilk parallelism so we do not need to worry about races
       on cilk_keys_defined. */
    if (cilk_keys_defined)
        return;
    status = pthread_key_create(&worker_key, NULL);
    CILK_ASSERT (status == 0);
    status = pthread_key_create(&pedigree_leaf_key,
                                __cilkrts_pedigree_leaf_destructor);
    CILK_ASSERT (status == 0);
    status = pthread_key_create(&tbb_interop_key, NULL);
    CILK_ASSERT (status == 0);

#if SUPPORT_GET_CURRENT_FIBER > 0
    status = pthread_key_create(&fiber_key, NULL);
    CILK_ASSERT (status == 0);
#endif
    cilk_keys_defined = 1;
    return;
}
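
/*
 * Illustrative sketch (kept out of the build with #if 0): how a thread that
 * enters the runtime typically uses the keys created above.  The helper name
 * example_bind_thread is hypothetical; the functions it calls are the real
 * ones defined later in this file.
 */
#if 0
static void example_bind_thread(__cilkrts_worker *w)
{
    // Publish the worker pointer for this pthread.
    __cilkrts_set_tls_worker(w);

    // Allocate (on first use) the two-node pedigree chain for this thread.
    __cilkrts_pedigree *leaf = __cilkrts_get_tls_pedigree_leaf(1);
    (void)leaf;

    // When the pthread exits, __cilkrts_pedigree_leaf_destructor() runs via
    // pedigree_leaf_key and frees that chain.
}
#endif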

COMMON_SYSDEP
void* cilkos_get_current_thread_id(void)
{
    return (void*)pthread_self();
}


CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker()
{
    if (__builtin_expect(cilk_keys_defined, 1))
        return (__cilkrts_worker *)pthread_getspecific(worker_key);
    else
        return serial_worker;
}

CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker_fast()
{
    return (__cilkrts_worker *)pthread_getspecific(worker_key);
}

COMMON_SYSDEP
__cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void)
{
    if (__builtin_expect(cilk_keys_defined, 1))
        return (__cilk_tbb_stack_op_thunk *)
            pthread_getspecific(tbb_interop_key);
    else
        return 0;
}

// This counter should be updated atomically.
static int __cilkrts_global_pedigree_tls_counter = -1;

COMMON_SYSDEP
__cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new)
{
    __cilkrts_pedigree *pedigree_tls;
    if (__builtin_expect(cilk_keys_defined, 1)) {
        pedigree_tls =
            (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key);
    }
    else {
        return 0;
    }

    if (!pedigree_tls && create_new) {
        // This call creates two nodes, X and Y.
        // X == pedigree_tls[0] is the leaf node, which gets copied
        // in and out of a user worker w when w binds and unbinds.
        // Y == pedigree_tls[1] is the root node,
        // which is a constant node that represents the user worker
        // thread w.
        pedigree_tls = (__cilkrts_pedigree*)
            __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree));

        // This call sets the TLS pointer to the new node.
        __cilkrts_set_tls_pedigree_leaf(pedigree_tls);

        pedigree_tls[0].rank = 0;
        pedigree_tls[0].parent = &pedigree_tls[1];

        // Create Y, whose rank begins as the global counter value.
        pedigree_tls[1].rank =
            __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1);

        pedigree_tls[1].parent = NULL;
        CILK_ASSERT(pedigree_tls[1].rank != -1);
    }
    return pedigree_tls;
}
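
/*
 * For reference, the per-thread layout produced by the create_new path above:
 *
 *     pedigree_tls[0]  "X", the leaf:  rank = 0,          parent = &pedigree_tls[1]
 *     pedigree_tls[1]  "Y", the root:  rank = unique id,  parent = NULL
 *
 * The destructor registered on pedigree_leaf_key asserts that at most this
 * two-node chain remains when the thread exits.
 */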

#if SUPPORT_GET_CURRENT_FIBER > 0
COMMON_SYSDEP
cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void)
{
    if (__builtin_expect(cilk_keys_defined, 1))
        return (cilk_fiber_sysdep *)pthread_getspecific(fiber_key);
    else
        return NULL;
}
#endif

COMMON_SYSDEP
void __cilkrts_set_tls_worker(__cilkrts_worker *w)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(worker_key, w);
        CILK_ASSERT (status == 0);
        return;
    }
    else
    {
        serial_worker = w;
    }
}

COMMON_SYSDEP
void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(tbb_interop_key, t);
        CILK_ASSERT (status == 0);
        return;
    }
    abort();
}

COMMON_SYSDEP
void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(pedigree_leaf_key, pedigree_leaf);
        CILK_ASSERT (status == 0);
        return;
    }
    abort();
}

#if SUPPORT_GET_CURRENT_FIBER > 0
COMMON_SYSDEP
void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(fiber_key, fiber);
        CILK_ASSERT (status == 0);
        return;
    }
    abort();
}
#endif

#else
void __cilkrts_init_tls_variables(void)
{
}
#endif

#if defined (__linux__) && ! defined(__ANDROID__)
/*
 * Get the thread id, rather than the pid.  In the case of MIC offload, it's
 * possible that we have multiple threads entering Cilk, and each has a
 * different affinity.
 */
static pid_t linux_gettid(void)
{
    return syscall(SYS_gettid);
}
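
// Note: older glibc versions do not provide a gettid() wrapper, which is why
// the raw syscall(SYS_gettid) is used above.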

/*
 * On Linux we look at the thread affinity mask and restrict ourselves to one
 * thread for each of the hardware contexts to which we are bound.
 * Therefore if the user does
 *    % taskset 0-1 cilkProgram
 *    # restrict execution to hardware contexts zero and one
 * the Cilk program will only use two threads even if it is running on a
 * machine that has 32 hardware contexts.
 * This is the right thing to do, because the threads are restricted to two
 * hardware contexts by the affinity mask set by taskset, and if we were to
 * create extra threads they would simply oversubscribe the hardware resources
 * we can use.
 * This is particularly important on MIC in offload mode, where the affinity
 * mask is set by the offload library to force the offload code away from
 * cores that have offload support threads running on them.
 */
static int linux_get_affinity_count (int tid)
{
#if !defined HAVE_PTHREAD_AFFINITY_NP
    return 0;
#else

    cpu_set_t process_mask;

    // Extract the thread affinity mask
    int err = sched_getaffinity (tid, sizeof(process_mask), &process_mask);

    if (0 != err)
    {
        return 0;
    }

    // We have extracted the mask OK, so now we can count the number of threads
    // in it.  This is linear in the maximum number of CPUs available.  We
    // could do a logarithmic version, if we assume the format of the mask,
    // but it's not really worth it.  We only call this at thread startup
    // anyway.
    int available_procs = 0;
    int i;
    for (i = 0; i < CPU_SETSIZE; i++)
    {
        if (CPU_ISSET(i, &process_mask))
        {
            available_procs++;
        }
    }

    return available_procs;
#endif
}
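
/*
 * Sketch of an equivalent count using glibc's CPU_COUNT() macro, kept out of
 * the build: the explicit loop above is retained because CPU_COUNT is a GNU
 * extension and may not exist on every libc.  The helper name below is
 * hypothetical.
 */
#if 0
static int example_affinity_count(pid_t tid)
{
    cpu_set_t mask;
    if (sched_getaffinity(tid, sizeof(mask), &mask) != 0)
        return 0;                 // treat failure as "unknown", as above
    return CPU_COUNT(&mask);      // number of CPUs set in the mask
}
#endif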
#endif // defined (__linux__) && ! defined(__ANDROID__)

/*
 * __cilkrts_hardware_cpu_count
 *
 * Returns the number of available CPUs on this hardware.  This is
 * architecture-specific.
 */

COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void)
{
#if defined __ANDROID__ || (defined(__sun__) && defined(__svr4__))
    return sysconf (_SC_NPROCESSORS_ONLN);
#elif defined __MIC__
    /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial
    /// on KNC.  Also, ignore the last core.
    int P = sysconf (_SC_NPROCESSORS_ONLN);
    return P/2 - 2;
#elif defined __linux__
    int affinity_count = linux_get_affinity_count(linux_gettid());

    return (0 != affinity_count) ? affinity_count : sysconf (_SC_NPROCESSORS_ONLN);
#elif defined __APPLE__
    int count = 0;
    int cmd[2] = { CTL_HW, HW_NCPU };
    size_t len = sizeof count;
    int status = sysctl(cmd, 2, &count, &len, 0, 0);
    assert(status >= 0);
    assert((unsigned)count == count);

    return count;
#elif defined __FreeBSD__ || defined __CYGWIN__ || defined __DragonFly__
    int ncores = sysconf(_SC_NPROCESSORS_ONLN);

    return ncores;
    // Just get the number of processors
    //    return sysconf(_SC_NPROCESSORS_ONLN);
#elif defined __VXWORKS__
    return __builtin_popcount( vxCpuEnabledGet() );
#else
#error "Unknown architecture"
#endif
}

COMMON_SYSDEP void __cilkrts_sleep(void)
{
#ifdef __VXWORKS__
    taskDelay(1);
#else
    usleep(1);
#endif
}

COMMON_SYSDEP void __cilkrts_yield(void)
{
#if __APPLE__ || __FreeBSD__ || __VXWORKS__
    // On MacOS, call sched_yield to yield quantum.  I'm not sure why we
    // don't do this on Linux also.
    sched_yield();
#elif defined(__DragonFly__)
    // On DragonFly BSD, call sched_yield to yield quantum.
    sched_yield();
#elif defined(__MIC__)
    // On MIC, pthread_yield() really trashes things.  Arch's measurements
    // showed that calling _mm_delay_32() (or doing nothing) was a better
    // option.  Delaying 1024 clock cycles is a reasonable compromise between
    // giving up the processor and latency starting up when work becomes
    // available.
    _mm_delay_32(1024);
#elif defined(__ANDROID__) || (defined(__sun__) && defined(__svr4__))
    // On Android and Solaris, call sched_yield to yield quantum.  I'm not
    // sure why we don't do this on Linux also.
    sched_yield();
#else
    // On Linux, call pthread_yield (which in turn will call sched_yield)
    // to yield quantum.
    pthread_yield();
#endif
}

COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen,
                                           const char* varname)
{
    CILK_ASSERT(value);
    CILK_ASSERT(varname);

    const char* envstr = getenv(varname);
    if (envstr)
    {
        size_t len = strlen(envstr);
        if (len > vallen - 1)
            return len + 1;

        strcpy(value, envstr);
        return len;
    }
    else
    {
        value[0] = '\0';
        return 0;
    }
}
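
/*
 * Illustrative use, kept out of the build: a return value larger than the
 * buffer size means the variable did not fit, and the value returned is the
 * buffer size needed (including the terminating NUL).  The helper name and
 * environment variable below are only examples.
 */
#if 0
static void example_read_env(void)
{
    char buf[64];
    size_t n = cilkos_getenv(buf, sizeof(buf), "CILK_NWORKERS");
    if (n > sizeof(buf)) {
        // buf was too small; n bytes (including the NUL) are required
    } else if (n == 0) {
        // variable not set; buf holds the empty string
    } else {
        // buf holds the value; n is its length
    }
}
#endif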

/*
 * Unrecoverable error: Print an error message and abort execution.
 */
COMMON_SYSDEP void cilkos_error(const char *fmt, ...)
{
    va_list l;
    fflush(NULL);
    fprintf(stderr, "Cilk error: ");
    va_start(l, fmt);
    vfprintf(stderr, fmt, l);
    va_end(l);
    fprintf(stderr, "Exiting.\n");
    fflush(stderr);

    abort();
}

/*
 * Print a warning message and return.
 */
COMMON_SYSDEP void cilkos_warning(const char *fmt, ...)
{
    va_list l;
    fflush(NULL);
    fprintf(stderr, "Cilk warning: ");
    va_start(l, fmt);
    vfprintf(stderr, fmt, l);
    va_end(l);
    fflush(stderr);
}

static void __attribute__((constructor)) init_once()
{
    /*__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED);*/
    __cilkrts_init_tls_variables();
}


#define PAGE 4096
#define CILK_MIN_STACK_SIZE (4*PAGE)
// Default size for the stacks that we create in Cilk for Unix.
#define CILK_DEFAULT_STACK_SIZE 0x100000

/*
 * Convert the user's specified stack size into a "reasonable" value
 * for this OS.
 */
size_t cilkos_validate_stack_size(size_t specified_stack_size) {
    // Convert an unspecified (zero) size to the default.
    if (specified_stack_size == 0) {
        CILK_ASSERT((CILK_DEFAULT_STACK_SIZE % PAGE) == 0);
        return CILK_DEFAULT_STACK_SIZE;
    }
    // Round values in between 0 and CILK_MIN_STACK_SIZE up to
    // CILK_MIN_STACK_SIZE.
    if (specified_stack_size <= CILK_MIN_STACK_SIZE) {
        return CILK_MIN_STACK_SIZE;
    }
    if ((specified_stack_size % PAGE) > 0) {
        // Round the user's stack size value up to nearest page boundary.
        return (PAGE * (1 + specified_stack_size / PAGE));
    }
    return specified_stack_size;
}
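
/*
 * Worked examples (PAGE == 4096, so CILK_MIN_STACK_SIZE == 16384):
 *     0       -> 0x100000 (CILK_DEFAULT_STACK_SIZE, 1 MiB)
 *     10000   -> 16384    (rounded up to CILK_MIN_STACK_SIZE)
 *     100000  -> 102400   (rounded up to the next multiple of PAGE)
 *     131072  -> 131072   (already a multiple of PAGE)
 */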

long cilkos_atomic_add(volatile long* p, long x)
{
    return __sync_add_and_fetch(p, x);
}

/* End os-unix.c */