]>
Commit | Line | Data |
---|---|---|
3038054c BI |
1 | /* os-unix.c -*-C-*- |
2 | * | |
3 | ************************************************************************* | |
4 | * | |
2e01cda6 | 5 | * Copyright (C) 2009-2016, Intel Corporation |
3038054c BI |
6 | * All rights reserved. |
7 | * | |
3038054c BI |
8 | * Redistribution and use in source and binary forms, with or without |
9 | * modification, are permitted provided that the following conditions | |
10 | * are met: | |
11 | * | |
12 | * * Redistributions of source code must retain the above copyright | |
13 | * notice, this list of conditions and the following disclaimer. | |
14 | * * Redistributions in binary form must reproduce the above copyright | |
15 | * notice, this list of conditions and the following disclaimer in | |
16 | * the documentation and/or other materials provided with the | |
17 | * distribution. | |
18 | * * Neither the name of Intel Corporation nor the names of its | |
19 | * contributors may be used to endorse or promote products derived | |
20 | * from this software without specific prior written permission. | |
21 | * | |
3038054c BI |
22 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
23 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
24 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
25 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
26 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
27 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
28 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS | |
29 | * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
30 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
31 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY | |
32 | * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
33 | * POSSIBILITY OF SUCH DAMAGE. | |
2e01cda6 IV |
34 | * |
35 | * ********************************************************************* | |
36 | * | |
 * PLEASE NOTE: This file is a downstream copy of a file maintained in
38 | * a repository at cilkplus.org. Changes made to this file that are not | |
39 | * submitted through the contribution process detailed at | |
40 | * http://www.cilkplus.org/submit-cilk-contribution will be lost the next | |
41 | * time that a new version is released. Changes only submitted to the | |
42 | * GNU compiler collection or posted to the git repository at | |
43 | * https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are | |
44 | * not tracked. | |
45 | * | |
46 | * We welcome your contributions to this open source project. Thank you | |
47 | * for your assistance in helping us improve Cilk Plus. | |
3038054c BI |
48 | **************************************************************************/ |
49 | ||
3038054c BI |
50 | #include "os.h" |
51 | #include "bug.h" | |
52 | #include "cilk_malloc.h" | |
53 | #include <internal/abi.h> | |
54 | ||
55 | #if defined __linux__ | |
56 | # include <sys/sysinfo.h> | |
57 | # include <sys/syscall.h> | |
2e01cda6 | 58 | |
3038054c BI |
59 | #elif defined __APPLE__ |
60 | # include <sys/sysctl.h> | |
61 | // Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output | |
2e01cda6 | 62 | |
3038054c BI |
63 | #elif defined __VXWORKS__ |
64 | # include <vxWorks.h> | |
65 | # include <vxCpuLib.h> | |
2e01cda6 IV |
66 | # include <taskLib.h> |
67 | ||
2546da0b RO |
68 | // Solaris |
69 | #elif defined __sun__ && defined __svr4__ | |
70 | # include <sched.h> | |
2e01cda6 IV |
71 | |
72 | // OSes we know about which don't require any additional files | |
73 | #elif defined __CYGWIN__ || \ | |
74 | defined __DragonFly__ || \ | |
75 | defined __FreeBSD__ || \ | |
76 | defined __GNU__ | |
77 | // No additional include files | |
78 | ||
3038054c BI |
79 | #else |
80 | # error "Unsupported OS" | |
81 | #endif | |
82 | ||
83 | #include <stdarg.h> | |
84 | #include <stddef.h> | |
85 | #include <stdio.h> | |
86 | #include <stdlib.h> | |
87 | #include <string.h> | |
88 | #include <unistd.h> | |
89 | #include <pthread.h> | |
90 | #include <sys/types.h> | |
91 | ||
92 | ||
93 | ||
94 | // /* Thread-local storage */ | |
95 | // #ifdef _WIN32 | |
96 | // typedef unsigned cilkos_tls_key_t; | |
97 | // #else | |
98 | // typedef pthread_key_t cilkos_tls_key_t; | |
99 | // #endif | |
100 | // cilkos_tls_key_t cilkos_allocate_tls_key(); | |
101 | // void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr); | |
102 | // void* cilkos_get_tls_pointer(cilkos_tls_key_t key); | |
103 | ||
104 | #if !defined CILK_WORKER_TLS | |
105 | static int cilk_keys_defined; | |
106 | static pthread_key_t worker_key, pedigree_leaf_key, tbb_interop_key; | |
107 | ||
108 | #if SUPPORT_GET_CURRENT_FIBER > 0 | |
109 | static pthread_key_t fiber_key; | |
110 | #endif | |
111 | ||
112 | static void *serial_worker; | |
113 | ||
114 | ||
115 | // This destructor is called when a pthread dies to deallocate the | |
116 | // pedigree node. | |
117 | static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr) | |
118 | { | |
119 | __cilkrts_pedigree* pedigree_tls | |
120 | = (__cilkrts_pedigree*)pedigree_tls_ptr; | |
121 | if (pedigree_tls) { | |
122 | // Assert that we have either one or two nodes | |
123 | // left in the pedigree chain. | |
124 | // If we have more, then something is going wrong... | |
125 | CILK_ASSERT(!pedigree_tls->parent || !pedigree_tls->parent->parent); | |
126 | __cilkrts_free(pedigree_tls); | |
127 | } | |
128 | } | |
129 | ||
130 | void __cilkrts_init_tls_variables(void) | |
131 | { | |
132 | int status; | |
133 | /* This will be called once in serial execution before any | |
134 | Cilk parallelism so we do not need to worry about races | |
135 | on cilk_keys_defined. */ | |
136 | if (cilk_keys_defined) | |
137 | return; | |
138 | status = pthread_key_create(&worker_key, NULL); | |
139 | CILK_ASSERT (status == 0); | |
140 | status = pthread_key_create(&pedigree_leaf_key, | |
141 | __cilkrts_pedigree_leaf_destructor); | |
142 | CILK_ASSERT (status == 0); | |
143 | status = pthread_key_create(&tbb_interop_key, NULL); | |
144 | CILK_ASSERT (status == 0); | |
145 | ||
146 | #if SUPPORT_GET_CURRENT_FIBER > 0 | |
147 | status = pthread_key_create(&fiber_key, NULL); | |
148 | CILK_ASSERT (status == 0); | |
149 | #endif | |
150 | cilk_keys_defined = 1; | |
151 | return; | |
152 | } | |
153 | ||
154 | COMMON_SYSDEP | |
155 | void* cilkos_get_current_thread_id(void) | |
156 | { | |
157 | return (void*)pthread_self(); | |
158 | } | |
159 | ||
160 | ||
161 | CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker() | |
162 | { | |
163 | if (__builtin_expect(cilk_keys_defined, 1)) | |
164 | return (__cilkrts_worker *)pthread_getspecific(worker_key); | |
165 | else | |
166 | return serial_worker; | |
167 | ||
168 | } | |
169 | ||
170 | CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker_fast() | |
171 | { | |
172 | return (__cilkrts_worker *)pthread_getspecific(worker_key); | |
173 | } | |
174 | ||
175 | COMMON_SYSDEP | |
176 | __cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void) | |
177 | { | |
178 | if (__builtin_expect(cilk_keys_defined, 1)) | |
179 | return (__cilk_tbb_stack_op_thunk *) | |
180 | pthread_getspecific(tbb_interop_key); | |
181 | else | |
182 | return 0; | |
183 | } | |
184 | ||
// Global rank source for per-thread pedigree root nodes.
// This counter should be updated atomically.
static int __cilkrts_global_pedigree_tls_counter = -1;

// Return the calling thread's pedigree leaf node, lazily allocating the
// leaf/root pair when create_new is nonzero and none exists yet.
// Returns 0 when TLS keys are not initialized, or when no leaf exists and
// create_new is false.
COMMON_SYSDEP
__cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new)
{
    __cilkrts_pedigree *pedigree_tls;
    if (__builtin_expect(cilk_keys_defined, 1)) {
        pedigree_tls =
            (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key);
    }
    else {
        // Serial startup: the pedigree TLS key does not exist yet.
        return 0;
    }

    if (!pedigree_tls && create_new) {
        // This call creates two nodes, X and Y.
        // X == pedigree_tls[0] is the leaf node, which gets copied
        // in and out of a user worker w when w binds and unbinds.
        // Y == pedigree_tls[1] is the root node,
        // which is a constant node that represents the user worker
        // thread w.
        pedigree_tls = (__cilkrts_pedigree*)
            __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree));

        // This call sets the TLS pointer to the new node.
        // NOTE: the TLS slot is published before the fields are filled in;
        // this is single-thread-visible state, so no other thread observes
        // the partially initialized pair.
        __cilkrts_set_tls_pedigree_leaf(pedigree_tls);

        pedigree_tls[0].rank = 0;
        pedigree_tls[0].parent = &pedigree_tls[1];

        // Create Y, whose rank begins as the global counter value.
        pedigree_tls[1].rank =
            __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1);

        pedigree_tls[1].parent = NULL;
        // Counter started at -1, so the first assigned rank is 0; -1 would
        // indicate the increment was lost.
        CILK_ASSERT(pedigree_tls[1].rank != -1);
    }
    return pedigree_tls;
}
225 | ||
#if SUPPORT_GET_CURRENT_FIBER > 0
// Return the fiber most recently registered for this thread via
// cilkos_set_tls_cilk_fiber, or NULL before the TLS keys exist.
COMMON_SYSDEP
cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void)
{
    if (__builtin_expect(!cilk_keys_defined, 0))
        return NULL;
    return (cilk_fiber_sysdep *)pthread_getspecific(fiber_key);
}
#endif
236 | ||
237 | COMMON_SYSDEP | |
238 | void __cilkrts_set_tls_worker(__cilkrts_worker *w) | |
239 | { | |
240 | if (__builtin_expect(cilk_keys_defined, 1)) { | |
241 | int status; | |
242 | status = pthread_setspecific(worker_key, w); | |
243 | CILK_ASSERT (status == 0); | |
244 | return; | |
245 | } | |
246 | else | |
247 | { | |
248 | serial_worker = w; | |
249 | } | |
250 | } | |
251 | ||
252 | COMMON_SYSDEP | |
253 | void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t) | |
254 | { | |
255 | if (__builtin_expect(cilk_keys_defined, 1)) { | |
256 | int status; | |
257 | status = pthread_setspecific(tbb_interop_key, t); | |
258 | CILK_ASSERT (status == 0); | |
259 | return; | |
260 | } | |
261 | abort(); | |
262 | } | |
263 | ||
264 | COMMON_SYSDEP | |
265 | void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf) | |
266 | { | |
267 | if (__builtin_expect(cilk_keys_defined, 1)) { | |
268 | int status; | |
269 | status = pthread_setspecific(pedigree_leaf_key, pedigree_leaf); | |
270 | CILK_ASSERT (status == 0); | |
271 | return; | |
272 | } | |
273 | abort(); | |
274 | } | |
275 | ||
#if SUPPORT_GET_CURRENT_FIBER > 0
// Record the fiber currently running on the calling thread.  Aborts if
// the TLS keys have not been created yet.
COMMON_SYSDEP
void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber)
{
    if (__builtin_expect(!cilk_keys_defined, 0))
        abort();

    int status = pthread_setspecific(fiber_key, fiber);
    CILK_ASSERT (status == 0);
}
#endif
289 | ||
#else
// CILK_WORKER_TLS builds: no pthread keys to create here.  Presumably the
// worker lives in compiler-supported thread-local storage declared
// elsewhere — TODO confirm against the CILK_WORKER_TLS definition site.
void __cilkrts_init_tls_variables(void)
{
}
#endif
295 | ||
b1cd42c5 | 296 | #if defined (__linux__) && ! defined(__ANDROID__) |
3038054c BI |
/*
 * Return the kernel thread id rather than the pid.  In the case of MIC
 * offload multiple threads may enter Cilk, each with a different affinity
 * mask, so the per-thread id is the correct handle.
 */
static pid_t linux_gettid(void)
{
    return (pid_t) syscall(SYS_gettid);
}
306 | ||
/*
 * On Linux we look at the thread affinity mask and restrict ourself to one
 * thread for each of the hardware contexts to which we are bound.
 * Therefore if user does
 *      % taskset 0-1 cilkProgram
 *      # restrict execution to hardware contexts zero and one
 * the Cilk program will only use two threads even if it is running on a
 * machine that has 32 hardware contexts.
 * This is the right thing to do, because the threads are restricted to two
 * hardware contexts by the affinity mask set by taskset, and if we were to
 * create extra threads they would simply oversubscribe the hardware resources
 * we can use.
 * This is particularly important on MIC in offload mode, where the affinity
 * mask is set by the offload library to force the offload code away from
 * cores that have offload support threads running on them.
 *
 * Returns the number of CPUs in this thread's affinity mask, falling back
 * to the online-CPU count when the mask cannot be read (or when
 * HAVE_PTHREAD_AFFINITY_NP is not defined).
 */
static int linux_get_affinity_count ()
{
    long system_cores = sysconf(_SC_NPROCESSORS_ONLN);
    int affinity_cores = 0;

#if defined HAVE_PTHREAD_AFFINITY_NP

#if defined (CPU_ALLOC_SIZE) && ! defined(DONT_USE_CPU_ALLOC_SIZE)
    // Statically allocated cpu_set_t's max out at 1024 cores.  If
    // CPU_ALLOC_SIZE is available, use it to support large numbers of cores
    size_t cpusetsize = CPU_ALLOC_SIZE(system_cores);
    cpu_set_t *process_mask = (cpu_set_t *)__cilkrts_malloc(cpusetsize);

    // Get the affinity mask for this thread
    int err = pthread_getaffinity_np(pthread_self(),
                                     cpusetsize,
                                     process_mask);

    // Count the available cores.
    if (0 == err)
        affinity_cores = CPU_COUNT_S(cpusetsize, process_mask);

    __cilkrts_free(process_mask);

#else
    // CPU_ALLOC_SIZE isn't available, or this is the Intel compiler build
    // and we have to support RHEL5.  Use a statically allocated cpu_set_t

    cpu_set_t process_mask;

    // Extract the thread affinity mask
    int err = pthread_getaffinity_np(pthread_self(),
                                     sizeof(process_mask),
                                     &process_mask);

    if (0 == err)
    {
        // We have extracted the mask OK, so now we can count the number of
        // threads in it.  This is linear in the maximum number of CPUs
        // available, We could do a logarithmic version, if we assume the
        // format of the mask, but it's not really worth it. We only call
        // this at thread startup anyway.
        int i;
        for (i = 0; i < CPU_SETSIZE; i++)
        {
            if (CPU_ISSET(i, &process_mask))
            {
                affinity_cores++;
            }
        }
    }
#endif // CPU_ALLOC_SIZE
#endif // ! defined HAVE_PTHREAD_AFFINITY_NP

    // If we've got a count of cores this thread is supposed to use, that's
    // the number or cores we'll use.  Otherwise, default to the number of
    // cores on the system.
    // NOTE(review): system_cores is long and is narrowed to int here;
    // harmless for realistic core counts but worth confirming.
    if (0 == affinity_cores)
        return system_cores;
    else
        return affinity_cores;
}
b1cd42c5 | 385 | #endif // defined (__linux__) && ! defined(__ANDROID__) |
3038054c BI |
386 | |
/*
 * __cilkrts_hardware_cpu_count
 *
 * Returns the number of available CPUs on this hardware. This is architecture-
 * specific.  On Linux the thread's affinity mask is honored; on other OSes
 * the online-processor count (or its platform equivalent) is used directly.
 */

COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void)
{
#if defined __ANDROID__ || \
    defined __CYGWIN__ || \
    defined __DragonFly__ || \
    defined __FreeBSD__ || \
    (defined(__sun__) && defined(__svr4__))
    // POSIX: count of processors currently online.
    return (int)sysconf(_SC_NPROCESSORS_ONLN);
#elif defined __MIC__
    /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial
    /// on KNC.  Also, ignore the last core.
    int count = (int)sysconf (_SC_NPROCESSORS_ONLN);
    return count/2 - 2;
#elif defined __linux__
    // Respect the thread's CPU affinity mask (e.g. taskset, MIC offload).
    return linux_get_affinity_count();
#elif defined __APPLE__
    int count;
    size_t len = sizeof count;
    // hw.logicalcpu: logical processors available in the current power
    // management mode.
    int status = sysctlbyname("hw.logicalcpu", &count, &len, 0, 0);
    assert(0 == status);

    return count;
#elif defined __VXWORKS__
    // One bit per enabled CPU in the returned mask.
    return __builtin_popcount(vxCpuEnabledGet());
#else
#error "Unsupported architecture"
#endif
}
422 | ||
// Back off aggressively while a worker has no work to steal.  Unlike
// __cilkrts_yield (a plain quantum yield), this sleeps/delays so that idle
// workers stop burning CPU.
COMMON_SYSDEP void __cilkrts_idle(void)
{
    // This is another version of __cilkrts_yield() to be used when
    // silencing workers that are not stealing work.
#if defined(__ANDROID__) || \
    defined(__FreeBSD__) || \
    defined(__VXWORKS__) || \
    (defined(__sun__) && defined(__svr4__))
    sched_yield();
#elif defined(__MIC__)
    // Spin-delay ~1024 cycles rather than calling into the scheduler.
    _mm_delay_32(1024);
#elif defined(__linux__) || \
    defined(__APPLE__)  || \
    defined(__CYGWIN__)

    // Sleep 10ms so an idle worker releases the CPU entirely.
    usleep(10000);
#else
# error "Unsupported architecture"
#endif
}
443 | ||
// Sleep for the smallest schedulable interval: one tick on VxWorks,
// one microsecond (in practice, at least one scheduler quantum) elsewhere.
COMMON_SYSDEP void __cilkrts_sleep(void)
{
#ifdef __VXWORKS__
    taskDelay(1);
#else
    usleep(1);
#endif
}
452 | ||
// Give up the rest of this thread's scheduling quantum without sleeping.
COMMON_SYSDEP void __cilkrts_yield(void)
{
#if defined(__ANDROID__) || \
    defined(__APPLE__) || \
    defined(__CYGWIN__) || \
    defined(__FreeBSD__) || \
    defined(__VXWORKS__) || \
    (defined(__sun__) && defined(__svr4__))
    // Call sched_yield to yield quantum.  I'm not sure why we
    // don't do this on Linux also.
    sched_yield();
#elif defined(__MIC__)
    // On MIC, pthread_yield() really trashes things.  Arch's measurements
    // showed that calling _mm_delay_32() (or doing nothing) was a better
    // option.  Delaying 1024 clock cycles is a reasonable compromise between
    // giving up the processor and latency starting up when work becomes
    // available
    _mm_delay_32(1024);
#elif defined(__linux__)
    // On Linux, call pthread_yield (which in turn will call sched_yield)
    // to yield quantum.
    // NOTE(review): pthread_yield is deprecated in glibc >= 2.34 in favor
    // of the standard sched_yield — consider switching; verify against
    // the supported toolchain baseline.
    pthread_yield();
#else
# error "Unsupported architecture"
#endif
}
479 | ||
480 | COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen, | |
481 | const char* varname) | |
482 | { | |
483 | CILK_ASSERT(value); | |
484 | CILK_ASSERT(varname); | |
485 | ||
486 | const char* envstr = getenv(varname); | |
487 | if (envstr) | |
488 | { | |
2e01cda6 | 489 | size_t len = cilk_strlen(envstr); |
3038054c BI |
490 | if (len > vallen - 1) |
491 | return len + 1; | |
2e01cda6 | 492 | cilk_strcpy_s(value, vallen, envstr); |
3038054c BI |
493 | return len; |
494 | } | |
495 | else | |
496 | { | |
497 | value[0] = '\0'; | |
498 | return 0; | |
499 | } | |
500 | } | |
501 | ||
502 | /* | |
503 | * Unrecoverable error: Print an error message and abort execution. | |
504 | */ | |
505 | COMMON_SYSDEP void cilkos_error(const char *fmt, ...) | |
506 | { | |
507 | va_list l; | |
508 | fflush(NULL); | |
509 | fprintf(stderr, "Cilk error: "); | |
510 | va_start(l, fmt); | |
511 | vfprintf(stderr, fmt, l); | |
512 | va_end(l); | |
513 | fprintf(stderr, "Exiting.\n"); | |
514 | fflush(stderr); | |
515 | ||
516 | abort(); | |
517 | } | |
518 | ||
519 | /* | |
520 | * Print a warning message and return. | |
521 | */ | |
522 | COMMON_SYSDEP void cilkos_warning(const char *fmt, ...) | |
523 | { | |
524 | va_list l; | |
525 | fflush(NULL); | |
526 | fprintf(stderr, "Cilk warning: "); | |
527 | va_start(l, fmt); | |
528 | vfprintf(stderr, fmt, l); | |
529 | va_end(l); | |
530 | fflush(stderr); | |
531 | } | |
532 | ||
2e01cda6 IV |
#ifdef __VXWORKS__
#ifdef _WRS_KERNEL
// VxWorks kernel build: no automatic constructor mechanism here, so the
// application must call cilkStart() explicitly — presumably at image
// startup; confirm against the VxWorks integration docs.
void cilkStart()
{
    __cilkrts_init_tls_variables();
}
#else
// VxWorks user-side build: run at module load via the VxWorks constructor
// macro (priority 100).
_WRS_CONSTRUCTOR(cilkInit, 100)
{
    __cilkrts_init_tls_variables();
}
#endif
#else
// All other platforms: GCC constructor runs when the runtime library is
// loaded, before main, so the TLS keys exist before any Cilk code runs.
static void __attribute__((constructor)) init_once()
{
    /*__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED);*/
    __cilkrts_init_tls_variables();
}
#endif
3038054c BI |
552 | |
553 | ||
554 | #define PAGE 4096 | |
555 | #define CILK_MIN_STACK_SIZE (4*PAGE) | |
556 | // Default size for the stacks that we create in Cilk for Unix. | |
557 | #define CILK_DEFAULT_STACK_SIZE 0x100000 | |
558 | ||
559 | /* | |
560 | * Convert the user's specified stack size into a "reasonable" value | |
561 | * for this OS. | |
562 | */ | |
563 | size_t cilkos_validate_stack_size(size_t specified_stack_size) { | |
564 | // Convert any negative value to the default. | |
565 | if (specified_stack_size == 0) { | |
566 | CILK_ASSERT((CILK_DEFAULT_STACK_SIZE % PAGE) == 0); | |
567 | return CILK_DEFAULT_STACK_SIZE; | |
568 | } | |
569 | // Round values in between 0 and CILK_MIN_STACK_SIZE up to | |
570 | // CILK_MIN_STACK_SIZE. | |
571 | if (specified_stack_size <= CILK_MIN_STACK_SIZE) { | |
572 | return CILK_MIN_STACK_SIZE; | |
573 | } | |
574 | if ((specified_stack_size % PAGE) > 0) { | |
575 | // Round the user's stack size value up to nearest page boundary. | |
576 | return (PAGE * (1 + specified_stack_size / PAGE)); | |
577 | } | |
578 | return specified_stack_size; | |
579 | } | |
580 | ||
/* Atomically add x to *p and return the resulting value (full barrier,
 * GCC __sync builtin). */
long cilkos_atomic_add(volatile long* p, long x)
{
    long updated = __sync_add_and_fetch(p, x);
    return updated;
}
585 | ||
586 | /* End os-unix.c */ |