From: VMware, Inc <> Date: Wed, 18 Sep 2013 03:18:47 +0000 (-0700) Subject: Avoid a memory dereference when getting the current thread ID. X-Git-Tag: 2013.09.16-1328054~97 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=fb4b3833080ba2ebd1dbafc8334f0cb10e181197;p=thirdparty%2Fopen-vm-tools.git Avoid a memory dereference when getting the current thread ID. As part of some recent benchmarking and ensuing discussions a couple of overheads in VThread_CurID have been identified. (1) VThread_CurID ultimately gets a pointer to a per thread structure that contains various bits of state including the thread ID. After getting that pointer it must read the thread ID out of memory. This could be avoided if the thread local value we stored was the thread ID itself. (2) Before calling into the host's API for thread local storage, vthreadBase.c must get the host key, check if it is initialized and initialize it if necessary. Then once it gets the thread local value from the host API it must check if that has been initialized. These checks could be avoided if we forced clients to initialize the thread before calling functions like VThread_CurID. (3) On some (all?) Linux pthread implementations pthread_getspecific itself can be heavyweight. Both Windows and Linux offer alternatives to make thread local storage cheaper. (OS X on the other hand provides a very fast implementation of pthread_getspecific: pretty much one instruction plus the function call overhead.) The first two overheads came up in my profiling of USB workloads on OS X while Kevin raised the third issue in the following discussions. This change attempts to eliminate the first overhead, but to do so in a way that helps set up the code for attacking the remaining two. In particular it introduces a second thread local variable to store the thread ID. For the time I left the thread ID in the other structure as well and verify that they stay in sync. We could get rid of it, but it's low cost and I suspect (though I have no proof) that it could be useful in debugging. As part of this I've rearranged some of the initialization code both for the TLS keys as well as for initializing the TLS data. This is useful because I wanted both pieces of state to get set together (and mutated together -- yes our threads change IDs during their life). And, with respect to the pthread keys, it's important to make sure the base key gets allocated before the thread ID key. This ensures it gets destroyed first which allows us to keep an ASSERT to make sure that the two thread IDs stay in sync. This does run into one wrinkle because of the lazy thread initialization. The default value for uninitialized state is NULL which, inconveniently, is a valid vthread ID. So instead of actually storing the thread ID, we store the thread ID incremented by one. This means the default/uninitialized value will show up as -1. We can avoid this after addressing (2) from above, but in the meantime trading a memory read for an ALU operation is still a nice win. Signed-off-by: Dmitry Torokhov --- diff --git a/open-vm-tools/lib/misc/vthreadBase.c b/open-vm-tools/lib/misc/vthreadBase.c index 84964a7e1..6b0dac06b 100644 --- a/open-vm-tools/lib/misc/vthreadBase.c +++ b/open-vm-tools/lib/misc/vthreadBase.c @@ -118,18 +118,21 @@ typedef pthread_key_t VThreadBaseKeyType; #endif #endif +static void VThreadBaseInit(void); static void VThreadBaseSimpleNoID(void); static void VThreadBaseSimpleFreeID(void *tlsData); static void VThreadBaseSafeDeleteTLS(void *data); static struct { - Atomic_Int key; + Atomic_Int baseKey; + Atomic_Int threadIDKey; Atomic_Int dynamicID; Atomic_Int numThreads; Atomic_Ptr nativeHash; void (*noIDFunc)(void); void (*freeIDFunc)(void *); } vthreadBaseGlobals = { + { VTHREADBASE_INVALID_KEY }, { VTHREADBASE_INVALID_KEY }, { VTHREAD_ALLOCSTART_ID }, { 0 }, @@ -138,6 +141,11 @@ static struct { VThreadBaseSimpleFreeID, }; +typedef enum VThreadLocal { + VTHREAD_LOCAL_BASE, + VTHREAD_LOCAL_ID +} VThreadLocal; + /* * Code to mask all asynchronous signals @@ -172,62 +180,39 @@ static struct { } while(0) #endif - /* *----------------------------------------------------------------------------- * - * VThreadBaseGetKey -- - * - * Get the host-specific TLS slot. - * - * Failure to allocate a TLS slot is immediately fatal. Note that - * a TLS slot is generally allocated at the first of: - * - VThread_Init() (e.g. uses lib/thread) - * - VThread_SetName() - * - a Posix signal - * - a lock acquisition - * Since most Panic paths do look up a thread name (and thus need a TLS - * slot), a program that does not want to Panic-loop should call - * one of the above functions very early to "prime" the TLS slot. - * - * Results: - * OS-specific TLS key. + * VThreadBaseInitKeysWork -- * - * Side effects: - * None. + * Helper function to be only used by VThreadBaseInitKeys. * *----------------------------------------------------------------------------- */ -static VThreadBaseKeyType -VThreadBaseGetKey(void) +static void +VThreadBaseInitKeyWork(Atomic_Int *key, void (*deletePtr)(void*)) { - VThreadBaseKeyType key = Atomic_Read(&vthreadBaseGlobals.key); - - if (key == VTHREADBASE_INVALID_KEY) { + if (Atomic_Read(key) == VTHREADBASE_INVALID_KEY) { VThreadBaseKeyType newKey; #if defined _WIN32 newKey = TlsAlloc(); ASSERT_NOT_IMPLEMENTED(newKey != VTHREADBASE_INVALID_KEY); #else - Bool success = pthread_key_create(&newKey, - &VThreadBaseSafeDeleteTLS) == 0; + Bool success = pthread_key_create(&newKey, deletePtr) == 0; if (success && newKey == 0) { /* * Leak TLS key 0. System libraries have a habit of destroying * it. See bugs 702818 and 773420. */ - - success = pthread_key_create(&newKey, - &VThreadBaseSafeDeleteTLS) == 0; + success = pthread_key_create(&newKey, deletePtr) == 0; } ASSERT_NOT_IMPLEMENTED(success); #endif - if (Atomic_ReadIfEqualWrite(&vthreadBaseGlobals.key, - VTHREADBASE_INVALID_KEY, - newKey) != VTHREADBASE_INVALID_KEY) { + if (Atomic_ReadIfEqualWrite(key, VTHREADBASE_INVALID_KEY, newKey) != + VTHREADBASE_INVALID_KEY) { /* Race: someone else init'd */ #if defined _WIN32 TlsFree(newKey); @@ -235,79 +220,191 @@ VThreadBaseGetKey(void) pthread_key_delete(newKey); #endif } - - key = Atomic_Read(&vthreadBaseGlobals.key); - ASSERT(key != VTHREADBASE_INVALID_KEY); } - - return key; } - /* *----------------------------------------------------------------------------- * - * VThreadBaseGetNativeHash -- + * VThreadBaseInitKeys -- * - * Get the default hash table of native thread IDs. This is used by - * the "simple" allocation function to enable re-using of VThreadIDs. + * Initialize the host-specific TLS slots. * - * Results: - * An atomic HashTable *. + * Failure to allocate a TLS slot is immediately fatal. Note that + * a TLS slot is generally allocated at the first of: + * - VThread_Init() (e.g. uses lib/thread) + * - VThread_SetName() + * - a Posix signal + * - a lock acquisition + * Since most Panic paths do look up a thread name (and thus need a TLS + * slot), a program that does not want to Panic-loop should call + * one of the above functions very early to "prime" the TLS slot. * - * Side effects: - * Allocates the HashTable on first call. + * Results: + * None. * *----------------------------------------------------------------------------- */ -static HashTable * -VThreadBaseGetNativeHash(void) +static void +VThreadBaseInitKeys(void) { - return HashTable_AllocOnce(&vthreadBaseGlobals.nativeHash, - 128, HASH_INT_KEY | HASH_FLAG_ATOMIC, NULL); + VThreadBaseInitKeyWork(&vthreadBaseGlobals.baseKey, + &VThreadBaseSafeDeleteTLS); + VThreadBaseInitKeyWork(&vthreadBaseGlobals.threadIDKey, NULL); +} + + +static INLINE Bool +VThreadBaseAreKeysInited(void) +{ + return Atomic_Read(&vthreadBaseGlobals.baseKey) != VTHREADBASE_INVALID_KEY && + Atomic_Read(&vthreadBaseGlobals.threadIDKey) != VTHREADBASE_INVALID_KEY; } /* *----------------------------------------------------------------------------- * - * VThreadBaseRaw -- + * VThreadBaseSetLocal -- * - * Get the per-thread data, but do not assign data if not present. + * Sets the specified thread local variable to the supplied + * value. This abstracts between pthread, the Windows TLS + * supports, and the use of __thread. Requires that the TLS keys + * are initialized before calling this function. We encourage + * the compiler to inline this with the expectation that "local" + * is a compile time constant. * - * Inlined; hitting TLS needs to be a fastpath. + * To facilitate lazy initialization we special case + * VTHREAD_LOCAL_ID slightly and arrange so that if its value is + * read before it is initialized we return -1 instead of 0 (since + * 0 is used as a real thread id). * - * Results: - * A VThreadBaseData *, or NULL if thread has not been initialized. + *----------------------------------------------------------------------------- + */ + +static INLINE Bool +VThreadBaseSetLocal(VThreadLocal local, void *value) +{ + VThreadBaseKeyType key; + Bool success; + ASSERT(VThreadBaseAreKeysInited()); + if (local == VTHREAD_LOCAL_BASE) { + key = Atomic_Read(&vthreadBaseGlobals.baseKey); + } else { + ASSERT(local == VTHREAD_LOCAL_ID); + /* VThreadGetLocal compensates for this, lets the default be -1. */ + value = (void*)((uintptr_t)value + 1); + key = Atomic_Read(&vthreadBaseGlobals.threadIDKey); + } + ASSERT(key != VTHREADBASE_INVALID_KEY); +#if defined _WIN32 + success = TlsSetValue(key, value); +#else + success = pthread_setspecific(key, value) == 0; +#endif + return success; +} + + +/* + *----------------------------------------------------------------------------- * - * Side effects: - * Does NOT assign data to the TLS slot. + * VThreadBaseGetLocal -- + * + * Retrives the specified thread local variable. This abstracts + * between pthread, the Windows TLS supports, and the use of + * __thread. See VThreadSetLocal for more details. * *----------------------------------------------------------------------------- */ -static INLINE VThreadBaseData * -VThreadBaseRaw(void) +static INLINE void * +VThreadBaseGetLocal(VThreadLocal local) { - VThreadBaseKeyType key = Atomic_Read(&vthreadBaseGlobals.key); + VThreadBaseKeyType key; + Atomic_Int *keyPtr; + void *result; + + ASSERT(local == VTHREAD_LOCAL_BASE || local == VTHREAD_LOCAL_ID); + keyPtr = local == VTHREAD_LOCAL_BASE ? &vthreadBaseGlobals.baseKey : + &vthreadBaseGlobals.threadIDKey; + key = Atomic_Read(keyPtr); if (UNLIKELY(key == VTHREADBASE_INVALID_KEY)) { - key = VThreadBaseGetKey(); /* Non-inlined slow path */ - } + VThreadBaseInitKeys(); + key = Atomic_Read(keyPtr); + } + ASSERT(VThreadBaseAreKeysInited()); #if defined _WIN32 - return (VThreadBaseData *) TlsGetValue(key); + result = TlsGetValue(key); #else - return (VThreadBaseData *) pthread_getspecific(key); + result = pthread_getspecific(key); #endif + + if (local == VTHREAD_LOCAL_ID) { + result = (void*)((uintptr_t)result - 1); /* See VThreadBaseSetLocal. */ + } + + return result; +} + + +static INLINE VThreadBaseData * +VThreadBaseGetBase(void) +{ + return (VThreadBaseData *)VThreadBaseGetLocal(VTHREAD_LOCAL_BASE); +} + +static INLINE Bool +VThreadBaseSetBase(VThreadBaseData *base) +{ + return VThreadBaseSetLocal(VTHREAD_LOCAL_BASE, base); +} + +static INLINE VThreadID +VThreadBaseGetID(void) +{ + return (VThreadID)(uintptr_t)VThreadBaseGetLocal(VTHREAD_LOCAL_ID); +} + +static INLINE Bool +VThreadBaseSetID(VThreadID id) +{ + return VThreadBaseSetLocal(VTHREAD_LOCAL_ID, (void*)(uintptr_t)id); } /* *----------------------------------------------------------------------------- * - * VThreadBaseCooked -- + * VThreadBaseGetNativeHash -- + * + * Get the default hash table of native thread IDs. This is used by + * the "simple" allocation function to enable re-using of VThreadIDs. + * + * Results: + * An atomic HashTable *. + * + * Side effects: + * Allocates the HashTable on first call. + * + *----------------------------------------------------------------------------- + */ + +static HashTable * +VThreadBaseGetNativeHash(void) +{ + return HashTable_AllocOnce(&vthreadBaseGlobals.nativeHash, + 128, HASH_INT_KEY | HASH_FLAG_ATOMIC, NULL); +} + + +/* + *----------------------------------------------------------------------------- + * + * VThreadGetAndInitBase -- * * Get the per-thread data, and assign if there is no data. * @@ -321,32 +418,17 @@ VThreadBaseRaw(void) */ static VThreadBaseData * -VThreadBaseCooked(void) +VThreadBaseGetAndInitBase(void) { - VThreadBaseData *base = VThreadBaseRaw(); + VThreadBaseData *base = VThreadBaseGetBase(); - ASSERT(vthreadBaseGlobals.noIDFunc); + ASSERT(vthreadBaseGlobals.noIDFunc != NULL); if (UNLIKELY(base == NULL)) { - /* Just saw a new thread. Ensure atomics are correct... */ - Atomic_Init(); - - /* - * The code between the last pthread_getspecific and the eventual - * call to pthread_setspecific either needs to run with async signals - * blocked or tolerate reentrancy. Simpler to just block the signals. - * See bugs 295686 & 477318. Here, the problem is that we could allocate - * two VThreadIDs (via a signal during the NoID callback). - */ - NO_ASYNC_SIGNALS_START; - if (VThreadBaseRaw() == NULL) { - (*vthreadBaseGlobals.noIDFunc)(); - } - NO_ASYNC_SIGNALS_END; - - base = VThreadBaseRaw(); - ASSERT(base); + VThreadBaseInit(); + base = VThreadBaseGetBase(); } + ASSERT(base != NULL); return base; } @@ -371,7 +453,14 @@ VThreadBaseCooked(void) VThreadID VThreadBase_CurID(void) { - return VThreadBaseCooked()->id; + VThreadID tid = VThreadBaseGetID(); + if (UNLIKELY(tid == VTHREAD_INVALID_ID)) { + VThreadBaseInit(); + tid = VThreadBaseGetID(); + } + ASSERT(tid != VTHREAD_INVALID_ID); + ASSERT(tid == VThreadBaseGetAndInitBase()->id); + return tid; } @@ -400,14 +489,14 @@ const char * VThreadBase_CurName(void) { static Atomic_Int curNameRecursion; - VThreadBaseData *base = VThreadBaseRaw(); + VThreadBaseData *base = VThreadBaseGetBase(); if (LIKELY(base != NULL)) { return base->name; } else if (Atomic_Read(&curNameRecursion) == 0) { /* Unnamed thread, try to name it. */ Atomic_Inc(&curNameRecursion); - base = VThreadBaseCooked(); /* Assigns name as side effect */ + base = VThreadBaseGetAndInitBase(); /* Assigns name as side effect */ Atomic_Dec(&curNameRecursion); return base->name; @@ -481,14 +570,12 @@ Bool VThreadBase_InitWithTLS(VThreadBaseData *base) // IN: caller-managed storage { Bool firstTime, success; - /* Require key allocation before TLS read */ - VThreadBaseKeyType key = VThreadBaseGetKey(); + VThreadBaseInitKeys(); /* Require key allocation before TLS read */ ASSERT(base != NULL && base->id != VTHREAD_INVALID_ID); NO_ASYNC_SIGNALS_START; - if (VThreadBaseRaw() == NULL) { -#if defined _WIN32 + if (VThreadBaseGetBase() == NULL) { /* * Windows potentially has the async-signal problem mentioned * below due to APCs, but in practice it will not happen: @@ -496,16 +583,13 @@ VThreadBase_InitWithTLS(VThreadBaseData *base) // IN: caller-managed storage * and we obviously do not make those between TlsGetValue and * TlsSetValue. */ - success = TlsSetValue(key, base); -#else /* * The code between the check of pthread_getspecific and the eventually * call to pthread_setspecific MUST run with async signals blocked, see * bugs 295686 & 477318. Here, the problem is that we could * accidentally set the TLS slot twice (it's not atomic). */ - success = pthread_setspecific(key, base) == 0; -#endif + success = VThreadBaseSetBase(base) && VThreadBaseSetID(base->id); firstTime = TRUE; } else { success = TRUE; @@ -514,12 +598,12 @@ VThreadBase_InitWithTLS(VThreadBaseData *base) // IN: caller-managed storage NO_ASYNC_SIGNALS_END; /* Try not to ASSERT while signals are blocked */ ASSERT_NOT_IMPLEMENTED(success); - ASSERT(!firstTime || (base == VThreadBaseRaw())); + ASSERT(!firstTime || (base == VThreadBaseGetBase())); if (firstTime) { Atomic_Inc(&vthreadBaseGlobals.numThreads); } else { - VThreadBaseData *realBase = VThreadBaseRaw(); + VThreadBaseData *realBase = VThreadBaseGetBase(); /* * If this happens, it means: @@ -568,7 +652,6 @@ VThreadBaseSafeDeleteTLS(void *tlsData) if (data != NULL) { if (vthreadBaseGlobals.freeIDFunc != NULL) { - VThreadBaseKeyType key = VThreadBaseGetKey(); Bool success; VThreadBaseData tmpData = *data; @@ -578,11 +661,7 @@ VThreadBaseSafeDeleteTLS(void *tlsData) * enough for the VThreadBase services, clean up, then clear the * TLS slot. */ -#if defined _WIN32 - success = TlsSetValue(key, &tmpData); -#else - success = pthread_setspecific(key, &tmpData) == 0; -#endif + success = VThreadBaseSetBase(&tmpData); ASSERT_NOT_IMPLEMENTED(success); if (vmx86_debug) { @@ -590,11 +669,7 @@ VThreadBaseSafeDeleteTLS(void *tlsData) } (*vthreadBaseGlobals.freeIDFunc)(data); -#if defined _WIN32 - success = TlsSetValue(key, NULL); -#else - success = pthread_setspecific(key, NULL) == 0; -#endif + success = VThreadBaseSetBase(NULL) && VThreadBaseSetID(0); ASSERT_NOT_IMPLEMENTED(success); } Atomic_Dec(&vthreadBaseGlobals.numThreads); @@ -625,18 +700,7 @@ VThreadBaseSafeDeleteTLS(void *tlsData) void VThreadBase_ForgetSelf(void) { - VThreadBaseKeyType key = VThreadBaseGetKey(); - VThreadBaseData *data = VThreadBaseRaw(); - Bool success; - -#if defined _WIN32 - success = TlsSetValue(key, NULL); -#else - success = pthread_setspecific(key, NULL) == 0; -#endif - ASSERT_NOT_IMPLEMENTED(success); - - VThreadBaseSafeDeleteTLS(data); + VThreadBaseSafeDeleteTLS(VThreadBaseGetBase()); } @@ -664,7 +728,7 @@ void VThreadBase_SetName(const char *name) // IN: new name { uint32 len = strlen(name); - VThreadBaseData *base = VThreadBaseCooked(); + VThreadBaseData *base = VThreadBaseGetAndInitBase(); ASSERT(name); @@ -800,7 +864,7 @@ VThreadBaseSimpleNoID(void) VThreadBaseData *base; /* Require key allocation before TLS read */ - VThreadBaseGetKey(); + ASSERT(VThreadBaseAreKeysInited()); /* Before allocating a new ID, try to reclaim any old IDs. */ for (newID = 0; @@ -944,6 +1008,39 @@ VThreadBase_SetNoIDFunc(void (*hookFunc)(void), // IN: new hook function } +/* + *----------------------------------------------------------------------------- + * + * VThreadBaseInit -- + * + * Performs basic initialization of this thread including setting + * up the thread local storage and assigning a thread id. + * + *----------------------------------------------------------------------------- + */ + +static void +VThreadBaseInit(void) +{ + Atomic_Init(); + VThreadBaseInitKeys(); + + /* + * The code between the last pthread_getspecific and the eventual + * call to pthread_setspecific either needs to run with async signals + * blocked or tolerate reentrancy. Simpler to just block the signals. + * See bugs 295686 & 477318. Here, the problem is that we could allocate + * two VThreadIDs (via a signal during the NoID callback). + */ + + NO_ASYNC_SIGNALS_START; + if (VThreadBaseGetBase() == NULL) { + (*vthreadBaseGlobals.noIDFunc)(); + } + NO_ASYNC_SIGNALS_END; +} + + #if !defined _WIN32 /* *----------------------------------------------------------------------------- @@ -965,7 +1062,7 @@ VThreadBase_SetNoIDFunc(void (*hookFunc)(void), // IN: new hook function Bool VThreadBase_IsInSignal(void) { - VThreadBaseData *base = VThreadBaseCooked(); + VThreadBaseData *base = VThreadBaseGetAndInitBase(); return Atomic_Read(&base->signalNestCount) > 0; } @@ -992,7 +1089,7 @@ void VThreadBase_SetIsInSignal(VThreadID tid, // IN: Bool isInSignal) // IN: { - VThreadBaseData *base = VThreadBaseCooked(); + VThreadBaseData *base = VThreadBaseGetAndInitBase(); /* It is an error to clear isInSignal while not in a signal. */ ASSERT(Atomic_Read(&base->signalNestCount) > 0 || isInSignal);