From: Georgi Valkov
Date: Wed, 10 Jul 2024 14:29:09 +0000 (+0300)
Subject: threads_pthread, threads_win: improve code consistency
X-Git-Tag: openssl-3.4.0-alpha1~324
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ce6b2f98263712b2ccb4559117cbd480c552894b;p=thirdparty%2Fopenssl.git

threads_pthread, threads_win: improve code consistency

Improve code consistency between threads_pthread.c and threads_win.c.

threads_pthread.c has good comments; let's copy them to threads_win.c.

In many places uint64_t or LONG int was used, and assignments were
performed between variables of different sizes. Unify the code to use
uint32_t. On 32-bit architectures it is easier to perform 32-bit atomic
operations. The size is large enough to hold the list of operations.

Fix the result of atomic_or_uint_nv being improperly cast to int *
instead of int.

Note: In general size_t should be preferred for sizes and indexes, due
to its descriptive name; however, it is more convenient to use uint32_t
for consistency between platforms and atomic calls.

READER_COUNT and ID_VAL return results that fit in 32 bits. Cast them
to uint32_t to save a few CPU cycles, since they are used in 32-bit
operations anyway.

TODO: In struct rcu_lock_st, qp_group can be moved before id_ctr for
better alignment, which would save 8 bytes.

allocate_new_qp_group has a parameter count of type int. Signed values
should be avoided as sizes or indexes; it is better to use an unsigned
type, e.g. uint32_t, even though internally this is assigned to a
uint32_t variable.

READER_SIZE is 16 in threads_pthread.c and 32 in threads_win.c.
Using a common size for consistency should be preferred.

Signed-off-by: Georgi Valkov
Reviewed-by: Neil Horman
Reviewed-by: Tomas Mraz
(Merged from https://github.com/openssl/openssl/pull/24803)
---

diff --git a/crypto/threads_pthread.c b/crypto/threads_pthread.c
index 7b545999478..14b1dbf6c20 100644
--- a/crypto/threads_pthread.c
+++ b/crypto/threads_pthread.c
@@ -114,6 +114,7 @@ static inline void *apple_atomic_load_n_pvoid(void **p,
 }
 
 /* For uint64_t, we should be fine, though */
+# define apple_atomic_load_n_uint32_t(p, o) __atomic_load_n(p, o)
 # define apple_atomic_load_n_uint64_t(p, o) __atomic_load_n(p, o)
 
 # define ATOMIC_LOAD_N(t, p, o) apple_atomic_load_n_##t(p, o)
@@ -141,6 +142,7 @@ static pthread_mutex_t atomic_sim_lock = PTHREAD_MUTEX_INITIALIZER;
         pthread_mutex_unlock(&atomic_sim_lock); \
         return ret; \
     }
+IMPL_fallback_atomic_load_n(uint32_t)
 IMPL_fallback_atomic_load_n(uint64_t)
 IMPL_fallback_atomic_load_n(pvoid)
 
@@ -157,6 +159,7 @@ IMPL_fallback_atomic_load_n(pvoid)
         pthread_mutex_unlock(&atomic_sim_lock); \
         return ret; \
     }
+IMPL_fallback_atomic_store_n(uint32_t)
 IMPL_fallback_atomic_store_n(uint64_t)
 
 # define ATOMIC_STORE_N(t, p, v, o) fallback_atomic_store_n_##t(p, v)
@@ -265,17 +268,19 @@ static ossl_inline uint64_t fallback_atomic_or_fetch(uint64_t *p, uint64_t m)
 
 /*
  * users is broken up into 2 parts
  * bits 0-15 current readers
- * bit 32-63 - ID
+ * bit 32-63 ID
  */
 # define READER_SHIFT 0
 # define ID_SHIFT 32
+/* TODO: READER_SIZE 32 in threads_win.c */
 # define READER_SIZE 16
 # define ID_SIZE 32
 
 # define READER_MASK (((uint64_t)1 << READER_SIZE) - 1)
 # define ID_MASK (((uint64_t)1 << ID_SIZE) - 1)
-# define READER_COUNT(x) (((uint64_t)(x) >> READER_SHIFT) & READER_MASK)
-# define ID_VAL(x) (((uint64_t)(x) >> ID_SHIFT) & ID_MASK)
+# define READER_COUNT(x) ((uint32_t)(((uint64_t)(x) >> READER_SHIFT) & \
+                                     READER_MASK))
+# define ID_VAL(x) ((uint32_t)(((uint64_t)(x) >> ID_SHIFT) & ID_MASK))
 # define VAL_READER 
((uint64_t)1 << READER_SHIFT) # define VAL_ID(x) ((uint64_t)x << ID_SHIFT) @@ -322,20 +327,21 @@ struct rcu_lock_st { /* rcu generation counter for in-order retirement */ uint32_t id_ctr; + /* TODO: can be moved before id_ctr for better alignment */ /* Array of quiescent points for synchronization */ struct rcu_qp *qp_group; /* Number of elements in qp_group array */ - size_t group_count; + uint32_t group_count; /* Index of the current qp in the qp_group array */ - uint64_t reader_idx; + uint32_t reader_idx; /* value of the next id_ctr value to be retired */ uint32_t next_to_retire; /* index of the next free rcu_qp in the qp_group */ - uint64_t current_alloc_idx; + uint32_t current_alloc_idx; /* number of qp's in qp_group array currently being retired */ uint32_t writers_alloced; @@ -359,7 +365,7 @@ struct rcu_lock_st { /* Read side acquisition of the current qp */ static struct rcu_qp *get_hold_current_qp(struct rcu_lock_st *lock) { - uint64_t qp_idx; + uint32_t qp_idx; /* get the current qp index */ for (;;) { @@ -375,7 +381,7 @@ static struct rcu_qp *get_hold_current_qp(struct rcu_lock_st *lock) * systems like x86, but is relevant on other arches * Note: This applies to the reload below as well */ - qp_idx = ATOMIC_LOAD_N(uint64_t, &lock->reader_idx, __ATOMIC_ACQUIRE); + qp_idx = ATOMIC_LOAD_N(uint32_t, &lock->reader_idx, __ATOMIC_ACQUIRE); /* * Notes of use of __ATOMIC_RELEASE @@ -388,7 +394,7 @@ static struct rcu_qp *get_hold_current_qp(struct rcu_lock_st *lock) __ATOMIC_RELEASE); /* if the idx hasn't changed, we're good, else try again */ - if (qp_idx == ATOMIC_LOAD_N(uint64_t, &lock->reader_idx, __ATOMIC_ACQUIRE)) + if (qp_idx == ATOMIC_LOAD_N(uint32_t, &lock->reader_idx, __ATOMIC_ACQUIRE)) break; /* @@ -494,7 +500,7 @@ void ossl_rcu_read_unlock(CRYPTO_RCU_LOCK *lock) static struct rcu_qp *update_qp(CRYPTO_RCU_LOCK *lock) { uint64_t new_id; - uint64_t current_idx; + uint32_t current_idx; pthread_mutex_lock(&lock->alloc_lock); @@ -517,10 +523,9 @@ static struct rcu_qp *update_qp(CRYPTO_RCU_LOCK *lock) (lock->current_alloc_idx + 1) % lock->group_count; /* get and insert a new id */ - new_id = lock->id_ctr; + new_id = VAL_ID(lock->id_ctr); lock->id_ctr++; - new_id = VAL_ID(new_id); /* * Even though we are under a write side lock here * We need to use atomic instructions to ensure that the results @@ -538,7 +543,7 @@ static struct rcu_qp *update_qp(CRYPTO_RCU_LOCK *lock) * of __ATOMIC_ACQUIRE in get_hold_current_qp, as we want any publication * of this value to be seen on the read side immediately after it happens */ - ATOMIC_STORE_N(uint64_t, &lock->reader_idx, lock->current_alloc_idx, + ATOMIC_STORE_N(uint32_t, &lock->reader_idx, lock->current_alloc_idx, __ATOMIC_RELEASE); /* wake up any waiters */ @@ -555,6 +560,8 @@ static void retire_qp(CRYPTO_RCU_LOCK *lock, struct rcu_qp *qp) pthread_mutex_unlock(&lock->alloc_lock); } +/* TODO: count should be unsigned, e.g uint32_t */ +/* a negative value could result in unexpected behaviour */ static struct rcu_qp *allocate_new_qp_group(CRYPTO_RCU_LOCK *lock, int count) { @@ -1006,7 +1013,7 @@ int CRYPTO_atomic_load_int(int *val, int *ret, CRYPTO_RWLOCK *lock) # elif defined(__sun) && (defined(__SunOS_5_10) || defined(__SunOS_5_11)) /* This will work for all future Solaris versions. 
*/ if (ret != NULL) { - *ret = (int *)atomic_or_uint_nv((unsigned int *)val, 0); + *ret = (int)atomic_or_uint_nv((unsigned int *)val, 0); return 1; } # endif diff --git a/crypto/threads_win.c b/crypto/threads_win.c index 4d7a1d9d1e2..0196235a13e 100644 --- a/crypto/threads_win.c +++ b/crypto/threads_win.c @@ -43,17 +43,24 @@ typedef struct { } CRYPTO_win_rwlock; # endif +/* + * users is broken up into 2 parts + * bits 0-31 current readers + * bit 32-63 ID + */ # define READER_SHIFT 0 -# define ID_SHIFT 32 -# define READER_SIZE 32 -# define ID_SIZE 32 - -# define READER_MASK (((LONG64)1 << READER_SIZE)-1) -# define ID_MASK (((LONG64)1 << ID_SIZE)-1) -# define READER_COUNT(x) (((LONG64)(x) >> READER_SHIFT) & READER_MASK) -# define ID_VAL(x) (((LONG64)(x) >> ID_SHIFT) & ID_MASK) -# define VAL_READER ((LONG64)1 << READER_SHIFT) -# define VAL_ID(x) ((LONG64)x << ID_SHIFT) +# define ID_SHIFT 32 +/* TODO: READER_SIZE 16 in threads_pthread.c */ +# define READER_SIZE 32 +# define ID_SIZE 32 + +# define READER_MASK (((uint64_t)1 << READER_SIZE) - 1) +# define ID_MASK (((uint64_t)1 << ID_SIZE) - 1) +# define READER_COUNT(x) ((uint32_t)(((uint64_t)(x) >> READER_SHIFT) & \ + READER_MASK)) +# define ID_VAL(x) ((uint32_t)(((uint64_t)(x) >> ID_SHIFT) & ID_MASK)) +# define VAL_READER ((int64_t)1 << READER_SHIFT) +# define VAL_ID(x) ((uint64_t)x << ID_SHIFT) /* * This defines a quescent point (qp) @@ -62,7 +69,7 @@ typedef struct { * atomically updated */ struct rcu_qp { - volatile LONG64 users; + volatile uint64_t users; }; struct thread_qp { @@ -89,23 +96,55 @@ struct rcu_thr_data { * it is cast from CRYPTO_RCU_LOCK */ struct rcu_lock_st { + /* Callbacks to call for next ossl_synchronize_rcu */ struct rcu_cb_item *cb_items; + + /* The context we are being created against */ OSSL_LIB_CTX *ctx; + + /* rcu generation counter for in-order retirement */ uint32_t id_ctr; + + /* TODO: can be moved before id_ctr for better alignment */ + /* Array of quiescent points for synchronization */ struct rcu_qp *qp_group; - size_t group_count; - uint32_t next_to_retire; + + /* Number of elements in qp_group array */ + uint32_t group_count; + + /* Index of the current qp in the qp_group array */ uint32_t reader_idx; + + /* value of the next id_ctr value to be retired */ + uint32_t next_to_retire; + + /* index of the next free rcu_qp in the qp_group */ uint32_t current_alloc_idx; + + /* number of qp's in qp_group array currently being retired */ uint32_t writers_alloced; + + /* lock protecting write side operations */ CRYPTO_MUTEX *write_lock; + + /* lock protecting updates to writers_alloced/current_alloc_idx */ CRYPTO_MUTEX *alloc_lock; + + /* signal to wake threads waiting on alloc_lock */ CRYPTO_CONDVAR *alloc_signal; + + /* lock to enforce in-order retirement */ CRYPTO_MUTEX *prior_lock; + + /* signal to wake threads waiting on prior_lock */ CRYPTO_CONDVAR *prior_signal; + + /* lock used with NO_INTERLOCKEDOR64: VS2010 x86 */ CRYPTO_RWLOCK *rw_lock; }; +/* TODO: count should be unsigned, e.g uint32_t */ +/* a negative value could result in unexpected behaviour */ static struct rcu_qp *allocate_new_qp_group(struct rcu_lock_st *lock, int count) { @@ -173,6 +212,7 @@ void ossl_rcu_lock_free(CRYPTO_RCU_LOCK *lock) OPENSSL_free(lock); } +/* Read side acquisition of the current qp */ static ossl_inline struct rcu_qp *get_hold_current_qp(CRYPTO_RCU_LOCK *lock) { uint32_t qp_idx; @@ -181,11 +221,12 @@ static ossl_inline struct rcu_qp *get_hold_current_qp(CRYPTO_RCU_LOCK *lock) /* get the current qp index */ for (;;) { - 
CRYPTO_atomic_load_int(&lock->reader_idx, (int *)&qp_idx, + CRYPTO_atomic_load_int((int *)&lock->reader_idx, (int *)&qp_idx, lock->rw_lock); CRYPTO_atomic_add64(&lock->qp_group[qp_idx].users, VAL_READER, &tmp64, lock->rw_lock); - CRYPTO_atomic_load_int(&lock->reader_idx, (int *)&tmp, lock->rw_lock); + CRYPTO_atomic_load_int((int *)&lock->reader_idx, (int *)&tmp, + lock->rw_lock); if (qp_idx == tmp) break; CRYPTO_atomic_add64(&lock->qp_group[qp_idx].users, -VAL_READER, &tmp64, @@ -277,6 +318,10 @@ void ossl_rcu_read_unlock(CRYPTO_RCU_LOCK *lock) } } +/* + * Write side allocation routine to get the current qp + * and replace it with a new one + */ static struct rcu_qp *update_qp(CRYPTO_RCU_LOCK *lock) { uint64_t new_id; @@ -291,9 +336,11 @@ static struct rcu_qp *update_qp(CRYPTO_RCU_LOCK *lock) * one that isn't yet being waited on */ while (lock->group_count - lock->writers_alloced < 2) + /* we have to wait for one to be free */ ossl_crypto_condvar_wait(lock->alloc_signal, lock->alloc_lock); current_idx = lock->current_alloc_idx; + /* Allocate the qp */ lock->writers_alloced++; @@ -302,10 +349,15 @@ static struct rcu_qp *update_qp(CRYPTO_RCU_LOCK *lock) (lock->current_alloc_idx + 1) % lock->group_count; /* get and insert a new id */ - new_id = lock->id_ctr; + new_id = VAL_ID(lock->id_ctr); lock->id_ctr++; - new_id = VAL_ID(new_id); + /* + * Even though we are under a write side lock here + * We need to use atomic instructions to ensure that the results + * of this update are published to the read side prior to updating the + * reader idx below + */ CRYPTO_atomic_and(&lock->qp_group[current_idx].users, ID_MASK, &tmp64, lock->rw_lock); CRYPTO_atomic_add64(&lock->qp_group[current_idx].users, new_id, &tmp64,
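
To make the reader/ID packing behind these macros concrete, here is a small
standalone sketch of how the 64-bit users word is interpreted after this
change. It reuses the macro values from threads_pthread.c above
(READER_SIZE 16); the main() harness and the sample generation id 7 are
illustrative only and are not part of the patch.

/*
 * Standalone illustration (not part of the patch): how the 64-bit
 * "users" word packs a 16-bit reader count and a 32-bit generation ID,
 * and why READER_COUNT()/ID_VAL() can safely return uint32_t.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define READER_SHIFT 0
#define ID_SHIFT     32
#define READER_SIZE  16              /* 32 in threads_win.c */
#define ID_SIZE      32

#define READER_MASK  (((uint64_t)1 << READER_SIZE) - 1)
#define ID_MASK      (((uint64_t)1 << ID_SIZE) - 1)
/* both results fit in 32 bits, so cast them down as the patch does */
#define READER_COUNT(x) ((uint32_t)(((uint64_t)(x) >> READER_SHIFT) & READER_MASK))
#define ID_VAL(x)       ((uint32_t)(((uint64_t)(x) >> ID_SHIFT) & ID_MASK))
#define VAL_READER      ((uint64_t)1 << READER_SHIFT)
#define VAL_ID(x)       ((uint64_t)(x) << ID_SHIFT)

int main(void)
{
    uint64_t users = 0;

    users += VAL_ID(7);    /* writer installs generation id 7 (sample value) */
    users += VAL_READER;   /* first reader enters the quiescent point */
    users += VAL_READER;   /* second reader enters */

    /* prints: id=7 readers=2 */
    printf("id=%" PRIu32 " readers=%" PRIu32 "\n",
           ID_VAL(users), READER_COUNT(users));

    users -= VAL_READER;   /* one reader leaves */

    /* prints: id=7 readers=1 */
    printf("id=%" PRIu32 " readers=%" PRIu32 "\n",
           ID_VAL(users), READER_COUNT(users));
    return 0;
}

Because both extracted fields fit in 32 bits, the uint32_t casts keep later
comparisons and index arithmetic in 32-bit values, which is what the commit
message means by saving a few CPU cycles on 32-bit targets.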