From: Georgi Valkov
Date: Wed, 10 Jul 2024 14:29:09 +0000 (+0300)
Subject: threads_pthread, threads_win: improve code consistency
X-Git-Tag: openssl-3.4.0-alpha1~324
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ce6b2f98263712b2ccb4559117cbd480c552894b;p=thirdparty%2Fopenssl.git

threads_pthread, threads_win: improve code consistency

Improve code consistency between threads_pthread.c and threads_win.c.

threads_pthread.c has good comments; let's copy them to threads_win.c.

In many places uint64_t or LONG int was used, and assignments were
performed between variables of different sizes. Unify the code to use
uint32_t. On 32-bit architectures it is easier to perform 32-bit atomic
operations. The size is large enough to hold the list of operations.

Fix the result of atomic_or_uint_nv being improperly cast to int *
instead of int.

Note: In general size_t should be preferred for sizes and indexes, due
to its descriptive name; however, it is more convenient to use uint32_t
for consistency between platforms and atomic calls.

READER_COUNT and ID_VAL return results that fit in 32 bits. Cast them
to uint32_t to save a few CPU cycles, since they are used in 32-bit
operations anyway.

TODO: In struct rcu_lock_st, qp_group can be moved before id_ctr for
better alignment, which would save 8 bytes.

allocate_new_qp_group has a parameter count of type int. Signed values
should be avoided as sizes or indexes; it is better to use an unsigned
type, e.g. uint32_t, even though internally this is assigned to a
uint32_t variable.

READER_SIZE is 16 in threads_pthread.c and 32 in threads_win.c.
Using a common size for consistency should be preferred.

Signed-off-by: Georgi Valkov
Reviewed-by: Neil Horman
Reviewed-by: Tomas Mraz
(Merged from https://github.com/openssl/openssl/pull/24803)
---

diff --git a/crypto/threads_pthread.c b/crypto/threads_pthread.c
index 7b545999478..14b1dbf6c20 100644
--- a/crypto/threads_pthread.c
+++ b/crypto/threads_pthread.c
@@ -114,6 +114,7 @@ static inline void *apple_atomic_load_n_pvoid(void **p,
 }
 
 /* For uint64_t, we should be fine, though */
+# define apple_atomic_load_n_uint32_t(p, o) __atomic_load_n(p, o)
 # define apple_atomic_load_n_uint64_t(p, o) __atomic_load_n(p, o)
 
 # define ATOMIC_LOAD_N(t, p, o) apple_atomic_load_n_##t(p, o)
@@ -141,6 +142,7 @@ static pthread_mutex_t atomic_sim_lock = PTHREAD_MUTEX_INITIALIZER;
         pthread_mutex_unlock(&atomic_sim_lock); \
         return ret; \
     }
+IMPL_fallback_atomic_load_n(uint32_t)
 IMPL_fallback_atomic_load_n(uint64_t)
 IMPL_fallback_atomic_load_n(pvoid)
 
@@ -157,6 +159,7 @@ IMPL_fallback_atomic_load_n(pvoid)
         pthread_mutex_unlock(&atomic_sim_lock); \
         return ret; \
     }
+IMPL_fallback_atomic_store_n(uint32_t)
 IMPL_fallback_atomic_store_n(uint64_t)
 
 # define ATOMIC_STORE_N(t, p, v, o) fallback_atomic_store_n_##t(p, v)
@@ -265,17 +268,19 @@ static ossl_inline uint64_t fallback_atomic_or_fetch(uint64_t *p, uint64_t m)
 
 /*
  * users is broken up into 2 parts
  * bits 0-15 current readers
- * bit 32-63 - ID
+ * bit 32-63 ID
  */
 # define READER_SHIFT 0
 # define ID_SHIFT 32
+/* TODO: READER_SIZE 32 in threads_win.c */
 # define READER_SIZE 16
 # define ID_SIZE 32
 
 # define READER_MASK (((uint64_t)1 << READER_SIZE) - 1)
 # define ID_MASK (((uint64_t)1 << ID_SIZE) - 1)
-# define READER_COUNT(x) (((uint64_t)(x) >> READER_SHIFT) & READER_MASK)
-# define ID_VAL(x) (((uint64_t)(x) >> ID_SHIFT) & ID_MASK)
+# define READER_COUNT(x) ((uint32_t)(((uint64_t)(x) >> READER_SHIFT) & \
+                                     READER_MASK))
+# define ID_VAL(x) ((uint32_t)(((uint64_t)(x) >> ID_SHIFT) & ID_MASK))
 # define VAL_READER 
((uint64_t)1 << READER_SHIFT) # define VAL_ID(x) ((uint64_t)x << ID_SHIFT) @@ -322,20 +327,21 @@ struct rcu_lock_st { /* rcu generation counter for in-order retirement */ uint32_t id_ctr; + /* TODO: can be moved before id_ctr for better alignment */ /* Array of quiescent points for synchronization */ struct rcu_qp *qp_group; /* Number of elements in qp_group array */ - size_t group_count; + uint32_t group_count; /* Index of the current qp in the qp_group array */ - uint64_t reader_idx; + uint32_t reader_idx; /* value of the next id_ctr value to be retired */ uint32_t next_to_retire; /* index of the next free rcu_qp in the qp_group */ - uint64_t current_alloc_idx; + uint32_t current_alloc_idx; /* number of qp's in qp_group array currently being retired */ uint32_t writers_alloced; @@ -359,7 +365,7 @@ struct rcu_lock_st { /* Read side acquisition of the current qp */ static struct rcu_qp *get_hold_current_qp(struct rcu_lock_st *lock) { - uint64_t qp_idx; + uint32_t qp_idx; /* get the current qp index */ for (;;) { @@ -375,7 +381,7 @@ static struct rcu_qp *get_hold_current_qp(struct rcu_lock_st *lock) * systems like x86, but is relevant on other arches * Note: This applies to the reload below as well */ - qp_idx = ATOMIC_LOAD_N(uint64_t, &lock->reader_idx, __ATOMIC_ACQUIRE); + qp_idx = ATOMIC_LOAD_N(uint32_t, &lock->reader_idx, __ATOMIC_ACQUIRE); /* * Notes of use of __ATOMIC_RELEASE @@ -388,7 +394,7 @@ static struct rcu_qp *get_hold_current_qp(struct rcu_lock_st *lock) __ATOMIC_RELEASE); /* if the idx hasn't changed, we're good, else try again */ - if (qp_idx == ATOMIC_LOAD_N(uint64_t, &lock->reader_idx, __ATOMIC_ACQUIRE)) + if (qp_idx == ATOMIC_LOAD_N(uint32_t, &lock->reader_idx, __ATOMIC_ACQUIRE)) break; /* @@ -494,7 +500,7 @@ void ossl_rcu_read_unlock(CRYPTO_RCU_LOCK *lock) static struct rcu_qp *update_qp(CRYPTO_RCU_LOCK *lock) { uint64_t new_id; - uint64_t current_idx; + uint32_t current_idx; pthread_mutex_lock(&lock->alloc_lock); @@ -517,10 +523,9 @@ static struct rcu_qp *update_qp(CRYPTO_RCU_LOCK *lock) (lock->current_alloc_idx + 1) % lock->group_count; /* get and insert a new id */ - new_id = lock->id_ctr; + new_id = VAL_ID(lock->id_ctr); lock->id_ctr++; - new_id = VAL_ID(new_id); /* * Even though we are under a write side lock here * We need to use atomic instructions to ensure that the results @@ -538,7 +543,7 @@ static struct rcu_qp *update_qp(CRYPTO_RCU_LOCK *lock) * of __ATOMIC_ACQUIRE in get_hold_current_qp, as we want any publication * of this value to be seen on the read side immediately after it happens */ - ATOMIC_STORE_N(uint64_t, &lock->reader_idx, lock->current_alloc_idx, + ATOMIC_STORE_N(uint32_t, &lock->reader_idx, lock->current_alloc_idx, __ATOMIC_RELEASE); /* wake up any waiters */ @@ -555,6 +560,8 @@ static void retire_qp(CRYPTO_RCU_LOCK *lock, struct rcu_qp *qp) pthread_mutex_unlock(&lock->alloc_lock); } +/* TODO: count should be unsigned, e.g uint32_t */ +/* a negative value could result in unexpected behaviour */ static struct rcu_qp *allocate_new_qp_group(CRYPTO_RCU_LOCK *lock, int count) { @@ -1006,7 +1013,7 @@ int CRYPTO_atomic_load_int(int *val, int *ret, CRYPTO_RWLOCK *lock) # elif defined(__sun) && (defined(__SunOS_5_10) || defined(__SunOS_5_11)) /* This will work for all future Solaris versions. 
*/ if (ret != NULL) { - *ret = (int *)atomic_or_uint_nv((unsigned int *)val, 0); + *ret = (int)atomic_or_uint_nv((unsigned int *)val, 0); return 1; } # endif diff --git a/crypto/threads_win.c b/crypto/threads_win.c index 4d7a1d9d1e2..0196235a13e 100644 --- a/crypto/threads_win.c +++ b/crypto/threads_win.c @@ -43,17 +43,24 @@ typedef struct { } CRYPTO_win_rwlock; # endif +/* + * users is broken up into 2 parts + * bits 0-31 current readers + * bit 32-63 ID + */ # define READER_SHIFT 0 -# define ID_SHIFT 32 -# define READER_SIZE 32 -# define ID_SIZE 32 - -# define READER_MASK (((LONG64)1 << READER_SIZE)-1) -# define ID_MASK (((LONG64)1 << ID_SIZE)-1) -# define READER_COUNT(x) (((LONG64)(x) >> READER_SHIFT) & READER_MASK) -# define ID_VAL(x) (((LONG64)(x) >> ID_SHIFT) & ID_MASK) -# define VAL_READER ((LONG64)1 << READER_SHIFT) -# define VAL_ID(x) ((LONG64)x << ID_SHIFT) +# define ID_SHIFT 32 +/* TODO: READER_SIZE 16 in threads_pthread.c */ +# define READER_SIZE 32 +# define ID_SIZE 32 + +# define READER_MASK (((uint64_t)1 << READER_SIZE) - 1) +# define ID_MASK (((uint64_t)1 << ID_SIZE) - 1) +# define READER_COUNT(x) ((uint32_t)(((uint64_t)(x) >> READER_SHIFT) & \ + READER_MASK)) +# define ID_VAL(x) ((uint32_t)(((uint64_t)(x) >> ID_SHIFT) & ID_MASK)) +# define VAL_READER ((int64_t)1 << READER_SHIFT) +# define VAL_ID(x) ((uint64_t)x << ID_SHIFT) /* * This defines a quescent point (qp) @@ -62,7 +69,7 @@ typedef struct { * atomically updated */ struct rcu_qp { - volatile LONG64 users; + volatile uint64_t users; }; struct thread_qp { @@ -89,23 +96,55 @@ struct rcu_thr_data { * it is cast from CRYPTO_RCU_LOCK */ struct rcu_lock_st { + /* Callbacks to call for next ossl_synchronize_rcu */ struct rcu_cb_item *cb_items; + + /* The context we are being created against */ OSSL_LIB_CTX *ctx; + + /* rcu generation counter for in-order retirement */ uint32_t id_ctr; + + /* TODO: can be moved before id_ctr for better alignment */ + /* Array of quiescent points for synchronization */ struct rcu_qp *qp_group; - size_t group_count; - uint32_t next_to_retire; + + /* Number of elements in qp_group array */ + uint32_t group_count; + + /* Index of the current qp in the qp_group array */ uint32_t reader_idx; + + /* value of the next id_ctr value to be retired */ + uint32_t next_to_retire; + + /* index of the next free rcu_qp in the qp_group */ uint32_t current_alloc_idx; + + /* number of qp's in qp_group array currently being retired */ uint32_t writers_alloced; + + /* lock protecting write side operations */ CRYPTO_MUTEX *write_lock; + + /* lock protecting updates to writers_alloced/current_alloc_idx */ CRYPTO_MUTEX *alloc_lock; + + /* signal to wake threads waiting on alloc_lock */ CRYPTO_CONDVAR *alloc_signal; + + /* lock to enforce in-order retirement */ CRYPTO_MUTEX *prior_lock; + + /* signal to wake threads waiting on prior_lock */ CRYPTO_CONDVAR *prior_signal; + + /* lock used with NO_INTERLOCKEDOR64: VS2010 x86 */ CRYPTO_RWLOCK *rw_lock; }; +/* TODO: count should be unsigned, e.g uint32_t */ +/* a negative value could result in unexpected behaviour */ static struct rcu_qp *allocate_new_qp_group(struct rcu_lock_st *lock, int count) { @@ -173,6 +212,7 @@ void ossl_rcu_lock_free(CRYPTO_RCU_LOCK *lock) OPENSSL_free(lock); } +/* Read side acquisition of the current qp */ static ossl_inline struct rcu_qp *get_hold_current_qp(CRYPTO_RCU_LOCK *lock) { uint32_t qp_idx; @@ -181,11 +221,12 @@ static ossl_inline struct rcu_qp *get_hold_current_qp(CRYPTO_RCU_LOCK *lock) /* get the current qp index */ for (;;) { - 
CRYPTO_atomic_load_int(&lock->reader_idx, (int *)&qp_idx, + CRYPTO_atomic_load_int((int *)&lock->reader_idx, (int *)&qp_idx, lock->rw_lock); CRYPTO_atomic_add64(&lock->qp_group[qp_idx].users, VAL_READER, &tmp64, lock->rw_lock); - CRYPTO_atomic_load_int(&lock->reader_idx, (int *)&tmp, lock->rw_lock); + CRYPTO_atomic_load_int((int *)&lock->reader_idx, (int *)&tmp, + lock->rw_lock); if (qp_idx == tmp) break; CRYPTO_atomic_add64(&lock->qp_group[qp_idx].users, -VAL_READER, &tmp64, @@ -277,6 +318,10 @@ void ossl_rcu_read_unlock(CRYPTO_RCU_LOCK *lock) } } +/* + * Write side allocation routine to get the current qp + * and replace it with a new one + */ static struct rcu_qp *update_qp(CRYPTO_RCU_LOCK *lock) { uint64_t new_id; @@ -291,9 +336,11 @@ static struct rcu_qp *update_qp(CRYPTO_RCU_LOCK *lock) * one that isn't yet being waited on */ while (lock->group_count - lock->writers_alloced < 2) + /* we have to wait for one to be free */ ossl_crypto_condvar_wait(lock->alloc_signal, lock->alloc_lock); current_idx = lock->current_alloc_idx; + /* Allocate the qp */ lock->writers_alloced++; @@ -302,10 +349,15 @@ static struct rcu_qp *update_qp(CRYPTO_RCU_LOCK *lock) (lock->current_alloc_idx + 1) % lock->group_count; /* get and insert a new id */ - new_id = lock->id_ctr; + new_id = VAL_ID(lock->id_ctr); lock->id_ctr++; - new_id = VAL_ID(new_id); + /* + * Even though we are under a write side lock here + * We need to use atomic instructions to ensure that the results + * of this update are published to the read side prior to updating the + * reader idx below + */ CRYPTO_atomic_and(&lock->qp_group[current_idx].users, ID_MASK, &tmp64, lock->rw_lock); CRYPTO_atomic_add64(&lock->qp_group[current_idx].users, new_id, &tmp64,
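
To make the reader/ID packing behind these macros concrete, here is a small
standalone sketch of how the 64-bit users word is interpreted after this
change. It reuses the macro values from threads_pthread.c above
(READER_SIZE 16); the main() harness and the sample generation id 7 are
illustrative only and are not part of the patch.

/*
 * Standalone illustration (not part of the patch): how the 64-bit
 * "users" word packs a 16-bit reader count and a 32-bit generation ID,
 * and why READER_COUNT()/ID_VAL() can safely return uint32_t.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define READER_SHIFT 0
#define ID_SHIFT     32
#define READER_SIZE  16              /* 32 in threads_win.c */
#define ID_SIZE      32

#define READER_MASK  (((uint64_t)1 << READER_SIZE) - 1)
#define ID_MASK      (((uint64_t)1 << ID_SIZE) - 1)
/* both results fit in 32 bits, so cast them down as the patch does */
#define READER_COUNT(x) ((uint32_t)(((uint64_t)(x) >> READER_SHIFT) & READER_MASK))
#define ID_VAL(x)       ((uint32_t)(((uint64_t)(x) >> ID_SHIFT) & ID_MASK))
#define VAL_READER      ((uint64_t)1 << READER_SHIFT)
#define VAL_ID(x)       ((uint64_t)(x) << ID_SHIFT)

int main(void)
{
    uint64_t users = 0;

    users += VAL_ID(7);    /* writer installs generation id 7 (sample value) */
    users += VAL_READER;   /* first reader enters the quiescent point */
    users += VAL_READER;   /* second reader enters */

    /* prints: id=7 readers=2 */
    printf("id=%" PRIu32 " readers=%" PRIu32 "\n",
           ID_VAL(users), READER_COUNT(users));

    users -= VAL_READER;   /* one reader leaves */

    /* prints: id=7 readers=1 */
    printf("id=%" PRIu32 " readers=%" PRIu32 "\n",
           ID_VAL(users), READER_COUNT(users));
    return 0;
}

Because both extracted fields fit in 32 bits, the uint32_t casts keep later
comparisons and index arithmetic in 32-bit values, which is what the commit
message means by saving a few CPU cycles on 32-bit targets.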