From: Volker Lendecke
Date: Tue, 19 Nov 2019 16:29:18 +0000 (+0100)
Subject: lib: Change the g_lock data model
X-Git-Tag: ldb-2.1.0~615
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=d734547488c6fab8d136239a95db640df57dbb97;p=thirdparty%2Fsamba.git

lib: Change the g_lock data model

Now we have one fixed field for the exclusive lock holder and an array
of shared locks. This way we now prioritize writers over readers: if a
pending writer comes in while readers are active, it puts itself into
the exclusive slot and then waits for the readers to vanish. Only when
all readers are gone is the exclusive lock request granted. New readers
just look at the exclusive slot, see that it is taken, and line up as
watchers, retrying whenever things change.

This also makes it cheaper to support many shared locks: granting a
shared lock just means extending the array; we don't have to walk the
array for possible conflicts.

This also adds explicit UPGRADE and DOWNGRADE operations for better
error checking.

Signed-off-by: Volker Lendecke
Reviewed-by: Jeremy Allison
---

diff --git a/source3/include/g_lock.h b/source3/include/g_lock.h index 5e12ffe4b95..54f60e410ba 100644 --- a/source3/include/g_lock.h +++ b/source3/include/g_lock.h @@ -26,8 +26,10 @@ struct g_lock_ctx; struct messaging_context; enum g_lock_type { - G_LOCK_READ = 0, - G_LOCK_WRITE = 1, + G_LOCK_READ, + G_LOCK_WRITE, + G_LOCK_UPGRADE, + G_LOCK_DOWNGRADE, }; struct g_lock_ctx *g_lock_ctx_init_backend( @@ -53,7 +55,8 @@ NTSTATUS g_lock_write_data(struct g_lock_ctx *ctx, TDB_DATA key, int g_lock_locks(struct g_lock_ctx *ctx, int (*fn)(TDB_DATA key, void *private_data), void *private_data); -NTSTATUS g_lock_dump(struct g_lock_ctx *ctx, TDB_DATA key, +NTSTATUS g_lock_dump(struct g_lock_ctx *ctx, + TDB_DATA key, void (*fn)(struct server_id exclusive, size_t num_shared, struct server_id *shared, diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c index ae551fd2b16..a05bb443b41 100644 --- a/source3/lib/g_lock.c +++ b/source3/lib/g_lock.c @@ -38,115 +38,107 @@ struct g_lock_ctx { struct messaging_context *msg; }; -struct g_lock_rec { - enum g_lock_type lock_type; - struct server_id pid; -}; - -/* - * The "g_lock.tdb" file contains records, indexed by the 0-terminated - * lockname. The record contains an array of "struct g_lock_rec" - * structures. 
- */ - -#define G_LOCK_REC_LENGTH (SERVER_ID_BUF_LENGTH+1) - -static void g_lock_rec_put(uint8_t buf[G_LOCK_REC_LENGTH], - const struct g_lock_rec rec) -{ - SCVAL(buf, 0, rec.lock_type); - server_id_put(buf+1, rec.pid); -} - -static void g_lock_rec_get(struct g_lock_rec *rec, - const uint8_t buf[G_LOCK_REC_LENGTH]) -{ - rec->lock_type = CVAL(buf, 0); - server_id_get(&rec->pid, buf+1); -} - struct g_lock { - uint8_t *recsbuf; - size_t num_recs; - uint8_t *data; + struct server_id exclusive; + size_t num_shared; + uint8_t *shared; size_t datalen; + uint8_t *data; }; static bool g_lock_parse(uint8_t *buf, size_t buflen, struct g_lock *lck) { - size_t found_recs, data_ofs; + struct server_id exclusive; + size_t num_shared, shared_len; - if (buflen < sizeof(uint32_t)) { - *lck = (struct g_lock) {0}; + if (buflen < (SERVER_ID_BUF_LENGTH + sizeof(uint32_t))) { + *lck = (struct g_lock) { .exclusive.pid = 0 }; return true; } - found_recs = IVAL(buf, 0); + server_id_get(&exclusive, buf); + buf += SERVER_ID_BUF_LENGTH; + buflen -= SERVER_ID_BUF_LENGTH; + num_shared = IVAL(buf, 0); buf += sizeof(uint32_t); buflen -= sizeof(uint32_t); - if (found_recs > buflen/G_LOCK_REC_LENGTH) { + + if (num_shared > buflen/SERVER_ID_BUF_LENGTH) { return false; } - data_ofs = found_recs * G_LOCK_REC_LENGTH; + shared_len = num_shared * SERVER_ID_BUF_LENGTH; *lck = (struct g_lock) { - .recsbuf = buf, .num_recs = found_recs, - .data = buf+data_ofs, .datalen = buflen-data_ofs + .exclusive = exclusive, + .num_shared = num_shared, + .shared = buf, + .datalen = buflen-shared_len, + .data = buf+shared_len, }; return true; } -static void g_lock_get_rec(const struct g_lock *lck, - size_t i, - struct g_lock_rec *rec) +static void g_lock_get_shared(const struct g_lock *lck, + size_t i, + struct server_id *shared) { - if (i >= lck->num_recs) { + if (i >= lck->num_shared) { abort(); } - g_lock_rec_get(rec, lck->recsbuf + i*G_LOCK_REC_LENGTH); + server_id_get(shared, lck->shared + i*SERVER_ID_BUF_LENGTH); } -static void g_lock_rec_del(struct g_lock *lck, size_t i) +static void g_lock_del_shared(struct g_lock *lck, size_t i) { - if (i >= lck->num_recs) { + if (i >= lck->num_shared) { abort(); } - lck->num_recs -= 1; - if (i < lck->num_recs) { - uint8_t *recptr = lck->recsbuf + i*G_LOCK_REC_LENGTH; - memcpy(recptr, lck->recsbuf + lck->num_recs*G_LOCK_REC_LENGTH, - G_LOCK_REC_LENGTH); + lck->num_shared -= 1; + if (i < lck->num_shared) { + memcpy(lck->shared + i*SERVER_ID_BUF_LENGTH, + lck->shared + lck->num_shared*SERVER_ID_BUF_LENGTH, + SERVER_ID_BUF_LENGTH); } } -static NTSTATUS g_lock_store(struct db_record *rec, struct g_lock *lck, - struct g_lock_rec *add) +static NTSTATUS g_lock_store( + struct db_record *rec, + struct g_lock *lck, + struct server_id *new_shared) { - uint8_t sizebuf[4]; - uint8_t addbuf[G_LOCK_REC_LENGTH]; + uint8_t exclusive[SERVER_ID_BUF_LENGTH]; + uint8_t sizebuf[sizeof(uint32_t)]; + uint8_t shared[SERVER_ID_BUF_LENGTH]; struct TDB_DATA dbufs[] = { + { .dptr = exclusive, .dsize = sizeof(exclusive) }, { .dptr = sizebuf, .dsize = sizeof(sizebuf) }, - { .dptr = lck->recsbuf, - .dsize = lck->num_recs * G_LOCK_REC_LENGTH }, + { .dptr = lck->shared, + .dsize = lck->num_shared * SERVER_ID_BUF_LENGTH }, { 0 }, { .dptr = lck->data, .dsize = lck->datalen } }; - if (add != NULL) { - g_lock_rec_put(addbuf, *add); + server_id_put(exclusive, lck->exclusive); + + if (new_shared != NULL) { + if (lck->num_shared >= UINT32_MAX) { + return NT_STATUS_BUFFER_OVERFLOW; + } + + server_id_put(shared, *new_shared); - dbufs[2] = 
(TDB_DATA) { - .dptr = addbuf, .dsize = G_LOCK_REC_LENGTH + dbufs[3] = (TDB_DATA) { + .dptr = shared, .dsize = sizeof(shared), }; - lck->num_recs += 1; + lck->num_shared += 1; } - SIVAL(sizebuf, 0, lck->num_recs); + SIVAL(sizebuf, 0, lck->num_shared); return dbwrap_record_storev(rec, dbufs, ARRAY_SIZE(dbufs), 0); } @@ -204,167 +196,310 @@ struct g_lock_ctx *g_lock_ctx_init(TALLOC_CTX *mem_ctx, return ctx; } -static bool g_lock_conflicts(enum g_lock_type l1, enum g_lock_type l2) +static NTSTATUS g_lock_cleanup_dead( + struct db_record *rec, + struct g_lock *lck, + struct server_id *dead_blocker) { - /* - * Only tested write locks so far. Very likely this routine - * needs to be fixed for read locks.... - */ - if ((l1 == G_LOCK_READ) && (l2 == G_LOCK_READ)) { - return false; + bool modified = false; + bool exclusive_died; + NTSTATUS status = NT_STATUS_OK; + struct server_id_buf tmp; + + if (dead_blocker == NULL) { + return NT_STATUS_OK; } - return true; + + exclusive_died = server_id_equal(dead_blocker, &lck->exclusive); + + if (exclusive_died) { + DBG_DEBUG("Exclusive holder %s died\n", + server_id_str_buf(lck->exclusive, &tmp)); + lck->exclusive.pid = 0; + modified = true; + } + + if (lck->num_shared != 0) { + bool shared_died; + struct server_id shared; + + g_lock_get_shared(lck, 0, &shared); + shared_died = server_id_equal(dead_blocker, &shared); + + if (shared_died) { + DBG_DEBUG("Shared holder %s died\n", + server_id_str_buf(shared, &tmp)); + g_lock_del_shared(lck, 0); + modified = true; + } + } + + if (modified) { + status = g_lock_store(rec, lck, NULL); + if (!NT_STATUS_IS_OK(status)) { + DBG_DEBUG("g_lock_store() failed: %s\n", + nt_errstr(status)); + } + } + + return status; +} + +static ssize_t g_lock_find_shared( + struct g_lock *lck, + const struct server_id *self) +{ + size_t i; + + for (i=0; inum_shared; i++) { + struct server_id shared; + bool same; + + g_lock_get_shared(lck, i, &shared); + + same = server_id_equal(self, &shared); + if (same) { + return i; + } + } + + return -1; } +struct g_lock_lock_state { + struct tevent_context *ev; + struct g_lock_ctx *ctx; + TDB_DATA key; + enum g_lock_type type; + bool retry; +}; + +struct g_lock_lock_fn_state { + struct g_lock_lock_state *req_state; + struct server_id *dead_blocker; + + struct tevent_req *watch_req; + NTSTATUS status; +}; + +static int g_lock_lock_state_destructor(struct g_lock_lock_state *s); + static NTSTATUS g_lock_trylock( struct db_record *rec, + struct g_lock_lock_fn_state *state, TDB_DATA data, - struct server_id self, - enum g_lock_type type, struct server_id *blocker) { - size_t i; - struct g_lock lck; - struct g_lock_rec mylock = {0}; + struct g_lock_lock_state *req_state = state->req_state; + struct server_id self = messaging_server_id(req_state->ctx->msg); + enum g_lock_type type = req_state->type; + bool retry = req_state->retry; + struct g_lock lck = { .exclusive.pid = 0 }; + struct server_id check; + struct server_id_buf tmp; NTSTATUS status; - bool modified = false; + size_t i; + bool exists; bool ok; ok = g_lock_parse(data.dptr, data.dsize, &lck); if (!ok) { + DBG_DEBUG("g_lock_parse failed\n"); return NT_STATUS_INTERNAL_DB_CORRUPTION; } - if ((type == G_LOCK_READ) && (lck.num_recs > 0)) { - struct g_lock_rec check_rec; + status = g_lock_cleanup_dead(rec, &lck, state->dead_blocker); + if (!NT_STATUS_IS_OK(status)) { + DBG_DEBUG("g_lock_cleanup_dead() failed: %s\n", + nt_errstr(status)); + return status; + } + + if (lck.exclusive.pid != 0) { + bool self_exclusive = server_id_equal(&self, 
&lck.exclusive); - /* - * Read locks can stay around forever if the process - * dies. Do a heuristic check for process existence: - * Check one random process for existence. Hopefully - * this will keep runaway read locks under control. - */ - i = generate_random() % lck.num_recs; + if (!self_exclusive) { + exists = serverid_exists(&lck.exclusive); + if (!exists) { + lck.exclusive = (struct server_id) { .pid=0 }; + goto noexclusive; + } - g_lock_get_rec(&lck, i, &check_rec); + DBG_DEBUG("%s has an exclusive lock\n", + server_id_str_buf(lck.exclusive, &tmp)); - if ((check_rec.lock_type == G_LOCK_READ) && - !serverid_exists(&check_rec.pid)) { - g_lock_rec_del(&lck, i); - modified = true; + if (type == G_LOCK_DOWNGRADE) { + struct server_id_buf tmp2; + DBG_DEBUG("%s: Trying to downgrade %s\n", + server_id_str_buf(self, &tmp), + server_id_str_buf( + lck.exclusive, &tmp2)); + return NT_STATUS_NOT_LOCKED; + } + + if (type == G_LOCK_UPGRADE) { + ssize_t shared_idx; + shared_idx = g_lock_find_shared(&lck, &self); + + if (shared_idx == -1) { + DBG_DEBUG("Trying to upgrade %s " + "without " + "existing shared lock\n", + server_id_str_buf( + self, &tmp)); + return NT_STATUS_NOT_LOCKED; + } + + /* + * We're trying to upgrade, and the + * exlusive lock is taken by someone + * else. This means that someone else + * is waiting for us to give up our + * shared lock. If we now also wait + * for someone to give their shared + * lock, we will deadlock. + */ + + DBG_DEBUG("Trying to upgrade %s while " + "someone else is also " + "trying to upgrade\n", + server_id_str_buf(self, &tmp)); + return NT_STATUS_POSSIBLE_DEADLOCK; + } + + *blocker = lck.exclusive; + return NT_STATUS_LOCK_NOT_GRANTED; } - } - /* - * For the lock upgrade/downgrade case, remove ourselves from - * the list. We re-add ourselves later after we checked the - * other entries for conflict. 
- */ + if (type == G_LOCK_DOWNGRADE) { + DBG_DEBUG("Downgrading %s from WRITE to READ\n", + server_id_str_buf(self, &tmp)); - for (i=0; istatus = g_lock_trylock( - rec, value, state->self, state->state->type, &blocker); + state->status = g_lock_trylock(rec, state, value, &blocker); if (!NT_STATUS_EQUAL(state->status, NT_STATUS_LOCK_NOT_GRANTED)) { return; } state->watch_req = dbwrap_watched_watch_send( - state->state, state->state->ev, rec, blocker); + state->req_state, state->req_state->ev, rec, blocker); + if (state->watch_req == NULL) { + state->status = NT_STATUS_NO_MEMORY; + } +} + +static int g_lock_lock_state_destructor(struct g_lock_lock_state *s) +{ + NTSTATUS status = g_lock_unlock(s->ctx, s->key); + if (!NT_STATUS_IS_OK(status)) { + DBG_DEBUG("g_lock_unlock failed: %s\n", nt_errstr(status)); + } + return 0; } +static void g_lock_lock_retry(struct tevent_req *subreq); + struct tevent_req *g_lock_lock_send(TALLOC_CTX *mem_ctx, struct tevent_context *ev, struct g_lock_ctx *ctx, @@ -394,6 +542,7 @@ struct tevent_req *g_lock_lock_send(TALLOC_CTX *mem_ctx, struct g_lock_lock_state *state; struct g_lock_lock_fn_state fn_state; NTSTATUS status; + bool ok; req = tevent_req_create(mem_ctx, &state, struct g_lock_lock_state); if (req == NULL) { @@ -405,7 +554,7 @@ struct tevent_req *g_lock_lock_send(TALLOC_CTX *mem_ctx, state->type = type; fn_state = (struct g_lock_lock_fn_state) { - .state = state, .self = messaging_server_id(ctx->msg) + .req_state = state, }; status = dbwrap_do_locked(ctx->db, key, g_lock_lock_fn, &fn_state); @@ -428,12 +577,16 @@ struct tevent_req *g_lock_lock_send(TALLOC_CTX *mem_ctx, return tevent_req_post(req, ev); } - if (!tevent_req_set_endtime( - fn_state.watch_req, state->ev, - timeval_current_ofs(5 + generate_random() % 5, 0))) { + ok = tevent_req_set_endtime( + fn_state.watch_req, + state->ev, + timeval_current_ofs(5 + generate_random() % 5, 0)); + if (!ok) { + tevent_req_oom(req); return tevent_req_post(req, ev); } tevent_req_set_callback(fn_state.watch_req, g_lock_lock_retry, req); + return req; } @@ -444,9 +597,11 @@ static void g_lock_lock_retry(struct tevent_req *subreq) struct g_lock_lock_state *state = tevent_req_data( req, struct g_lock_lock_state); struct g_lock_lock_fn_state fn_state; + struct server_id blocker; + bool blockerdead; NTSTATUS status; - status = dbwrap_watched_watch_recv(subreq, NULL, NULL); + status = dbwrap_watched_watch_recv(subreq, &blockerdead, &blocker); DBG_DEBUG("watch_recv returned %s\n", nt_errstr(status)); TALLOC_FREE(subreq); @@ -456,8 +611,11 @@ static void g_lock_lock_retry(struct tevent_req *subreq) return; } + state->retry = true; + fn_state = (struct g_lock_lock_fn_state) { - .state = state, .self = messaging_server_id(state->ctx->msg) + .req_state = state, + .dead_blocker = blockerdead ? 
&blocker : NULL, }; status = dbwrap_do_locked(state->ctx->db, state->key, @@ -525,7 +683,6 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, TDB_DATA key, } struct g_lock_unlock_state { - TDB_DATA key; struct server_id self; NTSTATUS status; }; @@ -536,45 +693,62 @@ static void g_lock_unlock_fn( void *private_data) { struct g_lock_unlock_state *state = private_data; + struct server_id_buf tmp; struct g_lock lck; size_t i; - bool ok; + bool ok, exclusive; ok = g_lock_parse(value.dptr, value.dsize, &lck); if (!ok) { - DBG_DEBUG("g_lock_parse for %s failed\n", - hex_encode_talloc(talloc_tos(), - state->key.dptr, - state->key.dsize)); - state->status = NT_STATUS_FILE_INVALID; + DBG_DEBUG("g_lock_parse() failed\n"); + state->status = NT_STATUS_INTERNAL_DB_CORRUPTION; return; } - for (i=0; iself, &lockrec.pid)) { + + exclusive = server_id_equal(&state->self, &lck.exclusive); + + for (i=0; iself, &shared)) { break; } } - if (i == lck.num_recs) { - DBG_DEBUG("Lock not found, num_rec=%zu\n", lck.num_recs); - state->status = NT_STATUS_NOT_FOUND; - return; - } - g_lock_rec_del(&lck, i); + if (i < lck.num_shared) { + if (exclusive) { + DBG_DEBUG("%s both exclusive and shared (%zu)\n", + server_id_str_buf(state->self, &tmp), + i); + state->status = NT_STATUS_INTERNAL_DB_CORRUPTION; + return; + } + g_lock_del_shared(&lck, i); + } else { + if (!exclusive) { + DBG_DEBUG("Lock %s not found, num_rec=%zu\n", + server_id_str_buf(state->self, &tmp), + lck.num_shared); + state->status = NT_STATUS_NOT_FOUND; + return; + } + lck.exclusive = (struct server_id) { .pid = 0 }; + } - if ((lck.num_recs == 0) && (lck.datalen == 0)) { + if ((lck.exclusive.pid == 0) && + (lck.num_shared == 0) && + (lck.datalen == 0)) { state->status = dbwrap_record_delete(rec); return; } + state->status = g_lock_store(rec, &lck, NULL); } NTSTATUS g_lock_unlock(struct g_lock_ctx *ctx, TDB_DATA key) { struct g_lock_unlock_state state = { - .self = messaging_server_id(ctx->msg), .key = key + .self = messaging_server_id(ctx->msg), }; NTSTATUS status; @@ -608,7 +782,7 @@ static void g_lock_write_data_fn( { struct g_lock_write_data_state *state = private_data; struct g_lock lck; - size_t i; + bool exclusive; bool ok; ok = g_lock_parse(value.dptr, value.dsize, &lck); @@ -620,15 +794,16 @@ static void g_lock_write_data_fn( state->status = NT_STATUS_INTERNAL_DB_CORRUPTION; return; } - for (i=0; iself, &lockrec.pid)) { - break; - } - } - if (i == lck.num_recs) { + + exclusive = server_id_equal(&state->self, &lck.exclusive); + + /* + * Make sure we're really exclusive. 
We are marked as + * exclusive when we are waiting for an exclusive lock + */ + exclusive &= (lck.num_shared == 0); + + if (!exclusive) { DBG_DEBUG("Not locked by us\n"); state->status = NT_STATUS_NOT_LOCKED; return; @@ -713,9 +888,7 @@ static void g_lock_dump_fn(TDB_DATA key, TDB_DATA data, void *private_data) { struct g_lock_dump_state *state = private_data; - struct g_lock_rec *recs = NULL; - struct g_lock lck; - struct server_id exclusive = { .pid = 0 }; + struct g_lock lck = (struct g_lock) { .exclusive.pid = 0 }; struct server_id *shared = NULL; size_t i; bool ok; @@ -730,50 +903,28 @@ static void g_lock_dump_fn(TDB_DATA key, TDB_DATA data, return; } - recs = talloc_array(state->mem_ctx, struct g_lock_rec, lck.num_recs); - if (recs == NULL) { + shared = talloc_array( + state->mem_ctx, struct server_id, lck.num_shared); + if (shared == NULL) { DBG_DEBUG("talloc failed\n"); state->status = NT_STATUS_NO_MEMORY; - goto fail; - } - - for (i=0; istatus = NT_STATUS_NO_MEMORY; - goto fail; - } - for (i=0; istatus = - NT_STATUS_INTERNAL_DB_CORRUPTION; - goto fail; - } - shared[i] = recs[i].pid; - } + for (i=0; ifn(exclusive, - lck.num_recs, + state->fn(lck.exclusive, + lck.num_shared, shared, lck.data, lck.datalen, state->private_data); + TALLOC_FREE(shared); + state->status = NT_STATUS_OK; -fail: - TALLOC_FREE(recs); } NTSTATUS g_lock_dump(struct g_lock_ctx *ctx, TDB_DATA key, diff --git a/source3/smbd/server.c b/source3/smbd/server.c index 20f34bc401b..a8e69e0f0a1 100644 --- a/source3/smbd/server.c +++ b/source3/smbd/server.c @@ -1483,7 +1483,7 @@ static NTSTATUS smbd_claim_version(struct messaging_context *msg, return NT_STATUS_OK; } - status = g_lock_lock(ctx, string_term_tdb_data(name), G_LOCK_WRITE, + status = g_lock_lock(ctx, string_term_tdb_data(name), G_LOCK_UPGRADE, (struct timeval) { .tv_sec = 60 }); if (!NT_STATUS_IS_OK(status)) { DBG_WARNING("g_lock_lock(G_LOCK_WRITE) failed: %s\n", diff --git a/source3/torture/test_g_lock.c b/source3/torture/test_g_lock.c index e00432c9915..91336352fcc 100644 --- a/source3/torture/test_g_lock.c +++ b/source3/torture/test_g_lock.c @@ -301,7 +301,7 @@ bool run_g_lock3(int dummy) goto fail; } - status = g_lock_lock(ctx, string_term_tdb_data(lockname), G_LOCK_WRITE, + status = g_lock_lock(ctx, string_term_tdb_data(lockname), G_LOCK_UPGRADE, (struct timeval) { .tv_sec = 1 }); if (!NT_STATUS_IS_OK(status)) { fprintf(stderr, "g_lock_lock returned %s\n", @@ -895,7 +895,7 @@ bool run_g_lock6(int dummy) } } - status = g_lock_lock(ctx, lockname, G_LOCK_WRITE, + status = g_lock_lock(ctx, lockname, G_LOCK_UPGRADE, (struct timeval) { .tv_sec = 1 }); if (!NT_STATUS_IS_OK(status)) { fprintf(stderr, "g_lock_lock failed: %s\n",
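
As an illustrative sketch (not part of the patch), a caller might exercise the new lock types roughly as follows, modelled on the g_lock_lock()/g_lock_unlock() calls visible in the server.c and test_g_lock.c hunks above. The key name, the timeouts, the include list and the exact g_lock_ctx_init() signature are assumptions; error handling is reduced to early returns.

#include "includes.h"
#include "messages.h"
#include "g_lock.h"

/*
 * Sketch: take a shared lock, upgrade it to exclusive, downgrade it
 * again, then drop it.  Names and headers are illustrative only.
 */
static NTSTATUS g_lock_upgrade_cycle_example(TALLOC_CTX *mem_ctx,
                                             struct messaging_context *msg)
{
        struct g_lock_ctx *ctx = NULL;
        TDB_DATA key = string_term_tdb_data("example-lock");
        struct timeval timeout = { .tv_sec = 1 };
        NTSTATUS status;

        ctx = g_lock_ctx_init(mem_ctx, msg);    /* assumed signature */
        if (ctx == NULL) {
                return NT_STATUS_NO_MEMORY;
        }

        /* A reader only appends itself to the shared array. */
        status = g_lock_lock(ctx, key, G_LOCK_READ, timeout);
        if (!NT_STATUS_IS_OK(status)) {
                return status;
        }

        /*
         * Upgrade: we must already hold a shared lock, otherwise
         * NT_STATUS_NOT_LOCKED is returned.  We move into the
         * exclusive slot and wait for the remaining readers to drain;
         * two concurrent upgraders get NT_STATUS_POSSIBLE_DEADLOCK.
         */
        status = g_lock_lock(ctx, key, G_LOCK_UPGRADE, timeout);
        if (!NT_STATUS_IS_OK(status)) {
                g_lock_unlock(ctx, key);
                return status;
        }

        /* Downgrade from the exclusive slot back to a shared entry. */
        status = g_lock_lock(ctx, key, G_LOCK_DOWNGRADE, timeout);
        if (!NT_STATUS_IS_OK(status)) {
                g_lock_unlock(ctx, key);
                return status;
        }

        return g_lock_unlock(ctx, key);
}

This mirrors the source3/smbd/server.c and source3/torture/test_g_lock.c hunks above, which replace G_LOCK_WRITE with the explicit G_LOCK_UPGRADE where a lock is being promoted rather than freshly acquired.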