]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Modernize and optimize pg_buffercache_pages()
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 7 Apr 2026 13:04:48 +0000 (16:04 +0300)
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 7 Apr 2026 13:04:48 +0000 (16:04 +0300)
Refactor pg_buffercache_pages() to use SFRM_Materialize mode and
construct a tuplestore directly. That's simpler and more efficient
than collecting all the data to a custom array first.

Author: Ashutosh Bapat <ashutosh.bapat.oss@gmail.com>
Author: Palak Chaturvedi <chaturvedipalak1911@gmail.com>
Discussion: https://www.postgresql.org/message-id/CAExHW5sMsaz1j+hrdhyo-DJp7JCgJx87=q2iJfOc_9mwYWyvmw@mail.gmail.com

contrib/pg_buffercache/pg_buffercache_pages.c

index db4d711cce7d66e11725feb4d684cb3dd85183c5..1ec2cf0e6f468049066903ee26cc852ad1c22151 100644 (file)
@@ -37,39 +37,6 @@ PG_MODULE_MAGIC_EXT(
                                        .version = PG_VERSION
 );
 
-/*
- * Record structure holding the to be exposed cache data.
- */
-typedef struct
-{
-       uint32          bufferid;
-       RelFileNumber relfilenumber;
-       Oid                     reltablespace;
-       Oid                     reldatabase;
-       ForkNumber      forknum;
-       BlockNumber blocknum;
-       bool            isvalid;
-       bool            isdirty;
-       uint16          usagecount;
-
-       /*
-        * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
-        * being pinned by too many backends and each backend will only pin once
-        * because of bufmgr.c's PrivateRefCount infrastructure.
-        */
-       int32           pinning_backends;
-} BufferCachePagesRec;
-
-
-/*
- * Function context for data persisting over repeated calls.
- */
-typedef struct
-{
-       TupleDesc       tupdesc;
-       BufferCachePagesRec *record;
-} BufferCachePagesContext;
-
 /*
  * Record structure holding the to be exposed cache data for OS pages.  This
  * structure is used by pg_buffercache_os_pages(), where NUMA information may
@@ -117,142 +84,89 @@ static bool firstNumaTouch = true;
 Datum
 pg_buffercache_pages(PG_FUNCTION_ARGS)
 {
-       FuncCallContext *funcctx;
-       Datum           result;
-       MemoryContext oldcontext;
-       BufferCachePagesContext *fctx;  /* User function context. */
-       TupleDesc       tupledesc;
+       ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
        TupleDesc       expected_tupledesc;
-       HeapTuple       tuple;
-
-       if (SRF_IS_FIRSTCALL())
-       {
-               int                     i;
-
-               funcctx = SRF_FIRSTCALL_INIT();
-
-               /* Switch context when allocating stuff to be used in later calls */
-               oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
-
-               /* Create a user function context for cross-call persistence */
-               fctx = palloc_object(BufferCachePagesContext);
-
-               /*
-                * To smoothly support upgrades from version 1.0 of this extension
-                * transparently handle the (non-)existence of the pinning_backends
-                * column. We unfortunately have to get the result type for that... -
-                * we can't use the result type determined by the function definition
-                * without potentially crashing when somebody uses the old (or even
-                * wrong) function definition though.
-                */
-               if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
-                       elog(ERROR, "return type must be a row type");
-
-               if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
-                       expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
-                       elog(ERROR, "incorrect number of output arguments");
-
-               /* Construct a tuple descriptor for the result rows. */
-               tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
-               TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
-                                                  INT4OID, -1, 0);
-               TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
-                                                  OIDOID, -1, 0);
-               TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
-                                                  OIDOID, -1, 0);
-               TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
-                                                  OIDOID, -1, 0);
-               TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
-                                                  INT2OID, -1, 0);
-               TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
-                                                  INT8OID, -1, 0);
-               TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
-                                                  BOOLOID, -1, 0);
-               TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
-                                                  INT2OID, -1, 0);
-
-               if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
-                       TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
-                                                          INT4OID, -1, 0);
-
-               TupleDescFinalize(tupledesc);
-               fctx->tupdesc = BlessTupleDesc(tupledesc);
-
-               /* Allocate NBuffers worth of BufferCachePagesRec records. */
-               fctx->record = (BufferCachePagesRec *)
-                       MemoryContextAllocHuge(CurrentMemoryContext,
-                                                                  sizeof(BufferCachePagesRec) * NBuffers);
-
-               /* Set max calls and remember the user function context. */
-               funcctx->max_calls = NBuffers;
-               funcctx->user_fctx = fctx;
-
-               /* Return to original context when allocating transient memory */
-               MemoryContextSwitchTo(oldcontext);
+       int                     i;
 
-               /*
-                * Scan through all the buffers, saving the relevant fields in the
-                * fctx->record structure.
-                *
-                * We don't hold the partition locks, so we don't get a consistent
-                * snapshot across all buffers, but we do grab the buffer header
-                * locks, so the information of each buffer is self-consistent.
-                */
-               for (i = 0; i < NBuffers; i++)
-               {
-                       BufferDesc *bufHdr;
-                       uint64          buf_state;
-
-                       CHECK_FOR_INTERRUPTS();
+       /*
+        * To smoothly support upgrades from version 1.0 of this extension
+        * transparently handle the (non-)existence of the pinning_backends
+        * column. We unfortunately have to get the result type for that... - we
+        * can't use the result type determined by the function definition without
+        * potentially crashing when somebody uses the old (or even wrong)
+        * function definition though.
+        */
+       if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+               elog(ERROR, "return type must be a row type");
 
-                       bufHdr = GetBufferDescriptor(i);
-                       /* Lock each buffer header before inspecting. */
-                       buf_state = LockBufHdr(bufHdr);
+       if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
+               expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
+               elog(ERROR, "incorrect number of output arguments");
 
-                       fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
-                       fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
-                       fctx->record[i].reltablespace = bufHdr->tag.spcOid;
-                       fctx->record[i].reldatabase = bufHdr->tag.dbOid;
-                       fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
-                       fctx->record[i].blocknum = bufHdr->tag.blockNum;
-                       fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
-                       fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+       InitMaterializedSRF(fcinfo, 0);
 
-                       if (buf_state & BM_DIRTY)
-                               fctx->record[i].isdirty = true;
-                       else
-                               fctx->record[i].isdirty = false;
+       /*
+        * Scan through all the buffers, adding one row for each of the buffers to
+        * the tuplestore.
+        *
+        * We don't hold the partition locks, so we don't get a consistent
+        * snapshot across all buffers, but we do grab the buffer header locks, so
+        * the information of each buffer is self-consistent.
+        */
+       for (i = 0; i < NBuffers; i++)
+       {
+               BufferDesc *bufHdr;
+               uint64          buf_state;
+               uint32          bufferid;
+               RelFileNumber relfilenumber;
+               Oid                     reltablespace;
+               Oid                     reldatabase;
+               ForkNumber      forknum;
+               BlockNumber blocknum;
+               bool            isvalid;
+               bool            isdirty;
+               uint16          usagecount;
+               int32           pinning_backends;
+               Datum           values[NUM_BUFFERCACHE_PAGES_ELEM];
+               bool            nulls[NUM_BUFFERCACHE_PAGES_ELEM];
 
-                       /* Note if the buffer is valid, and has storage created */
-                       if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
-                               fctx->record[i].isvalid = true;
-                       else
-                               fctx->record[i].isvalid = false;
+               CHECK_FOR_INTERRUPTS();
 
-                       UnlockBufHdr(bufHdr);
-               }
-       }
+               bufHdr = GetBufferDescriptor(i);
+               /* Lock each buffer header before inspecting. */
+               buf_state = LockBufHdr(bufHdr);
+
+               bufferid = BufferDescriptorGetBuffer(bufHdr);
+               relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
+               reltablespace = bufHdr->tag.spcOid;
+               reldatabase = bufHdr->tag.dbOid;
+               forknum = BufTagGetForkNum(&bufHdr->tag);
+               blocknum = bufHdr->tag.blockNum;
+               usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
+               pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
 
-       funcctx = SRF_PERCALL_SETUP();
+               if (buf_state & BM_DIRTY)
+                       isdirty = true;
+               else
+                       isdirty = false;
 
-       /* Get the saved state */
-       fctx = funcctx->user_fctx;
+               /* Note if the buffer is valid, and has storage created */
+               if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
+                       isvalid = true;
+               else
+                       isvalid = false;
 
-       if (funcctx->call_cntr < funcctx->max_calls)
-       {
-               uint32          i = funcctx->call_cntr;
-               Datum           values[NUM_BUFFERCACHE_PAGES_ELEM];
-               bool            nulls[NUM_BUFFERCACHE_PAGES_ELEM];
+               UnlockBufHdr(bufHdr);
 
-               values[0] = Int32GetDatum(fctx->record[i].bufferid);
+               /* Build the tuple and add it to tuplestore */
+               values[0] = Int32GetDatum(bufferid);
                nulls[0] = false;
 
                /*
                 * Set all fields except the bufferid to null if the buffer is unused
                 * or not valid.
                 */
-               if (fctx->record[i].blocknum == InvalidBlockNumber ||
-                       fctx->record[i].isvalid == false)
+               if (blocknum == InvalidBlockNumber || isvalid == false)
                {
                        nulls[1] = true;
                        nulls[2] = true;
@@ -266,33 +180,29 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
                }
                else
                {
-                       values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
+                       values[1] = ObjectIdGetDatum(relfilenumber);
                        nulls[1] = false;
-                       values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
+                       values[2] = ObjectIdGetDatum(reltablespace);
                        nulls[2] = false;
-                       values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
+                       values[3] = ObjectIdGetDatum(reldatabase);
                        nulls[3] = false;
-                       values[4] = Int16GetDatum(fctx->record[i].forknum);
+                       values[4] = Int16GetDatum(forknum);
                        nulls[4] = false;
-                       values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
+                       values[5] = Int64GetDatum((int64) blocknum);
                        nulls[5] = false;
-                       values[6] = BoolGetDatum(fctx->record[i].isdirty);
+                       values[6] = BoolGetDatum(isdirty);
                        nulls[6] = false;
-                       values[7] = UInt16GetDatum(fctx->record[i].usagecount);
+                       values[7] = UInt16GetDatum(usagecount);
                        nulls[7] = false;
                        /* unused for v1.0 callers, but the array is always long enough */
-                       values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
+                       values[8] = Int32GetDatum(pinning_backends);
                        nulls[8] = false;
                }
 
-               /* Build and return the tuple. */
-               tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
-               result = HeapTupleGetDatum(tuple);
-
-               SRF_RETURN_NEXT(funcctx, result);
+               tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
        }
-       else
-               SRF_RETURN_DONE(funcctx);
+
+       return (Datum) 0;
 }
 
 /*