hash: Refactor overflow page allocation.
author	Robert Haas <rhaas@postgresql.org>
Mon, 27 Feb 2017 17:26:34 +0000 (22:56 +0530)
committer	Robert Haas <rhaas@postgresql.org>
Mon, 27 Feb 2017 17:29:55 +0000 (22:59 +0530)
As with commit b0f18cb77f50a54e997d857d592f6a511617f52c, the goal
here is to move all of the related page modifications to a single
section of code, in preparation for adding write-ahead logging.

Amit Kapila, with slight changes by me.  The larger patch series
of which this is a part has been reviewed and tested by Álvaro
Herrera, Ashutosh Sharma, Mark Kirkwood, Jeff Janes, and Jesper
Pedersen, all of whom should also have been credited in the
previous commit message.
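
The net effect in _hash_addovflpage() is that all buffer modifications now
happen in one contiguous block, under the same set of locks.  As a reading
aid only, a condensed sketch of that block, simplified from the diff below
(locking, the bitmap search, and error handling are omitted):

    /* all relevant buffers are pinned and exclusively locked at this point */
    if (page_found)
    {
        /* recycle a previously freed page: mark its bit "in use" */
        SETBIT(freep, bitmap_page_bit);
        MarkBufferDirty(mapbuf);
    }
    else
    {
        /* brand-new page: bump the spare-page count in the metapage */
        metap->hashm_spares[splitnum]++;
        if (BufferIsValid(newmapbuf))
        {
            /* also set up and register the new bitmap page */
            _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
            MarkBufferDirty(newmapbuf);
            metap->hashm_mapp[metap->hashm_nmaps++] = BufferGetBlockNumber(newmapbuf);
            metap->hashm_spares[splitnum]++;
            MarkBufferDirty(metabuf);
        }
    }
    /* ... then adjust hashm_firstfree, initialize the overflow page, and  */
    /* chain it after the tail page 'buf', dirtying each touched buffer ...*/

Because every change is made in this one place, a follow-up patch can wrap it
in a critical section and emit a single WAL record covering all of the
modified buffers.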

src/backend/access/hash/hashovfl.c
src/include/access/hash.h

index ff6c4e295c50c1f6d2f28e35817d6efdcc3cf72c..9d89e86aef1865f15fd6dc17bcc7f986838db1b9 100644 (file)
@@ -21,7 +21,6 @@
 #include "utils/rel.h"
 
 
-static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
 static uint32 _hash_firstfreebit(uint32 map);
 
 
@@ -113,13 +112,30 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
        Page            ovflpage;
        HashPageOpaque pageopaque;
        HashPageOpaque ovflopaque;
-
-       /* allocate and lock an empty overflow page */
-       ovflbuf = _hash_getovflpage(rel, metabuf);
+       HashMetaPage metap;
+       Buffer          mapbuf = InvalidBuffer;
+       Buffer          newmapbuf = InvalidBuffer;
+       BlockNumber blkno;
+       uint32          orig_firstfree;
+       uint32          splitnum;
+       uint32     *freep = NULL;
+       uint32          max_ovflpg;
+       uint32          bit;
+       uint32          bitmap_page_bit;
+       uint32          first_page;
+       uint32          last_bit;
+       uint32          last_page;
+       uint32          i,
+                               j;
+       bool            page_found = false;
 
        /*
-        * Write-lock the tail page.  It is okay to hold two buffer locks here
-        * since there cannot be anyone else contending for access to ovflbuf.
+        * Write-lock the tail page.  Here we need to maintain the locking
+        * order: first acquire the lock on the tail page of the bucket, then
+        * on the meta page to find and lock the bitmap page; once it is found,
+        * the lock on the meta page is released, and finally we acquire the
+        * lock on the new overflow buffer.  We need this locking order to
+        * avoid deadlock with backends that are doing inserts.
         */
        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
@@ -153,60 +169,6 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
                buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
        }
 
-       /* now that we have correct backlink, initialize new overflow page */
-       ovflpage = BufferGetPage(ovflbuf);
-       ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
-       ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
-       ovflopaque->hasho_nextblkno = InvalidBlockNumber;
-       ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
-       ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
-       ovflopaque->hasho_page_id = HASHO_PAGE_ID;
-
-       MarkBufferDirty(ovflbuf);
-
-       /* logically chain overflow page to previous page */
-       pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
-       MarkBufferDirty(buf);
-       if (retain_pin)
-       {
-               /* pin will be retained only for the primary bucket page */
-               Assert(pageopaque->hasho_flag & LH_BUCKET_PAGE);
-               LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-       }
-       else
-               _hash_relbuf(rel, buf);
-
-       return ovflbuf;
-}
-
-/*
- *     _hash_getovflpage()
- *
- *     Find an available overflow page and return it.  The returned buffer
- *     is pinned and write-locked, and has had _hash_pageinit() applied,
- *     but it is caller's responsibility to fill the special space.
- *
- * The caller must hold a pin, but no lock, on the metapage buffer.
- * That buffer is left in the same state at exit.
- */
-static Buffer
-_hash_getovflpage(Relation rel, Buffer metabuf)
-{
-       HashMetaPage metap;
-       Buffer          mapbuf = 0;
-       Buffer          newbuf;
-       BlockNumber blkno;
-       uint32          orig_firstfree;
-       uint32          splitnum;
-       uint32     *freep = NULL;
-       uint32          max_ovflpg;
-       uint32          bit;
-       uint32          first_page;
-       uint32          last_bit;
-       uint32          last_page;
-       uint32          i,
-                               j;
-
        /* Get exclusive lock on the meta page */
        LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
 
@@ -255,11 +217,31 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
                for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
                {
                        if (freep[j] != ALL_SET)
+                       {
+                               page_found = true;
+
+                               /* Reacquire exclusive lock on the meta page */
+                               LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+
+                               /* convert bit to bit number within page */
+                               bit += _hash_firstfreebit(freep[j]);
+                               bitmap_page_bit = bit;
+
+                               /* convert bit to absolute bit number */
+                               bit += (i << BMPG_SHIFT(metap));
+                               /* Calculate address of the recycled overflow page */
+                               blkno = bitno_to_blkno(metap, bit);
+
+                               /* Fetch and init the recycled page */
+                               ovflbuf = _hash_getinitbuf(rel, blkno);
+
                                goto found;
+                       }
                }
 
                /* No free space here, try to advance to next map page */
                _hash_relbuf(rel, mapbuf);
+               mapbuf = InvalidBuffer;
                i++;
                j = 0;                                  /* scan from start of next map page */
                bit = 0;
@@ -283,8 +265,15 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
                 * convenient to pre-mark them as "in use" too.
                 */
                bit = metap->hashm_spares[splitnum];
-               _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
-               metap->hashm_spares[splitnum]++;
+
+               /* metapage already has a write lock */
+               if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                        errmsg("out of overflow pages in hash index \"%s\"",
+                                                       RelationGetRelationName(rel))));
+
+               newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
        }
        else
        {
@@ -295,7 +284,8 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
        }
 
        /* Calculate address of the new overflow page */
-       bit = metap->hashm_spares[splitnum];
+       bit = BufferIsValid(newmapbuf) ?
+               metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
        blkno = bitno_to_blkno(metap, bit);
 
        /*
@@ -303,41 +293,48 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
         * relation length stays in sync with ours.  XXX It's annoying to do this
         * with metapage write lock held; would be better to use a lock that
         * doesn't block incoming searches.
+        *
+        * It is okay to hold two buffer locks here (one on tail page of bucket
+        * and other on new overflow page) since there cannot be anyone else
+        * contending for access to ovflbuf.
         */
-       newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
+       ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
 
-       metap->hashm_spares[splitnum]++;
+found:
 
        /*
-        * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
-        * changing it if someone moved it while we were searching bitmap pages.
+        * Do the update.
         */
-       if (metap->hashm_firstfree == orig_firstfree)
-               metap->hashm_firstfree = bit + 1;
-
-       /* Write updated metapage and release lock, but not pin */
-       MarkBufferDirty(metabuf);
-       LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
-       return newbuf;
-
-found:
-       /* convert bit to bit number within page */
-       bit += _hash_firstfreebit(freep[j]);
-
-       /* mark page "in use" in the bitmap */
-       SETBIT(freep, bit);
-       MarkBufferDirty(mapbuf);
-       _hash_relbuf(rel, mapbuf);
+       if (page_found)
+       {
+               Assert(BufferIsValid(mapbuf));
 
-       /* Reacquire exclusive lock on the meta page */
-       LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+               /* mark page "in use" in the bitmap */
+               SETBIT(freep, bitmap_page_bit);
+               MarkBufferDirty(mapbuf);
+       }
+       else
+       {
+               /* update the count to indicate a new overflow page is added */
+               metap->hashm_spares[splitnum]++;
 
-       /* convert bit to absolute bit number */
-       bit += (i << BMPG_SHIFT(metap));
+               if (BufferIsValid(newmapbuf))
+               {
+                       _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
+                       MarkBufferDirty(newmapbuf);
+
+                       /* add the new bitmap page to the metapage's list of bitmaps */
+                       metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
+                       metap->hashm_nmaps++;
+                       metap->hashm_spares[splitnum]++;
+                       MarkBufferDirty(metabuf);
+               }
 
-       /* Calculate address of the recycled overflow page */
-       blkno = bitno_to_blkno(metap, bit);
+               /*
+                * For a new overflow page we don't need to explicitly set the bit
+                * in the bitmap page, as by default it will be set to "in use".
+                */
+       }
 
        /*
         * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
@@ -346,19 +343,39 @@ found:
        if (metap->hashm_firstfree == orig_firstfree)
        {
                metap->hashm_firstfree = bit + 1;
-
-               /* Write updated metapage and release lock, but not pin */
                MarkBufferDirty(metabuf);
-               LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        }
+
+       /* initialize new overflow page */
+       ovflpage = BufferGetPage(ovflbuf);
+       ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+       ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
+       ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+       ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
+       ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
+       ovflopaque->hasho_page_id = HASHO_PAGE_ID;
+
+       MarkBufferDirty(ovflbuf);
+
+       /* logically chain overflow page to previous page */
+       pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
+
+       MarkBufferDirty(buf);
+
+       if (retain_pin)
+               LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        else
-       {
-               /* We didn't change the metapage, so no need to write */
-               LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-       }
+               _hash_relbuf(rel, buf);
+
+       if (BufferIsValid(mapbuf))
+               _hash_relbuf(rel, mapbuf);
+
+       LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+
+       if (BufferIsValid(newmapbuf))
+               _hash_relbuf(rel, newmapbuf);
 
-       /* Fetch, init, and return the recycled page */
-       return _hash_getinitbuf(rel, blkno);
+       return ovflbuf;
 }
 
 /*
@@ -615,6 +632,42 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno,
 }
 
 
+/*
+ *     _hash_initbitmapbuffer()
+ *
+ *      Initialize a new bitmap page.  All bits in the new bitmap page are set to
+ *      "1", indicating "in use".
+ */
+void
+_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
+{
+       Page            pg;
+       HashPageOpaque op;
+       uint32     *freep;
+
+       pg = BufferGetPage(buf);
+
+       /* initialize the page */
+       if (initpage)
+               _hash_pageinit(pg, BufferGetPageSize(buf));
+
+       /* initialize the page's special space */
+       op = (HashPageOpaque) PageGetSpecialPointer(pg);
+       op->hasho_prevblkno = InvalidBlockNumber;
+       op->hasho_nextblkno = InvalidBlockNumber;
+       op->hasho_bucket = -1;
+       op->hasho_flag = LH_BITMAP_PAGE;
+       op->hasho_page_id = HASHO_PAGE_ID;
+
+       /* set all of the bits to 1 */
+       freep = HashPageGetBitmap(pg);
+       MemSet(freep, 0xFF, bmsize);
+
+       /* Set pd_lower just past the end of the bitmap page data. */
+       ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
+}
+
+
 /*
  *     _hash_squeezebucket(rel, bucket)
  *
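
As a reading aid, the lock acquisitions that the new comment describes boil
down to the following order (condensed from the hunks above; the bitmap scan
loop, unlock calls, and error paths are omitted):

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);      /* 1. tail page of the bucket */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);  /* 2. meta page               */
    /* 3. scan the bitmap pages; each bitmap page is locked via _hash_getbuf() */
    /*    while the meta page lock is temporarily given up                     */
    ovflbuf = _hash_getinitbuf(rel, blkno);      /* 4a. recycled overflow page  */
    /* or, when no free bit exists anywhere: */
    ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);   /* 4b. new page      */

Acquiring the buffers in this fixed order, instead of allocating the overflow
page first as the old _hash_getovflpage() arrangement did, is what avoids
deadlocks against concurrently inserting backends.
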
index 5767deb02956e15e5ec6613301cc82f0810a8537..9c0b79f8a630fdd9e405f615ba9c7e5c7716de5d 100644 (file)
@@ -313,6 +313,7 @@ extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovf
                         Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
 extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
                                 BlockNumber blkno, ForkNumber forkNum);
+extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
 extern void _hash_squeezebucket(Relation rel,
                                        Bucket bucket, BlockNumber bucket_blkno,
                                        Buffer bucket_buf,
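
For reference, the new _hash_initbitmapbuffer() declared above is called by
the rewritten _hash_addovflpage() roughly as follows (condensed from the
hashovfl.c hunks; locking is not shown):

    /* allocate the block that will become the new bitmap page */
    newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);

    /* the page returned by _hash_getnewbuf() is already initialized,
       so initpage is passed as false */
    _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
    MarkBufferDirty(newmapbuf);

    /* add the new bitmap page to the metapage's list of bitmap pages */
    metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
    metap->hashm_nmaps++;
    metap->hashm_spares[splitnum]++;
    MarkBufferDirty(metabuf);

Passing initpage = true instead makes _hash_initbitmapbuffer() run
_hash_pageinit() on the buffer itself, which is useful when the caller hands
it a page that has not been initialized yet.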