Write buffer image before and after every WAL record. page_image_logging
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 7 Apr 2014 11:51:45 +0000 (14:51 +0300)
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>
Fri, 4 Sep 2015 12:43:22 +0000 (15:43 +0300)
contrib/page_image_logging/Makefile [new file with mode: 0644]
contrib/page_image_logging/README [new file with mode: 0644]
contrib/page_image_logging/postprocess-images.c [new file with mode: 0644]
src/backend/access/transam/xloginsert.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/lmgr/lwlock.c
src/include/miscadmin.h
src/include/storage/lwlock.h

diff --git a/contrib/page_image_logging/Makefile b/contrib/page_image_logging/Makefile
new file mode 100644 (file)
index 0000000..9c68bbc
--- /dev/null
@@ -0,0 +1,20 @@
+# contrib/page_image_logging/Makefile
+
+PGFILEDESC = "postprocess-images - "
+
+PROGRAM = postprocess-images
+OBJS   = postprocess-images.o
+
+PG_CPPFLAGS = -I$(libpq_srcdir)
+PG_LIBS = $(libpq_pgport)
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/postprocess-images
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/page_image_logging/README b/contrib/page_image_logging/README
new file mode 100644 (file)
index 0000000..e2f8225
--- /dev/null
@@ -0,0 +1,50 @@
+Usage
+-----
+
+1. Apply the patch
+
+2. Set up a master and standby.
+
+3. stop master, then standby.
+
+4. Remove $PGDATA/buffer-images from both servers.
+
+5. Start master and standby
+
+6. Run "make installcheck", or whatever you want to test
+
+7. Stop master, then standby
+
+8. compare the logged page images using the postprocessing tool:
+
+./postprocess-images ~/data-master/buffer-images ~/data-standby/buffer-images  > differences
+
+9. The 'differences' file should be empty. If not, investigate.
+
+Tips
+----
+
+The page images take up a lot of disk space! The PostgreSQL regression
+suite generates about 11GB - double that when the same is generated also
+in a standby.
+
+Always stop the master first, then standby. Otherwise, when you restart
+the standby, it will start WAL replay from the previous checkpoint, and
+log some page images already. Stopping the master creates a checkpoint
+record, avoiding the problem.
+
+If you get errors like this from postprocess-images:
+
+    could not reorder line XXX
+
+It can be caused by an all-zeros page being logged with XLOG HEAP_NEWPAGE
+records. Look at the line in the buffer-image file, see if it's all-zeros.
+This can happen e.g when you change the tablespace of a table. See
+log_newpage() in xlog.c.
+
+You can use pg_xlogdump to see which WAL record a page image corresponds
+to. But beware that the LSN in the page image points to the *end* of the
+WAL record, while the LSN that pg_xlogdump prints is the *beginning* of
+the WAL record. So to find which WAL record a page image corresponds to,
+find the LSN from the page image in pg_xlogdump output, and back off one
+record. (you can't just grep for the line containing the LSN).
diff --git a/contrib/page_image_logging/postprocess-images.c b/contrib/page_image_logging/postprocess-images.c
new file mode 100644 (file)
index 0000000..f440cfd
--- /dev/null
@@ -0,0 +1,624 @@
+#include "postgres_fe.h"
+
+typedef uintptr_t Datum;
+typedef struct MemoryContextData *MemoryContext;
+
+#include "access/gin_private.h"
+#include "access/htup_details.h"
+#include "access/nbtree.h"
+#include "storage/bufpage.h"
+
+#define LINESZ (BLCKSZ*2 + 100)
+
+/* ----------------------------------------------------------------
+ * Masking functions.
+ *
+ * Most pages cannot be compared directly, because some parts of the
+ * page are not expected to be byte-by-byte identical. For example,
+ * hint bits. Our strategy is to normalize all pages by creating a
+ * mask of those bits that are not expected to match.
+ */
+
+/*
+ * Build a mask that covers unused space between pd_lower and pd_upper.
+ */
+static void
+mask_unused_space(char *page, uint8 *mask)
+{
+       int                     pd_lower = ((PageHeader) page)->pd_lower;
+       int                     pd_upper = ((PageHeader) page)->pd_upper;
+       int                     pd_special = ((PageHeader) page)->pd_upper;
+
+       if (pd_lower > pd_upper || pd_special < pd_upper ||
+               pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ)
+       {
+               fprintf(stderr, "invalid page at %X/%08X\n",
+                               ((PageHeader) page)->pd_lsn.xlogid,
+                               ((PageHeader) page)->pd_lsn.xrecoff);
+       }
+
+       memset(mask + pd_lower, 0xFF, pd_upper - pd_lower);
+}
+
+static void
+build_heap_mask(char *page, uint8 *mask)
+{
+       OffsetNumber off;
+       PageHeader mask_phdr = (PageHeader) mask;
+
+       mask_unused_space(page, mask);
+
+       /* Ignore prune_xid (it's like a hint-bit) */
+       mask_phdr->pd_prune_xid = 0xFFFFFFFF;
+
+       /* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints */
+       mask_phdr->pd_flags |= PD_PAGE_FULL | PD_HAS_FREE_LINES;
+
+       /*
+        * Also mask the all-visible flag.
+        *
+        * XXX: It is unfortunate that we have to do this. If the flag is set
+        * incorrectly, that's serious, and we would like to catch it. If the flag
+        * is cleared incorrectly, that's serious too. But redo of HEAP_CLEAN
+        * records don't currently set the flag, even though it is set in the
+        * master, so we must silence failures that that causes.
+        */
+       mask_phdr->pd_flags |= PD_ALL_VISIBLE;
+
+       for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
+       {
+               ItemId          iid = PageGetItemId(page, off);
+               Item            item = PageGetItem(page, iid);
+               char       *mask_item;
+
+               mask_item = (char *) (mask + ItemIdGetOffset(iid));
+
+               if (ItemIdIsNormal(iid))
+               {
+                       HeapTupleHeader mask_htup = (HeapTupleHeader) mask_item;
+
+                       /*
+                        * Ignore hint bits and command ID.
+                        */
+                       mask_htup->t_infomask =
+                               HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID |
+                               HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID;
+                       mask_htup->t_infomask |= HEAP_COMBOCID;
+                       mask_htup->t_choice.t_heap.t_field3.t_cid = 0xFFFFFFFF;
+
+                       /*
+                        * Ignore speculative insertion token on speculatively
+                        * inserted tuples.
+                        */
+
+                       if (HeapTupleHeaderIsSpeculative((HeapTupleHeader) item))
+                       {
+                               mask_htup->t_ctid.ip_blkid.bi_hi = 0xFFFF;
+                               mask_htup->t_ctid.ip_blkid.bi_lo = 0xFFFF;
+                       }
+               }
+
+               /*
+                * Ignore any padding bytes after the tuple, when the length of
+                * the item is not MAXALIGNed.
+                */
+               if (ItemIdHasStorage(iid))
+               {
+                       int             len = ItemIdGetLength(iid);
+                       int             padlen = MAXALIGN(len) - len;
+
+                       if (padlen > 0)
+                               memset(mask_item + len, 0xFF, padlen);
+               }
+       }
+}
+
+static void
+build_spgist_mask(char *page, uint8 *mask)
+{
+       mask_unused_space(page, mask);
+}
+
+static void
+build_gist_mask(char *page, uint8 *mask)
+{
+       mask_unused_space(page, mask);
+}
+
+static void
+build_gin_mask(BlockNumber blkno, char *page, uint8 *mask)
+{
+       /* GIN metapage doesn't use pd_lower/pd_upper. Other page types do. */
+       if (blkno != 0)
+       {
+               /*
+                * Mask everything on a DELETED page.
+                */
+               if (GinPageIsDeleted(page))
+               {
+                       /* page content, between standard page header and opaque struct */
+                       memset(mask + SizeOfPageHeaderData, 0xFF, BLCKSZ - MAXALIGN(sizeof(GinPageOpaqueData)));
+
+                       /* pd_lower and upper */
+                       memset(&((PageHeader) mask)->pd_lower, 0xFF, sizeof(uint16));
+                       memset(&((PageHeader) mask)->pd_upper, 0xFF, sizeof(uint16));
+               }
+               else
+                       mask_unused_space(page, mask);
+       }
+}
+
+static void
+build_brin_mask(char *page, uint8 *mask)
+{
+       OffsetNumber off;
+       PageHeader mask_phdr = (PageHeader) mask;
+
+       mask_unused_space(page, mask);
+
+       /* Ignore PD_HAS_FREE_LINES flag, it is just a hint */
+       mask_phdr->pd_flags |= PD_HAS_FREE_LINES;
+}
+
+static void
+build_sequence_mask(char *page, uint8 *mask)
+{
+       /*
+        * FIXME: currently, we just ignore sequence records altogether. nextval
+        * records a different value in the WAL record than it writes to the
+        * buffer. Ideally we would only mask out the value in the tuple.
+        */
+       memset(mask, 0xFF, BLCKSZ);
+}
+
+static void
+build_btree_mask(char *page, uint8 *mask)
+{
+       OffsetNumber off;
+       OffsetNumber maxoff;
+
+       mask_unused_space(page, mask);
+
+       /*
+        * Mask everything on a DELETED page.
+        */
+       if (((BTPageOpaque) PageGetSpecialPointer(page))->btpo_flags & BTP_DELETED)
+       {
+               /* page content, between standard page header and opaque struct */
+               memset(mask + SizeOfPageHeaderData, 0xFF, BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData)));
+
+               /* pd_lower and upper */
+               memset(&((PageHeader) mask)->pd_lower, 0xFF, sizeof(uint16));
+               memset(&((PageHeader) mask)->pd_upper, 0xFF, sizeof(uint16));
+       }
+       else
+       {
+               /* Mask DEAD line pointer bits */
+               maxoff = PageGetMaxOffsetNumber(page);
+               for (off = 1; off <= maxoff; off++)
+               {
+                       ItemId iid = PageGetItemId(page, off);
+                       ItemIdData m;
+
+                       if (ItemIdIsDead(iid))
+                       {
+                               memset(&m, 0, sizeof(ItemIdData));
+                               m.lp_flags = 2;
+
+                               memcpy((char *) mask + (((char *) iid) - page), &m, sizeof(ItemIdData));
+                       }
+               }
+       }
+
+       /* Mask BTP_HAS_GARBAGE flag and cycle-ID in the opaque struct */
+       {
+               BTPageOpaque maskopaq = (BTPageOpaque) (((char *) mask) + ((PageHeader) page)->pd_special);
+
+               maskopaq->btpo_flags |= BTP_HAS_GARBAGE | BTP_SPLIT_END;
+
+               maskopaq->btpo_cycleid = 0xFFFF;
+       }
+}
+
+static inline unsigned char
+parsehex(char digit, bool *success)
+{
+       if (digit >= '0' && digit <= '9')
+       {
+               *success = true;
+               return (unsigned char) (digit - '0');
+       }
+       if (digit >= 'a' && digit <= 'f')
+       {
+               *success = true;
+               return (unsigned char) (digit - 'a' + 10);
+       }
+       if (digit >= 'A' && digit <= 'F')
+       {
+               *success = true;
+               return (unsigned char) (digit - 'A' + 10);
+       }
+       *success = false;
+       return 0;
+}
+
+static inline void
+tohex(uint8 byte, char *out)
+{
+       const char *digits = "0123456789ABCDEF";
+
+       out[0] = digits[byte >> 4];
+       out[1] = digits[byte & 0x0F];
+}
+
+/*
+ * Mask any known changing parts, like hint bits, from the line. The line
+ * is modified in place. Full nibbles to be ignored are set to 'X' in the
+ * hex output, and individiual bits are set to 0.
+ */
+static void
+maskline(char *line)
+{
+       char            page[BLCKSZ];
+       uint8           mask[BLCKSZ];
+       int                     i;
+       uint16          tail;
+       char       *pstart;
+       char       *p;
+       BlockNumber blkno;
+
+       /* Parse the line */
+       p = strstr(line, " blk: ");
+       if (p == NULL)
+               return;
+
+       sscanf(p, " blk: %u", &blkno);
+
+       pstart = strstr(line, "after: ");
+       if (pstart == NULL)
+               return;
+       pstart += strlen("after: ");
+
+       /* Decode the hex-encoded page back to raw bytes */
+       p = pstart;
+       for (i = 0; i < BLCKSZ; i++)
+       {
+               bool            success;
+               unsigned char c;
+
+               c = parsehex(*(p++), &success) << 4;
+               if (!success)
+                       return;
+               c |= parsehex(*(p++), &success);
+               if (!success)
+                       return;
+
+               page[i] = (char) c;
+       }
+
+       /*
+        * Ok, we now have the original block contents in 'page'. Look at the
+        * size of the special area, and the last two bytes in it, to detect
+        * what kind of a page it is. Call the appropriate masking function.
+        */
+
+       /* begin with an empty mask */
+       memset(mask, 0, BLCKSZ);
+
+       memcpy(&tail, &page[BLCKSZ - 2], 2);
+
+       /* Try to detect what kind of a page it is */
+       if (PageGetSpecialSize(page) == 0)
+       {
+               build_heap_mask(page, mask);
+       }
+       else if (PageGetSpecialSize(page) == 16)
+       {
+               if (tail == 0xFF81)
+                       build_gist_mask(page, mask);
+               else if (tail <= 0xFF7F)
+                       build_btree_mask(page, mask);
+       }
+       else if (PageGetSpecialSize(page) == 8)
+       {
+               if (tail == 0xFF82)
+                       build_spgist_mask(page, mask);
+               else if (tail == 0xF093)
+                       build_brin_mask(page, mask);
+               else if (*((uint32 *) (page + BLCKSZ - MAXALIGN(sizeof(uint32)))) == 0x1717)
+                       build_sequence_mask(page, mask);
+               else
+                       build_gin_mask(blkno, page, mask);
+       }
+
+       /* Apply the mask, replacing masked nibbles with # */
+       for (i = 0; i < BLCKSZ; i++)
+       {
+               uint8 c;
+
+               if (mask[i] == 0)
+                       continue;
+
+               c = ((uint8) page[i]) & ~mask[i];
+
+               tohex(c, &pstart[2 * i]);
+
+               if ((mask[i] & 0xF0) == 0xF0)
+                       pstart[2 * i] = '#';
+               if ((mask[i] & 0x0F) == 0x0F)
+                       pstart[2 * i + 1] = '#';
+       }
+}
+
+
+
+/* ----------------------------------------------------------------
+ * Line reordering
+ *
+ * When the page images are logged in master and standby, they are
+ * not necessarily written out in the same order. For example, if a
+ * single WAL-logged operation modifies multiple pages, like an index
+ * page split, the standby might release the locks in different order
+ * than the master. Another cause is concurrent operations; writing
+ * the page images is not atomic with WAL insertion, so if two
+ * backends are running concurrently, their modifications in the
+ * image log can be interleaved in different order than their WAL
+ * records.
+ *
+ * To fix that, the lines are read into a reorder buffer, and sorted
+ * there. Sorting the whole file would be overkill, as the lines are
+ * mostly in order already. The fixed-size reorder buffer works as
+ * long as the lines are not out-of-order by more than
+ * REORDER_BUFFER_SIZE lines.
+ */
+
+#define REORDER_BUFFER_SIZE 1000
+
+typedef struct
+{
+       char        *lines[REORDER_BUFFER_SIZE];
+       int             nlines;         /* number of lines currently in buffer */
+
+       FILE       *fp;
+       int                     lineno;         /* current input line number (for debugging) */
+       bool            eof;            /* have we reached EOF from this source? */
+} reorder_buffer;
+
+/*
+ * Read lines from the file into the reorder buffer, until the buffer is full.
+ */
+static void
+fill_reorder_buffer(reorder_buffer *buf)
+{
+       if (buf->eof)
+               return;
+
+       while (buf->nlines < REORDER_BUFFER_SIZE)
+       {
+               char *linebuf = pg_malloc(LINESZ);
+
+               if (fgets(linebuf, LINESZ, buf->fp) == NULL)
+               {
+                       buf->eof = true;
+                       pg_free(linebuf);
+                       break;
+               }
+               buf->lineno++;
+
+               /* common case: the new line goes to the end */
+               if (buf->nlines == 0 ||
+                       strcmp(linebuf, buf->lines[buf->nlines - 1]) >= 0)
+               {
+                       buf->lines[buf->nlines] = linebuf;
+               }
+               else
+               {
+                       /* find the right place in the queue */
+                       int                     i;
+
+                       for (i = buf->nlines - 2; i >= 0; i--)
+                       {
+                               if (strcmp(linebuf, buf->lines[i]) >= 0)
+                                       break;
+                       }
+                       if (i < 0)
+                       {
+                               fprintf(stderr, "could not reorder line %d\n", buf->lineno);
+                               pg_free(linebuf);
+                               continue;
+                       }
+                       i++;
+                       memmove(&buf->lines[i + 1], &buf->lines[i],
+                                       (buf->nlines - i) * sizeof(char *));
+                       buf->lines[i] = linebuf;
+               }
+               buf->nlines++;
+       }
+}
+
+static reorder_buffer *
+init_reorder_buffer(FILE *fp)
+{
+       reorder_buffer *buf;
+
+       buf = pg_malloc(sizeof(reorder_buffer));
+       buf->fp = fp;
+       buf->eof = false;
+       buf->lineno = 0;
+       buf->nlines = 0;
+
+       fill_reorder_buffer(buf);
+
+       return buf;
+}
+
+/*
+ * Read all the lines that belong to the next WAL record from the reorder
+ * buffer.
+ */
+static int
+readrecord(reorder_buffer *buf, char **lines, uint64 *lsn)
+{
+       int                     nlines;
+       uint32          line_xlogid;
+       uint32          line_xrecoff;
+       uint64          line_lsn;
+       uint64          rec_lsn;
+
+       /* Get all the lines with the same LSN */
+       for (nlines = 0; nlines < buf->nlines; nlines++)
+       {
+               sscanf(buf->lines[nlines], "LSN: %X/%08X", &line_xlogid, &line_xrecoff);
+               line_lsn = ((uint64) line_xlogid) << 32 | (uint64) line_xrecoff;
+
+               if (nlines == 0)
+                       *lsn = rec_lsn = line_lsn;
+               else
+               {
+                       if (line_lsn != rec_lsn)
+                               break;
+               }
+       }
+       if (nlines == buf->nlines)
+       {
+               if (!buf->eof)
+               {
+                       fprintf(stderr, "max number of lines in record reached, LSN: %X/%08X\n",
+                                       line_xlogid, line_xrecoff);
+                       exit(1);
+               }
+       }
+
+       /* consume the lines from the reorder buffer */
+       memcpy(lines, buf->lines, sizeof(char *) * nlines);
+       memmove(&buf->lines[0], &buf->lines[nlines],
+                       sizeof(char *) * (buf->nlines - nlines));
+       buf->nlines -= nlines;
+
+       fill_reorder_buffer(buf);
+
+       return nlines;
+}
+
+static void
+freerecord(char **lines, int nlines)
+{
+       int                     i;
+
+       for (i = 0; i < nlines; i++)
+               pg_free(lines[i]);
+}
+
+static void
+printrecord(char **lines, int nlines)
+{
+       int                     i;
+
+       for (i = 0; i < nlines; i++)
+               printf("%s", lines[i]);
+}
+
+static bool
+diffrecords(char **lines_a, int nlines_a, char **lines_b, int nlines_b)
+{
+       int i;
+
+       if (nlines_a != nlines_b)
+               return true;
+       for (i = 0; i < nlines_a; i++)
+       {
+               /* First try a straight byte-per-byte comparison. */
+               if (strcmp(lines_a[i], lines_b[i]) != 0)
+               {
+                       /* They were not byte-per-byte identical. Try again after masking */
+                       maskline(lines_a[i]);
+                       maskline(lines_b[i]);
+                       if (strcmp(lines_a[i], lines_b[i]) != 0)
+                               return true;
+               }
+       }
+
+       return false;
+}
+
+static void
+usage(void)
+{
+       printf("usage: postprocess-images <master's image file> <standby's image file>\n");
+}
+
+int
+main(int argc, char **argv)
+{
+       char       *lines_a[REORDER_BUFFER_SIZE];
+       int                     nlines_a;
+       char       *lines_b[REORDER_BUFFER_SIZE];
+       int                     nlines_b;
+       FILE       *fp_a;
+       FILE       *fp_b;
+       uint64          lsn_a;
+       uint64          lsn_b;
+       reorder_buffer *buf_a;
+       reorder_buffer *buf_b;
+
+       if (argc != 3)
+       {
+               usage();
+               exit(1);
+       }
+
+       fp_a = fopen(argv[1], "rb");
+       if (fp_a == NULL)
+       {
+               fprintf(stderr, "could not open file \"%s\"\n", argv[1]);
+               exit(1);
+       }
+       fp_b = fopen(argv[2], "rb");
+       if (fp_b == NULL)
+       {
+               fprintf(stderr, "could not open file \"%s\"\n", argv[2]);
+               exit(1);
+       }
+
+       buf_a = init_reorder_buffer(fp_a);
+       buf_b = init_reorder_buffer(fp_b);
+
+       /* read first record from both */
+       nlines_a = readrecord(buf_a, lines_a, &lsn_a);
+       nlines_b = readrecord(buf_b, lines_b, &lsn_b);
+
+       while (nlines_a > 0 || nlines_b > 0)
+       {
+               /* compare the records */
+               if (lsn_a < lsn_b || nlines_b == 0)
+               {
+                       printf("Only in A:\n");
+                       printrecord(lines_a, nlines_a);
+                       freerecord(lines_a, nlines_a);
+                       nlines_a = readrecord(buf_a, lines_a, &lsn_a);
+               }
+               else if (lsn_a > lsn_b || nlines_a == 0)
+               {
+                       printf("Only in B:\n");
+                       printrecord(lines_b, nlines_b);
+                       freerecord(lines_b, nlines_b);
+                       nlines_b = readrecord(buf_b, lines_b, &lsn_b);
+               }
+               else if (lsn_a == lsn_b)
+               {
+                       if (diffrecords(lines_a, nlines_a, lines_b, nlines_b))
+                       {
+                               printf("lines differ, A:\n");
+                               printrecord(lines_a, nlines_a);
+                               printf("B:\n");
+                               printrecord(lines_b, nlines_b);
+                       }
+                       freerecord(lines_a, nlines_a);
+                       freerecord(lines_b, nlines_b);
+                       nlines_a = readrecord(buf_a, lines_a, &lsn_a);
+                       nlines_b = readrecord(buf_b, lines_b, &lsn_b);
+               }
+       }
+
+       return 0;
+}
index abd8420da69fd0bde72318fa99b9d9d3cce7bcfa..fa82f30e5d787747cb1d2a9e12c24e3f61918361 100644 (file)
@@ -957,6 +957,12 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
                PageSetLSN(page, recptr);
        }
 
+       /*
+        * the normal mechanism that hooks into LockBuffer doesn't work for this,
+        * because we're bypassing buffer manager.
+        */
+       log_page_change(page, rnode, forkNum, blkno);
+
        return recptr;
 }
 
index cd3aaad610646ef172a326129f283fd02e5a0d33..b06973f9e1046bdd3878551a0ec809af129fa9ed 100644 (file)
@@ -414,6 +414,154 @@ static void AtProcExit_Buffers(int code, Datum arg);
 static void CheckForBufferLeaks(void);
 static int     rnode_comparator(const void *p1, const void *p2);
 
+/* Support for capturing changes to pages */
+typedef struct
+{
+       Buffer          buffer;
+       char            content[BLCKSZ];
+} BufferImage;
+
+#define MAX_BEFORE_IMAGES              100
+static BufferImage *before_images[MAX_BEFORE_IMAGES];
+int               before_images_cnt = 0;
+
+static FILE *imagefp;
+
+static bool
+log_before_after_images(char *msg, BufferImage *img)
+{
+       Page            newcontent = BufferGetPage(img->buffer);
+       Page            oldcontent = (Page) img->content;
+       XLogRecPtr      oldlsn;
+       XLogRecPtr      newlsn;
+       RelFileNode rnode;
+       ForkNumber      forknum;
+       BlockNumber     blkno;
+
+       oldlsn = PageGetLSN(oldcontent);
+       newlsn = PageGetLSN(newcontent);
+
+       if (oldlsn == newlsn)
+       {
+               /* no change */
+               return false;
+       }
+
+       BufferGetTag(img->buffer, &rnode, &forknum, &blkno);
+
+       log_page_change(newcontent, &rnode, forknum, blkno);
+
+       return true;
+}
+
+void
+log_page_change(char *newcontent, RelFileNode *rnode, int forknum, uint32 blkno)
+{
+       XLogRecPtr newlsn = PageGetLSN((Page) newcontent);
+       int                     i;
+
+       /*
+        * We need a lock to make sure that only one backend writes to the file
+        * at a time. Abuse SyncScanLock for that - it happens to never be used
+        * while a buffer is locked/unlocked.
+        */
+       LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
+
+       fprintf(imagefp, "LSN: %X/%08X, rel: %u/%u/%u, blk: %u; ",
+                       (uint32) (newlsn >> 32), (uint32) newlsn,
+                       rnode->spcNode, rnode->dbNode, rnode->relNode,
+                       blkno);
+       if (forknum != MAIN_FORKNUM)
+               fprintf(imagefp, "forknum: %u; ", forknum);
+
+       /* write the page contents, in hex */
+       {
+               char            buf[BLCKSZ * 2];
+               int                     j = 0;
+
+               fprintf(imagefp, "after: ");
+               for (i = 0; i < BLCKSZ; i++)
+               {
+                       const char *digits = "0123456789ABCDEF";
+                       uint8           byte = (uint8) newcontent[i];
+
+                       buf[j++] = digits[byte >> 4];
+                       buf[j++] = digits[byte & 0x0F];
+               }
+               fwrite(buf, BLCKSZ * 2, 1, imagefp);
+       }
+
+       fprintf(imagefp, "\n");
+       fflush(imagefp);
+
+       LWLockRelease(SyncScanLock);
+}
+
+static void
+remember_before_image(Buffer buffer)
+{
+       BufferImage *img;
+
+       Assert(before_images_cnt < MAX_BEFORE_IMAGES);
+
+       img = before_images[before_images_cnt];
+       img->buffer = buffer;
+       memcpy (img->content, BufferGetPage(buffer), BLCKSZ);
+       before_images_cnt++;
+}
+
+/*
+ * Forget a buffer image. If the page was modified, log the new contents.
+ */
+static void
+forget_image(Buffer buffer)
+{
+       int                     i;
+
+       for (i = 0; i < before_images_cnt; i++)
+       {
+               BufferImage *img = before_images[i];
+
+               if (img->buffer == buffer)
+               {
+
+                       log_before_after_images("forget", before_images[i]);
+                       if (i != before_images_cnt)
+                       {
+                               /* swap the last still-used slot with this one */
+                               before_images[i] = before_images[before_images_cnt - 1];
+                               before_images[before_images_cnt - 1] = img;
+                       }
+                       before_images_cnt--;
+
+                       return;
+               }
+       }
+       elog(LOG, "could not find image for buffer %u", buffer);
+}
+
+/*
+ * See if any of the buffers we've memorized have changed.
+ */
+void
+log_page_changes(char *msg)
+{
+       int i;
+
+       for (i = 0; i < before_images_cnt; i++)
+       {
+               BufferImage *img = before_images[i];
+
+               /*
+                * Print the contents of the page if it was changed. Remember the
+                * new contents as the current image.
+                */
+               if (log_before_after_images(msg, img))
+               {
+                       memcpy(img->content, BufferGetPage(img->buffer), BLCKSZ);
+               }
+       }
+}
 
 /*
  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -677,7 +825,10 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                        if (!isLocalBuf)
                        {
                                if (mode == RBM_ZERO_AND_LOCK)
+                               {
                                        LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+                                       remember_before_image(BufferDescriptorGetBuffer(bufHdr));
+                               }
                                else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
                                        LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
                        }
@@ -819,6 +970,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                !isLocalBuf)
        {
                LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+               remember_before_image(BufferDescriptorGetBuffer(bufHdr));
        }
 
        if (isLocalBuf)
@@ -2077,6 +2229,10 @@ AtEOXact_Buffers(bool isCommit)
 {
        CheckForBufferLeaks();
 
+       if (before_images_cnt > 0)
+               elog(LOG, "released all page-images (AtEOXact_Buffers)");
+       before_images_cnt = 0;
+
        AtEOXact_LocalBuffers(isCommit);
 
        Assert(PrivateRefCountOverflowed == 0);
@@ -2121,7 +2277,18 @@ InitBufferPoolAccess(void)
 void
 InitBufferPoolBackend(void)
 {
+       int                     i;
+       BufferImage *images;
+
        on_shmem_exit(AtProcExit_Buffers, 0);
+
+       /* Initialize page image capturing */
+       images = palloc(MAX_BEFORE_IMAGES * sizeof(BufferImage));
+
+       for (i = 0; i < MAX_BEFORE_IMAGES; i++)
+               before_images[i] = &images[i];
+
+       imagefp = fopen("buffer-images", "ab");
 }
 
 /*
@@ -3107,7 +3274,30 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
                         * sometime later in this checkpoint cycle.
                         */
                        if (!XLogRecPtrIsInvalid(lsn))
+                       {
                                PageSetLSN(page, lsn);
+
+                               /*
+                                * the normal mechanism that hooks into LockBuffer doesn't work
+                                * for this, if we're only holding a SHARE lock on the buffer.
+                                */
+                               if (!LWLockHeldByMeInExclusiveMode(bufHdr->content_lock))
+                               {
+                                       RelFileNode rnode;
+                                       ForkNumber forkno;
+                                       BlockNumber blkno;
+
+                                       /*
+                                        * XXX: we pass the original page to log_page_change,
+                                        * ignoring the possibility that the buffer is modified
+                                        * concurrently by another hint bit update. That's OK,
+                                        * the comparison tool ignores differences in hint bits
+                                        * anyway.
+                                        */
+                                       BufferGetTag(buffer, &rnode, &forkno, &blkno);
+                                       log_page_change(BufferGetPage(buffer), &rnode, forkno, blkno);
+                               }
+                       }
                }
                bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
                UnlockBufHdr(bufHdr);
@@ -3171,6 +3361,16 @@ LockBuffer(Buffer buffer, int mode)
 
        buf = GetBufferDescriptor(buffer - 1);
 
+       /*
+        * If we were holding it in exclusive-mode, log any changes we made to
+        * it while holding the log in the page-image log.
+        */
+       if (mode == BUFFER_LOCK_UNLOCK &&
+               LWLockHeldByMeInExclusiveMode(buf->content_lock))
+       {
+               forget_image(buffer);
+       }
+
        if (mode == BUFFER_LOCK_UNLOCK)
                LWLockRelease(buf->content_lock);
        else if (mode == BUFFER_LOCK_SHARE)
@@ -3179,6 +3379,9 @@ LockBuffer(Buffer buffer, int mode)
                LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
        else
                elog(ERROR, "unrecognized buffer lock mode: %d", mode);
+
+       if (mode == BUFFER_LOCK_EXCLUSIVE)
+               remember_before_image(buffer);
 }
 
 /*
@@ -3190,6 +3393,7 @@ bool
 ConditionalLockBuffer(Buffer buffer)
 {
        volatile BufferDesc *buf;
+       bool    res;
 
        Assert(BufferIsValid(buffer));
        if (BufferIsLocal(buffer))
@@ -3197,7 +3401,11 @@ ConditionalLockBuffer(Buffer buffer)
 
        buf = GetBufferDescriptor(buffer - 1);
 
-       return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
+       res = LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
+       if (res)
+               remember_before_image(buffer);
+
+       return res;
 }
 
 /*
index 687ed6399cb19ed3183bb75a0214b57229e05675..a98b35d5a15735e2a3ff1716cee2aabbded35947 100644 (file)
@@ -1629,6 +1629,10 @@ LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val)
 void
 LWLockReleaseAll(void)
 {
+       if (before_images_cnt > 0)
+               elog(LOG, "released all page images");
+       before_images_cnt = 0;
+
        while (num_held_lwlocks > 0)
        {
                HOLD_INTERRUPTS();              /* match the upcoming RESUME_INTERRUPTS */
@@ -1656,3 +1660,21 @@ LWLockHeldByMe(LWLock *l)
        }
        return false;
 }
+
+bool
+LWLockHeldByMeInExclusiveMode(LWLock *l)
+{
+       int                     i;
+
+       for (i = 0; i < num_held_lwlocks; i++)
+       {
+               if (held_lwlocks[i].lock == l)
+               {
+                       if (held_lwlocks[i].mode == LW_EXCLUSIVE)
+                               return true;
+                       else
+                               return false;
+               }
+       }
+       return false;
+}
index e0cc69f27ef1e39f182abb0796490262940c02eb..50692ea51a846c3fa7f0f265bdcd9d2d30e965f5 100644 (file)
@@ -91,6 +91,14 @@ extern PGDLLIMPORT volatile uint32 CritSectionCount;
 /* in tcop/postgres.c */
 extern void ProcessInterrupts(void);
 
+/* in bufmgr.c, related to capturing page images */
+extern void log_page_changes(char *msg);
+struct RelFileNode;
+extern void log_page_change(char *newcontent, struct RelFileNode *rnode, int forknum, uint32 blkno);
+
+extern int before_images_cnt;
+
+
 #ifndef WIN32
 
 #define CHECK_FOR_INTERRUPTS() \
@@ -132,6 +140,8 @@ do { \
 do { \
        Assert(CritSectionCount > 0); \
        CritSectionCount--; \
+       if (CritSectionCount == 0) \
+               log_page_changes("END_CRIT_SECTION");           \
 } while(0)
 
 
index f2ff6a06f0187feb89f87e7d5b42d44645b7bdae..6e9ad48dfa8326209f3f5ecd9d300bb910901a01 100644 (file)
@@ -189,6 +189,7 @@ extern void LWLockRelease(LWLock *lock);
 extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
 extern void LWLockReleaseAll(void);
 extern bool LWLockHeldByMe(LWLock *lock);
+extern bool LWLockHeldByMeInExclusiveMode(LWLock *lock);
 
 extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
 extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value);