Allow time delayed standbys and recovery
author: Simon Riggs <simon@2ndQuadrant.com>
Thu, 12 Dec 2013 10:53:20 +0000 (10:53 +0000)
committer: Simon Riggs <simon@2ndQuadrant.com>
Thu, 12 Dec 2013 10:53:20 +0000 (10:53 +0000)
Set min_recovery_apply_delay to force a delay in recovery apply for commit and
restore point WAL records. Other records are replayed immediately. Delay is
measured between WAL record time and local standby time.

Robert Haas, Fabrízio de Royes Mello and Simon Riggs
Detailed review by Mitsumasa Kondo

doc/src/sgml/recovery-config.sgml
src/backend/access/transam/recovery.conf.sample
src/backend/access/transam/xlog.c

index 9d80256a5568665925b194e4ee737dd4c1ffda8d..ee5dc8687e2ee06b9aa40030a5aac13c2d5cff44 100644 (file)
@@ -142,6 +142,56 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"'  # Windows
       </listitem>
      </varlistentry>
 
+     <varlistentry id="min-recovery-apply-delay" xreflabel="min_recovery_apply_delay">
+      <term><varname>min_recovery_apply_delay</varname> (<type>integer</type>)</term>
+      <indexterm>
+        <primary><varname>min_recovery_apply_delay</> recovery parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        By default, a standby server keeps restoring WAL records from the
+        primary as soon as possible. It may be useful to have a time-delayed
+        copy of the data, offering various options to correct data loss errors.
+        This parameter allows you to delay recovery by a fixed period of time,
+        specified in milliseconds if no unit is specified.  For example, if
+        you set this parameter to <literal>5min</literal>, the standby will
+        replay each transaction commit only when the system time on the standby
+        is at least five minutes past the commit time reported by the master.
+       </para>
+       <para>
+        It is possible that the replication delay between servers exceeds the
+        value of this parameter, in which case no delay is added.
+        Note that the delay is calculated between the WAL timestamp as written
+        on master and the time on the current standby. Delays
+        in transfer because of networks or cascading replication configurations
+        may reduce the actual wait time significantly. If the system
+        clocks on master and standby are not synchronised, this may lead to
+        recovery applying records earlier than expected but is not a major issue
+        because the useful settings of the parameter are much larger than
+        typical time deviation between the servers. Be careful to allow for
+        different timezone settings on master and standby.
+       </para>
+       <para>
+        The delay occurs only on WAL records for COMMIT and Restore Points.
+        Other records may be replayed earlier than the specified delay, which
+        is not an issue for MVCC though may potentially increase the number
+        of recovery conflicts generated.
+       </para>
+       <para>
+        The delay occurs until the standby is promoted or triggered. After that
+        the standby will end recovery without further waiting.
+       </para>
+       <para>
+        This parameter is intended for use with streaming replication deployments;
+        however, if the parameter is specified it will be honoured in all cases.
+        Synchronous replication is not affected by this setting because there is
+        not yet any setting to request synchronous apply of transaction commits.
+        <varname>hot_standby_feedback</> will be delayed by use of this feature
+        which could lead to bloat on the master; use both together with care.
+       </para>
+      </listitem>
+     </varlistentry>
+
     </variablelist>
 
   </sect1>
index 5acfa572f0eec8d25c85d9627f69306bb938cb7a..673605cfc661d1a1bc69e78f7a79f4c0fbb7ac9e 100644 (file)
 #
 #trigger_file = ''
 #
+# By default, a standby server keeps restoring XLOG records from the
+# primary as soon as possible. If you want to explicitly delay the replay of
+# committed transactions from the master, specify a recovery apply delay.
+# For example, if you set this parameter to 5min, the standby will replay
+# each transaction commit only when the system time on the standby is at least
+# five minutes past the commit time reported by the master.
+#
+#min_recovery_apply_delay = 0
+#
 #---------------------------------------------------------------------------
 # HOT STANDBY PARAMETERS
 #---------------------------------------------------------------------------
index 6fa5479c92b84b75819fc702f6a4d35be3ad638a..a76aef37f3d2f6066153fa6080d0183efe224390 100644 (file)
@@ -218,6 +218,8 @@ static bool recoveryPauseAtTarget = true;
 static TransactionId recoveryTargetXid;
 static TimestampTz recoveryTargetTime;
 static char *recoveryTargetName;
+static int min_recovery_apply_delay = 0;
+static TimestampTz recoveryDelayUntilTime;
 
 /* options taken from recovery.conf for XLOG streaming */
 static bool StandbyModeRequested = false;
@@ -728,8 +730,10 @@ static bool holdingAllSlots = false;
 
 static void readRecoveryCommandFile(void);
 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
-static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
+static bool recoveryStopsHere(XLogRecord *record, bool *includeThis, bool *delayThis);
 static void recoveryPausesHere(void);
+static void recoveryApplyDelay(void);
+static bool SetRecoveryDelayUntilTime(TimestampTz xtime);
 static void SetLatestXTime(TimestampTz xtime);
 static void SetCurrentChunkStartTime(TimestampTz xtime);
 static void CheckRequiredParameterValues(void);
@@ -5476,6 +5480,19 @@ readRecoveryCommandFile(void)
                    (errmsg_internal("trigger_file = '%s'",
                                     TriggerFile)));
        }
+       else if (strcmp(item->name, "min_recovery_apply_delay") == 0)
+       {
+           const char *hintmsg;
+
+           if (!parse_int(item->value, &min_recovery_apply_delay, GUC_UNIT_MS,
+                   &hintmsg))
+               ereport(ERROR,
+                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                        errmsg("parameter \"%s\" requires a temporal value", "min_recovery_apply_delay"),
+                        hintmsg ? errhint("%s", _(hintmsg)) : 0));
+           ereport(DEBUG2,
+                   (errmsg("min_recovery_apply_delay = '%s'", item->value)));
+       }
        else
            ereport(FATAL,
                    (errmsg("unrecognized recovery parameter \"%s\"",
@@ -5625,10 +5642,11 @@ exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
  * We also track the timestamp of the latest applied COMMIT/ABORT
  * record in XLogCtl->recoveryLastXTime, for logging purposes.
  * Also, some information is saved in recoveryStopXid et al for use in
- * annotating the new timeline's history file.
+ * annotating the new timeline's history file; and recoveryDelayUntilTime
+ * is updated, for time-delayed standbys.
  */
 static bool
-recoveryStopsHere(XLogRecord *record, bool *includeThis)
+recoveryStopsHere(XLogRecord *record, bool *includeThis, bool *delayThis)
 {
    bool        stopsHere;
    uint8       record_info;
@@ -5645,6 +5663,8 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
 
        recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
        recordXtime = recordXactCommitData->xact_time;
+
+       *delayThis = SetRecoveryDelayUntilTime(recordXactCommitData->xact_time);
    }
    else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
    {
@@ -5652,6 +5672,8 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
 
        recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
        recordXtime = recordXactCommitData->xact_time;
+
+       *delayThis = SetRecoveryDelayUntilTime(recordXactCommitData->xact_time);
    }
    else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
    {
@@ -5659,6 +5681,13 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
 
        recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
        recordXtime = recordXactAbortData->xact_time;
+
+       /*
+        * We deliberately choose not to delay aborts since they have no
+        * effect on MVCC. We already allow replay of records that don't
+        * have a timestamp, so there is already opportunity for issues
+        * caused by early conflicts on standbys.
+        */
    }
    else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
    {
@@ -5667,6 +5696,8 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
        recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
        recordXtime = recordRestorePointData->rp_time;
        strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
+
+       *delayThis = SetRecoveryDelayUntilTime(recordRestorePointData->rp_time);
    }
    else
        return false;
@@ -5833,6 +5864,66 @@ SetRecoveryPause(bool recoveryPause)
    SpinLockRelease(&xlogctl->info_lck);
 }
 
+/*
+ * Set recoveryDelayUntilTime = xtime + min_recovery_apply_delay (in ms).
+ * Returns false, changing nothing, when no delay is configured.
+ */
+static bool
+SetRecoveryDelayUntilTime(TimestampTz xtime)
+{
+   if (min_recovery_apply_delay == 0)
+       return false;
+   recoveryDelayUntilTime =
+       TimestampTzPlusMilliseconds(xtime, min_recovery_apply_delay);
+   return true;
+}
+/*
+ * When min_recovery_apply_delay is set, we wait long enough to make sure
+ * certain record types are applied at least that interval behind the master.
+ * See recoveryStopsHere().
+ *
+ * Note that the delay is calculated between the WAL record log time and
+ * the current time on standby. We would prefer to keep track of when this
+ * standby received each WAL record, which would allow a more consistent
+ * approach and one not affected by time synchronisation issues, but that
+ * is significantly more effort and complexity for little actual gain in
+ * usability.
+ */
+static void
+recoveryApplyDelay(void)
+{
+   while (true)
+   {
+       long    secs;
+       int     microsecs;
+
+       ResetLatch(&XLogCtl->recoveryWakeupLatch);
+
+       /* might change the trigger file's location */
+       HandleStartupProcInterrupts();
+
+       if (CheckForStandbyTrigger())
+           break;
+
+       /* Time left until recoveryDelayUntilTime; stop once none remains. */
+       TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
+                           &secs, &microsecs);
+       if (secs <= 0 && microsecs <= 0)
+           break;
+
+       elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
+           secs, microsecs / 1000);
+
+       /*
+        * Round the timeout up to a whole millisecond.  Truncating requests a
+        * 0 ms wait for the final sub-millisecond, so the loop would busy-wait.
+        */
+       WaitLatch(&XLogCtl->recoveryWakeupLatch,
+                   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                   secs * 1000L + (microsecs + 999) / 1000);
+   }
+}
+
 /*
  * Save timestamp of latest processed commit/abort record.
  *
@@ -6660,6 +6751,7 @@ StartupXLOG(void)
        {
            bool        recoveryContinue = true;
            bool        recoveryApply = true;
+           bool        recoveryDelay = false;
            ErrorContextCallback errcallback;
            TimestampTz xtime;
 
@@ -6719,7 +6811,7 @@ StartupXLOG(void)
                /*
                 * Have we reached our recovery target?
                 */
-               if (recoveryStopsHere(record, &recoveryApply))
+               if (recoveryStopsHere(record, &recoveryApply, &recoveryDelay))
                {
                    if (recoveryPauseAtTarget)
                    {
@@ -6734,6 +6826,25 @@ StartupXLOG(void)
                        break;
                }
 
+               /*
+                * If we've been asked to lag the master, wait on
+                * latch until enough time has passed.
+                */
+               if (recoveryDelay)
+               {
+                   recoveryApplyDelay();
+
+                   /*
+                    * We test for paused recovery again here. If
+                    * user sets delayed apply, it may be because
+                    * they expect to pause recovery in case of
+                    * problems, so we must test again here otherwise
+                    * pausing during the delay-wait wouldn't work.
+                    */
+                   if (xlogctl->recoveryPause)
+                       recoveryPausesHere();
+               }
+
                /* Setup error traceback support for ereport() */
                errcallback.callback = rm_redo_error_callback;
                errcallback.arg = (void *) record;