Fix for 0000483: online-recovery is blocked after a child process exits ...
author: Muhammad Usama <m.usama@gmail.com>
Thu, 8 Aug 2019 13:50:51 +0000 (18:50 +0500)
committer: Muhammad Usama <m.usama@gmail.com>
Thu, 8 Aug 2019 13:50:51 +0000 (18:50 +0500)
The problem is that if some child process exits abnormally during the second
stage of online recovery, the connection counter that keeps track of exiting
processes does not get decremented, and Pgpool-II keeps waiting for the exit of
the already exited process. Eventually, the recovery fails after
client_idle_limit_in_recovery expires.

The fix for this issue is to set the connection counter to zero when
client_idle_limit_in_recovery is enabled and its value is less than
recovery_timeout, since all clients must have been kicked out by the time
client_idle_limit_in_recovery expires.

A similar fix was already committed as part of bug 431 by Tatsuo Ishii, so this
commit basically imports the same logic into the watchdog function that
processes the remote online recovery requests.

Apart from the above-mentioned change, Hoshiai-san identified that the watchdog
IPC command timeout for the online recovery start functions executed through
watchdog is set to exactly the same value as recovery_timeout, which needs to be
increased to make the solution work correctly.

src/include/pool.h
src/pcp_con/recovery.c
src/watchdog/watchdog.c
src/watchdog/wd_commands.c

index 3569467d49b548983437ddb757e748a4bf8978a6..8995b961803c86a6f767c6e9fc416c984e674eb4 100644 (file)
@@ -780,7 +780,8 @@ extern void pool_ps_idle_display(POOL_CONNECTION_POOL * backend);
 /* recovery.c */
 extern void start_recovery(int recovery_node);
 extern void finish_recovery(void);
-extern int     wait_connection_closed(void);
+extern int wait_connection_closed(void);
+extern int ensure_conn_counter_validity(void);
 
 /* child.c */
 extern void cancel_request(CancelPacket * sp);
index 11597af2be4478031faf4a7b43b8b6c58f4a8b15..a7cc4c7abab8213ab7e04998f6f53eb13d0a0fe2 100644 (file)
@@ -483,7 +483,11 @@ wait_connection_closed(void)
        } while (i++ < WAIT_RETRY_COUNT);
        ereport(LOG,
                        (errmsg("wait_connection_closed: existing connections did not close in %d sec.", pool_config->recovery_timeout)));
+       return ensure_conn_counter_validity();
+}
 
+int ensure_conn_counter_validity(void)
+{
        /*
         * recovery_timeout was expired. Before returning with failure status,
         * let's check if this is caused by the malformed conn_counter. If a child
index ebf263f2519fd775c89f7596100b07df20097523..403a641e25ac3cad5fe3e06306493fb8eed527be 100644 (file)
@@ -6465,7 +6465,10 @@ process_remote_online_recovery_command(WatchdogNode * wdNode, WDPacketData * pkt
                                }
                                else if (pool_config->recovery_timeout <= 0)
                                {
-                                       reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
+                                       if (ensure_conn_counter_validity() == 0)
+                                               reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
+                                       else
+                                               reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
                                }
                                else
                                {
@@ -6589,7 +6592,11 @@ process_wd_command_timer_event(bool timer_expired, WDFunctionCommandData * wd_fu
                                WDPacketData emptyPkt;
 
                                emptyPkt.command_id = wd_func_command->commandID;
-                               reply_with_minimal_message(wd_func_command->wdNode, WD_REJECT_MESSAGE, &emptyPkt);
+
+                               if (ensure_conn_counter_validity() == 0)
+                                       reply_with_minimal_message(wd_func_command->wdNode, WD_ACCEPT_MESSAGE, &emptyPkt);
+                               else
+                                       reply_with_minimal_message(wd_func_command->wdNode, WD_REJECT_MESSAGE, &emptyPkt);
                                return true;
                        }
                        return false;
index 9b2e4791dad30fdcbdc5110aa38fb240c1a8cc56..82d91809b5485745cf4cbecd596934ce9465004f 100644 (file)
@@ -594,7 +594,7 @@ wd_start_recovery(void)
                                                                                                 shared_key ? *shared_key : 0, pool_config->wd_authkey);
 
        WDIPCCmdResult *result = issue_command_to_watchdog(WD_IPC_ONLINE_RECOVERY_COMMAND,
-                                                                                                          pool_config->recovery_timeout,
+                                                                                                          pool_config->recovery_timeout + WD_DEFAULT_IPC_COMMAND_TIMEOUT,
                                                                                                           func, strlen(func), true);
 
        pfree(func);
@@ -707,7 +707,7 @@ wd_send_failover_func_status_command(bool start)
        char       *json_data = get_wd_failover_state_json(start);
 
        WDIPCCmdResult *result = issue_command_to_watchdog(WD_FAILOVER_INDICATION
-                                                                                                          ,pool_config->recovery_timeout,
+                                                                                                          ,WD_DEFAULT_IPC_COMMAND_TIMEOUT,
                                                                                                           json_data, strlen(json_data), true);
 
        pfree(json_data);