From c402060814ba9737e9572ba07eef1be7664c9679 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Thu, 8 Aug 2019 18:50:51 +0500 Subject: [PATCH] Fix for 0000483: online-recovery is blocked after a child process exits ... The problem is that if a child process exits abnormally during the second stage of online recovery, the connection counter that keeps track of exiting processes is not decremented, and Pgpool-II keeps waiting for the exit of the already-exited process. Eventually, the recovery fails after client_idle_limit_in_recovery expires. The fix for this issue is to set the connection counter to zero when client_idle_limit_in_recovery is enabled and is smaller than recovery_timeout, since all clients must have been kicked out by the time client_idle_limit_in_recovery expires. A similar fix was already committed as part of bug 431 by Tatsuo Ishii, so this commit basically imports the same logic into the watchdog function that processes remote online recovery requests. Apart from the above-mentioned change, Hoshiai San identified that the watchdog IPC command timeout for the online recovery start functions executed through the watchdog is set to exactly the same value as recovery_timeout, and it needs to be increased for the solution to work correctly. 
--- src/include/pool.h | 3 ++- src/pcp_con/recovery.c | 4 ++++ src/watchdog/watchdog.c | 11 +++++++++-- src/watchdog/wd_commands.c | 4 ++-- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/include/pool.h b/src/include/pool.h index 3569467d4..8995b9618 100644 --- a/src/include/pool.h +++ b/src/include/pool.h @@ -780,7 +780,8 @@ extern void pool_ps_idle_display(POOL_CONNECTION_POOL * backend); /* recovery.c */ extern void start_recovery(int recovery_node); extern void finish_recovery(void); -extern int wait_connection_closed(void); +extern int wait_connection_closed(void); +extern int ensure_conn_counter_validity(void); /* child.c */ extern void cancel_request(CancelPacket * sp); diff --git a/src/pcp_con/recovery.c b/src/pcp_con/recovery.c index 11597af2b..a7cc4c7ab 100644 --- a/src/pcp_con/recovery.c +++ b/src/pcp_con/recovery.c @@ -483,7 +483,11 @@ wait_connection_closed(void) } while (i++ < WAIT_RETRY_COUNT); ereport(LOG, (errmsg("wait_connection_closed: existing connections did not close in %d sec.", pool_config->recovery_timeout))); + return ensure_conn_counter_validity(); +} +int ensure_conn_counter_validity(void) +{ /* * recovery_timeout was expired. Before returning with failure status, * let's check if this is caused by the malformed conn_counter. 
If a child diff --git a/src/watchdog/watchdog.c b/src/watchdog/watchdog.c index ebf263f25..403a641e2 100644 --- a/src/watchdog/watchdog.c +++ b/src/watchdog/watchdog.c @@ -6465,7 +6465,10 @@ process_remote_online_recovery_command(WatchdogNode * wdNode, WDPacketData * pkt } else if (pool_config->recovery_timeout <= 0) { - reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt); + if (ensure_conn_counter_validity() == 0) + reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt); + else + reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt); } else { @@ -6589,7 +6592,11 @@ process_wd_command_timer_event(bool timer_expired, WDFunctionCommandData * wd_fu WDPacketData emptyPkt; emptyPkt.command_id = wd_func_command->commandID; - reply_with_minimal_message(wd_func_command->wdNode, WD_REJECT_MESSAGE, &emptyPkt); + + if (ensure_conn_counter_validity() == 0) + reply_with_minimal_message(wd_func_command->wdNode, WD_ACCEPT_MESSAGE, &emptyPkt); + else + reply_with_minimal_message(wd_func_command->wdNode, WD_REJECT_MESSAGE, &emptyPkt); return true; } return false; diff --git a/src/watchdog/wd_commands.c b/src/watchdog/wd_commands.c index 9b2e4791d..82d91809b 100644 --- a/src/watchdog/wd_commands.c +++ b/src/watchdog/wd_commands.c @@ -594,7 +594,7 @@ wd_start_recovery(void) shared_key ? *shared_key : 0, pool_config->wd_authkey); WDIPCCmdResult *result = issue_command_to_watchdog(WD_IPC_ONLINE_RECOVERY_COMMAND, - pool_config->recovery_timeout, + pool_config->recovery_timeout + WD_DEFAULT_IPC_COMMAND_TIMEOUT, func, strlen(func), true); pfree(func); @@ -707,7 +707,7 @@ wd_send_failover_func_status_command(bool start) char *json_data = get_wd_failover_state_json(start); WDIPCCmdResult *result = issue_command_to_watchdog(WD_FAILOVER_INDICATION - ,pool_config->recovery_timeout, + ,WD_DEFAULT_IPC_COMMAND_TIMEOUT, json_data, strlen(json_data), true); pfree(json_data); -- 2.39.5