Fix the case where all backends are down and then one node is attached.
author: Tatsuo Ishii <ishii@postgresql.org>
Wed, 21 Sep 2016 00:26:46 +0000 (09:26 +0900)
committer: Tatsuo Ishii <ishii@postgresql.org>
Wed, 21 Sep 2016 07:19:32 +0000 (16:19 +0900)
When all backends are down, no connection is accepted. Then one
PostgreSQL node comes up and is attached using pcp_attach_node, which
finishes successfully. However, when a new connection arrives, it is
still refused, because the pgpool child process looks at its cached
status, in which the recovered node is still marked as down if the
mode is streaming replication mode (native replication and other modes
are fine). The solution is to force a restart of all pgpool child
processes if all nodes were down.

Per bug 248.

main.c

diff --git a/main.c b/main.c
index fa202db5f04365aeed967f2b753d9db9f348f125..bedb0aacf8f6a27aa2513a7345b519d6da4385f9 100644 (file)
--- a/main.c
+++ b/main.c
@@ -1823,6 +1823,7 @@ static void failover(void)
        int status;
        int sts;
        bool need_to_restart_pcp = false;
+       bool all_backend_down = true;
 
        pool_debug("failover_handler called");
 
@@ -1933,6 +1934,19 @@ static void failover(void)
                                         BACKEND_INFO(node_id).backend_hostname,
                                         BACKEND_INFO(node_id).backend_port);
 
+                       /* Check to see if all backends are down */
+                       for (i=0;i<NUM_BACKENDS;i++)
+                       {
+                               if (BACKEND_INFO(i).backend_status != CON_DOWN &&
+                                       BACKEND_INFO(i).backend_status != CON_UNUSED)
+                               {
+                                       pool_log("Node %d is not down (status: %d)",
+                                                        i, BACKEND_INFO(i).backend_status);
+                                       all_backend_down = false;
+                                       break;
+                               }
+                       }
+
                        BACKEND_INFO(node_id).backend_status = CON_CONNECT_WAIT;        /* unset down status */
 
                        /* wait for failback command lock or to be lock holder */
@@ -2066,10 +2080,21 @@ static void failover(void)
                * attached node, but load balanced node is not changed until this
                * session ends, so it's harmless anyway.
                */
-               if (MASTER_SLAVE && !strcmp(pool_config->master_slave_sub_mode, MODE_STREAMREP) &&
-                       reqkind == NODE_UP_REQUEST)
+
+               /*
+               * On 2016/9/21 Tatsuo Ishii says: this judgment is not sufficient if
+                * all backends were down. Child process has local status in which all
+                * backends are down. In this case even if new connection arrives from
+                * frontend, the child will not accept it because the local status
+                * shows all backends are down. For this purpose we refer to
+                * "all_backend_down" variable, which was set before updating backend status.
+                *
+                * See bug 248 for more details.
+                */
+
+               if (STREAM && reqkind == NODE_UP_REQUEST &&     all_backend_down == false)
                {
-                       pool_log("Do not restart children because we are failbacking node id %d host%s port:%d and we are in streaming replication mode", node_id,
+                       pool_log("Do not restart children because we are failbacking node id %d host: %s port: %d and we are in streaming replication mode and not all backends were down", node_id,
                                         BACKEND_INFO(node_id).backend_hostname,
                                         BACKEND_INFO(node_id).backend_port);