From: Muhammad Usama <m.usama@gmail.com>
Date: Wed, 7 Aug 2019 15:22:01 +0000 (+0500)
Subject: Fix for no primary on standby pgpool when primary is quarantined on master
X-Git-Tag: V4_1_0_BETA1~46
X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=3922c12c1f8efbc1b5f2e7def1e0ff921aafb989;p=pgpool2.git

Fix for no primary on standby pgpool when primary is quarantined on master

The master watchdog Pgpool sends primary_node_id = -1 in the backend status sync
message if the primary node is quarantined on it. So the standby watchdog Pgpool
must not update its primary_node_id when the primary backend node id in the sync
message is invalid_node_id (-1) while the same sync message reports the
backend status of the current primary node as "NOT DOWN".

The issue was reported by "Tatsuo Ishii <ishii@sraoss.co.jp>" and fixed by me.
---

diff --git a/src/main/pgpool_main.c b/src/main/pgpool_main.c
index 371aa69cf..1a03927bc 100644
--- a/src/main/pgpool_main.c
+++ b/src/main/pgpool_main.c
@@ -4151,17 +4151,6 @@ sync_backend_from_watchdog(void)
 	ereport(DEBUG1,
 			(errmsg("primary node on master watchdog node \"%s\" is %d", backendStatus->nodeName, backendStatus->primary_node_id)));
 
-	if (Req_info->primary_node_id != backendStatus->primary_node_id)
-	{
-		/* Do not produce this log message if we are starting up the Pgpool-II */
-		if (processState != INITIALIZING)
-			ereport(LOG,
-					(errmsg("primary node:%d on master watchdog node \"%s\" is different from local primary node:%d",
-							backendStatus->primary_node_id, backendStatus->nodeName, Req_info->primary_node_id)));
-
-		Req_info->primary_node_id = backendStatus->primary_node_id;
-		primary_changed = true;
-	}
 
 	/*
 	 * update the local backend status Also remove quarantine flags
@@ -4204,6 +4193,34 @@ sync_backend_from_watchdog(void)
 			}
 		}
 	}
+
+	if (Req_info->primary_node_id != backendStatus->primary_node_id)
+	{
+		/* Do not produce this log message if we are starting up the Pgpool-II */
+		if (processState != INITIALIZING)
+			ereport(LOG,
+					(errmsg("primary node:%d on master watchdog node \"%s\" is different from local primary node:%d",
+							backendStatus->primary_node_id, backendStatus->nodeName, Req_info->primary_node_id)));
+		/*
+		 * The master node returns primary_node_id = -1 when the primary
+		 * node is in quarantine state on the master.
+		 * So we will not update our primary node id when the status of the current primary node
+		 * is not CON_DOWN while the primary_node_id sent by the master watchdog node is -1
+		 */
+		if (backendStatus->primary_node_id == -1 && BACKEND_INFO(Req_info->primary_node_id).backend_status != CON_DOWN)
+		{
+			ereport(LOG,
+                (errmsg("primary node:%d on master watchdog node \"%s\" seems to be quarantined",
+					Req_info->primary_node_id, backendStatus->nodeName),
+                errdetail("keeping the current primary")));
+		}
+		else
+		{
+			Req_info->primary_node_id = backendStatus->primary_node_id;
+			primary_changed = true;
+		}
+	}
+
 	pfree(backendStatus);
 
 	if (reload_maste_node_id)
diff --git a/src/watchdog/wd_json_data.c b/src/watchdog/wd_json_data.c
index 7f3cb6625..4fa8dd77a 100644
--- a/src/watchdog/wd_json_data.c
+++ b/src/watchdog/wd_json_data.c
@@ -324,7 +324,7 @@ get_backend_node_status_json(WatchdogNode * wdNode)
 		if (backend_status == CON_DOWN && pool_config->backend_desc->backend_info[i].quarantine)
 		{
 			/*
-			 * since quarantine nodes are not cluster wide so send CON_WATI
+			 * since quarantine nodes are not cluster wide so send CON_CONNECT_WAIT
 			 * status for quarantine nodes
 			 */
 			backend_status = CON_CONNECT_WAIT;