#define MAX_SECS_WAIT_FOR_REPLY_FROM_NODE 5 /* time in seconds to wait for
* the reply from remote
* watchdog node */
+
+#define MAX_ALLOWED_SEND_FAILURES 3 /* number of message send failures
+ * that can be tolerated
+ */
+#define MAX_ALLOWED_BEACON_REPLY_MISS 3 /* number of missed beacon message replies
+ * that can be tolerated
+ */
+
+
#define FAILOVER_COMMAND_FINISH_TIMEOUT 15 /* timeout in seconds to wait
* for Pgpool-II to build
* consensus for failover */
+#define MIN_SECS_BETWEEN_BROADCAST_SRV_MSG 5 /* minimum amount of seconds to wait
+ * before broadcasting the same cluster
+ * service message */
+
#define WD_NO_MESSAGE 0
#define WD_ADD_NODE_MESSAGE 'A'
#define CLUSTER_IAM_RESIGNING_FROM_MASTER 'R'
#define CLUSTER_NODE_INVALID_VERSION 'V'
#define CLUSTER_NODE_REQUIRE_TO_RELOAD 'I'
+#define CLUSTER_NODE_APPEARING_LOST 'Y'
+#define CLUSTER_NODE_APPEARING_FOUND 'Z'
#define WD_MASTER_NODE getMasterWatchdogNode()
"NODE CONNECTION LOST",
"NODE CONNECTION FOUND",
"CLUSTER QUORUM STATUS CHANGED",
- "NODE REQUIRE TO RELOAD STATE"
+ "NODE REQUIRE TO RELOAD STATE",
+ "I AM APPEARING LOST"
};
char *wd_state_names[] = {
"LOST",
"IN NETWORK TROUBLE",
"SHUTDOWN",
- "ADD MESSAGE SENT"
+ "ADD MESSAGE SENT",
+ "NETWORK ISOLATION"
+};
+
+char *wd_node_lost_reasons[] = {
+ "UNKNOWN REASON",
+ "REPORTED BY LIFECHECK",
+ "SEND MESSAGE FAILURES",
+ "MISSING BEACON REPLIES",
+ "RECEIVE TIMEOUT",
+ "NOT REACHABLE",
+ "SHUTDOWN"
};
typedef struct WDPacketData
bool clusterInitialized;
bool ipc_auth_needed;
int current_failover_id;
+ struct timeval last_bcast_srv_msg_time; /* timestamp when last packet was
+ * broadcast by the local node */
+ char last_bcast_srv_msg;
+
List *unidentified_socks;
List *notify_clients;
List *ipc_command_socks;
int state;
} WDFailoverObject;
+#ifdef WATCHDOG_DEBUG_OPTS
+#if WATCHDOG_DEBUG_OPTS > 0
+#define WATCHDOG_DEBUG
+#endif
+#endif
+
+static bool check_debug_request_do_not_send_beacon(void);
+static bool check_debug_request_do_not_reply_beacon(void);
+static bool check_debug_request_kill_all_communication(void);
+static bool check_debug_request_kill_all_receivers(void);
+static bool check_debug_request_kill_all_senders(void);
+
+
+#ifdef WATCHDOG_DEBUG
+static void load_watchdog_debug_test_option(void);
+#endif
static void process_remote_failover_command_on_coordinator(WatchdogNode * wdNode, WDPacketData * pkt);
static WDFailoverObject * get_failover_object(POOL_REQUEST_KIND reqKind, int nodesCount, int *nodeList);
static unsigned int get_next_commandID(void);
static WatchdogNode * parse_node_info_message(WDPacketData * pkt, char **authkey);
static void update_quorum_status(void);
-static int get_mimimum_remote_nodes_required_for_quorum(void);
+static int get_minimum_remote_nodes_required_for_quorum(void);
static int get_minimum_votes_to_resolve_consensus(void);
static bool write_packet_to_socket(int sock, WDPacketData * pkt, bool ipcPacket);
static int watchdog_state_machine_loading(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
static int watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
static int watchdog_state_machine_nw_error(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
+static int watchdog_state_machine_nw_isolation(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
static int I_am_master_and_cluser_in_split_brain(WatchdogNode * otherMasterNode);
static void handle_split_brain(WatchdogNode * otherMasterNode, WDPacketData * pkt);
static void clear_standby_nodes_list(void);
static int standby_node_left_cluster(WatchdogNode * wdNode);
static int standby_node_join_cluster(WatchdogNode * wdNode);
+static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear);
/* global variables */
wd_cluster g_cluster;
g_timeout_sec = 0;
}
}
+#ifdef WATCHDOG_DEBUG
+ load_watchdog_debug_test_option();
+#endif
if (select_ret > 0)
{
int processed_fds = 0;
if (pkt)
{
- watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
- /* since a packet is received reset last sent time */
- wdNode->last_sent_time.tv_sec = 0;
- wdNode->last_sent_time.tv_usec = 0;
+ if (check_debug_request_kill_all_communication() == false &&
+ check_debug_request_kill_all_receivers() == false)
+ {
+ watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
+ /* since a packet is received reset last sent time */
+ wdNode->last_sent_time.tv_sec = 0;
+ wdNode->last_sent_time.tv_usec = 0;
+ }
free_packet(pkt);
}
else
if (pkt)
{
- watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
- /* since a packet is received reset last sent time */
- wdNode->last_sent_time.tv_sec = 0;
- wdNode->last_sent_time.tv_usec = 0;
+ if (check_debug_request_kill_all_communication() == false &&
+ check_debug_request_kill_all_receivers() == false)
+ {
+ watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
+ /* since a packet is received reset last sent time */
+ wdNode->last_sent_time.tv_sec = 0;
+ wdNode->last_sent_time.tv_usec = 0;
+ }
free_packet(pkt);
}
else
pkt = read_packet_of_type(conn, WD_ADD_NODE_MESSAGE);
if (pkt)
{
+ struct timeval previous_startup_time;
char *authkey = NULL;
WatchdogNode *tempNode = parse_node_info_message(pkt, &authkey);
/* verify this node */
if (authenticated)
{
+ WD_STATES oldNodeState = WD_DEAD;
for (i = 0; i < g_cluster.remoteNodeCount; i++)
{
wdNode = &(g_cluster.remoteNodes[i]);
{
/* We have found the match */
found = true;
+ previous_startup_time.tv_sec = wdNode->startup_time.tv_sec;
+ oldNodeState = wdNode->state;
+
close_socket_connection(&wdNode->server_socket);
strlcpy(wdNode->delegate_ip, tempNode->delegate_ip, WD_MAX_HOST_NAMELEN);
strlcpy(wdNode->nodeName, tempNode->nodeName, WD_MAX_HOST_NAMELEN);
wdNode->wd_data_major_version,
wdNode->wd_data_minor_version)));
- watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
+ if (oldNodeState == WD_SHUTDOWN)
+ {
+ ereport(LOG,
+ (errmsg("The newly joined node:\"%s\" had left the cluster because it was shutdown",wdNode->nodeName)));
+ watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
+
+ }
+ else if (oldNodeState == WD_LOST)
+ {
+ ereport(LOG,
+ (errmsg("The newly joined node:\"%s\" had left the cluster because it was lost",wdNode->nodeName),
+ errdetail("lost reason was \"%s\" and startup time diff = %d",
+ wd_node_lost_reasons[wdNode->node_lost_reason],
+ abs((int)(previous_startup_time.tv_sec - wdNode->startup_time.tv_sec)))));
+
+ if (abs((int)(previous_startup_time.tv_sec - wdNode->startup_time.tv_sec)) <= 2 &&
+ wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
+ {
+ ereport(LOG,
+ (errmsg("node:\"%s\" was reported lost by the lifecheck process",wdNode->nodeName),
+ errdetail("only lifecheck process can mark this node alive again")));
+ /* restore the node's lost state */
+ wdNode->state = oldNodeState;
+ }
+ else
+ watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
+
+ }
+
}
else
ereport(NOTICE,
(errmsg("not replying to IPC, Invalid IPC command.")));
return false;
}
+ /* DEBUG AID */
+ if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE &&
+ (check_debug_request_kill_all_senders() ||
+ check_debug_request_kill_all_communication()))
+ return false;
+
return write_packet_to_socket(ipcCommand->sourceIPCSocket, &pkt, true);
}
json_value_free(root);
ereport(NOTICE,
(errmsg("failed to process get local variable IPC command"),
- errdetail("unable to parse json data")));
+ errdetail("unable to parse JSON data")));
return IPC_CMD_ERROR;
}
{
ereport(NOTICE,
(errmsg("failed to process NODE STATE CHANGE IPC command"),
- errdetail("unable to parse json data")));
+ errdetail("unable to parse JSON data")));
return IPC_CMD_ERROR;
}
if (wdNode == g_cluster.localNode)
watchdog_state_machine(WD_EVENT_LOCAL_NODE_LOST, wdNode, NULL, NULL);
else
+ {
+ wdNode->node_lost_reason = NODE_LOST_BY_LIFECHECK;
watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
+ }
}
else if (nodeStatus == WD_LIFECHECK_NODE_STATUS_ALIVE)
{
BACKEND_INFO(node_id).role == ROLE_PRIMARY)
{
ereport(LOG,
- (errmsg("We are not able to build consensus for our primary node failover request, got %d votesonly for failover request ID:%d", failoverObj->request_count, failoverObj->failoverID),
+ (errmsg("We are not able to build consensus for our primary node failover request, got %d votes only for failover request ID:%d", failoverObj->request_count, failoverObj->failoverID),
errdetail("resigning from the coordinator")));
need_to_resign = true;
}
{
ereport(LOG,
(errmsg("unable to process failover indication"),
- errdetail("invalid json data in command packet")));
+ errdetail("invalid JSON data in command packet")));
res = FAILOVER_RES_INVALID_FUNCTION;
}
if (root)
{
ereport(LOG,
(errmsg("unable to process failover indication"),
- errdetail("invalid json data in command packet")));
+ errdetail("invalid JSON data in command packet")));
res = FAILOVER_RES_INVALID_FUNCTION;
}
else if (failoverState == 0) /* start */
}
else if (get_local_node_state() == WD_STANDBY)
{
- /* The node might be performing the locl quarantine opetaion */
+ /* The node might be performing the local quarantine operation */
ereport(DEBUG1,
(errmsg("main process is starting the local quarantine operation")));
return FAILOVER_RES_PROCEED;
}
else if (get_local_node_state() == WD_STANDBY)
{
- /* The node might be performing the locl quarantine opetaion */
+ /* The node might be performing the local quarantine operation */
ereport(DEBUG1,
(errmsg("main process is ending the local quarantine operation")));
return FAILOVER_RES_PROCEED;
{
ereport(DEBUG1,
(errmsg("error in outbound connection to %s:%d ", wdNode->hostname, wdNode->wd_port),
- errdetail("getsockopt faile with error \"%s\"", strerror(errno))));
+ errdetail("getsockopt failed with error \"%s\"", strerror(errno))));
close_socket_connection(&wdNode->client_socket);
wdNode->client_socket.sock_state = WD_SOCK_ERROR;
else
{
ereport(LOG,
- (errmsg("master/coordinator node \"%s\" decided to resigning from master, probably because of split-brain",
+ (errmsg("master/coordinator node \"%s\" decided to resign from master, probably because of split-brain",
wdNode->nodeName),
- errdetail("but it was not our coordinator/master anyway. ignoring the message")));
+ errdetail("It was not our coordinator/master anyway. ignoring the message")));
}
}
break;
else
{
ereport(LOG,
- (errmsg("master/coordinator node \"%s\" decided to resigning from master, probably because of split-brain",
+ (errmsg("master/coordinator node \"%s\" decided to resign from master, probably because of split-brain",
wdNode->nodeName),
errdetail("but it was not our coordinator/master anyway. ignoring the message")));
}
}
break;
+ case CLUSTER_NODE_APPEARING_LOST:
+ {
+ ereport(LOG,
+ (errmsg("remote node \"%s\" is reporting that it has lost us",
+ wdNode->nodeName)));
+ wdNode->has_lost_us = true;
+ watchdog_state_machine(WD_EVENT_I_AM_APPEARING_LOST, wdNode, NULL, NULL);
+ }
+ break;
+
+ case CLUSTER_NODE_APPEARING_FOUND:
+ {
+ ereport(LOG,
+ (errmsg("remote node \"%s\" is reporting that it has found us again",
+ wdNode->nodeName)));
+ wdNode->has_lost_us = false;
+ watchdog_state_machine(WD_EVENT_I_AM_APPEARING_FOUND, wdNode, NULL, NULL);
+ }
+ break;
+
case CLUSTER_NODE_INVALID_VERSION:
{
/*
* this should never happen means something is seriously wrong
*/
ereport(FATAL,
- (errmsg("\"%s\" node has found serious issues in our watchdog messages",
+ (return_code(POOL_EXIT_FATAL),
+ errmsg("\"%s\" node has found serious issues in our watchdog messages",
wdNode->nodeName),
errdetail("shutting down")));
}
}
break;
- case WD_EVENT_REMOTE_NODE_FOUND:
- {
- ereport(LOG,
- (errmsg("remote node \"%s\" became reachable again", wdNode->nodeName),
- errdetail("requesting the node info")));
- send_message_of_type(wdNode, WD_REQ_INFO_MESSAGE, NULL);
- break;
- }
- break;
-
case WD_ADD_NODE_MESSAGE:
case WD_REQ_INFO_MESSAGE:
replyPkt = get_mynode_info_message(pkt);
{
standby_node_left_cluster(wdNode);
}
+ if (oldNodeState == WD_LOST)
+ {
+ /*
+ * We have received a message from a lost node;
+ * add it back to the cluster if it was not marked
+ * lost by life-check.
+ * A node lost by the life-check process can only be
+ * added back when we get an alive notification for
+ * the node from life-check.
+ */
+ ereport(LOG,
+ (errmsg("we have received the NODE INFO message from the node:\"%s\" that was lost",wdNode->nodeName),
+ errdetail("we had lost this node because of \"%s\"",wd_node_lost_reasons[wdNode->node_lost_reason])));
+
+ if (wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
+ {
+ ereport(LOG,
+ (errmsg("node:\"%s\" was reported lost by the lifecheck process",wdNode->nodeName),
+ errdetail("only life-check process can mark this node alive again")));
+ /* restore the node's lost state */
+ wdNode->state = oldNodeState;
+ }
+ else
+ {
+ watchdog_state_machine(WD_EVENT_REMOTE_NODE_FOUND, wdNode, NULL, NULL);
+ }
+ }
}
break;
send_cluster_service_message(NULL, pkt, CLUSTER_IN_SPLIT_BRAIN);
}
- else
+ else if (WD_MASTER_NODE != NULL)
{
replyPkt = get_mynode_info_message(pkt);
beacon_message_received_from_node(wdNode, pkt);
}
+ /*
+ * if (WD_MASTER_NODE == NULL)
+ * do not reply to beacon if we are not connected to
+ * any master node
+ */
}
break;
static bool
send_message_to_connection(SocketConnection * conn, WDPacketData * pkt)
{
+ if (check_debug_request_kill_all_communication() == true ||
+ check_debug_request_kill_all_senders() == true)
+ return false;
+
if (conn->sock > 0 && conn->sock_state == WD_SOCK_CONNECTED)
{
if (write_packet_to_socket(conn->sock, pkt, false) == true)
}
if (ret)
{
+ /* reset the sending error counter */
+ wdNode->sending_failures_count = 0;
/* we only update the last sent time if reply for packet is expected */
switch (pkt->type)
{
}
else
{
+ wdNode->sending_failures_count++;
ereport(DEBUG1,
(errmsg("sending packet %c to node \"%s\" failed", pkt->type, wdNode->nodeName)));
}
if (nodeResult->cmdState == COMMAND_STATE_SENT)
{
ereport(LOG,
- (errmsg("remote node \"%s\" lost while ipc command was in progress ", wdLostNode->nodeName)));
+ (errmsg("remote node \"%s\" lost while IPC command was in progress ", wdLostNode->nodeName)));
/*
* since the node is lost and will be removed from the
(errmsg("remote node \"%s\" is not replying..", wdNode->nodeName),
errdetail("marking the node as lost")));
/* mark the node as lost */
+ wdNode->node_lost_reason = NODE_LOST_BY_RECEIVE_TIMEOUT;
watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
}
}
+ else if (wdNode->sending_failures_count > MAX_ALLOWED_SEND_FAILURES)
+ {
+ ereport(LOG,
+ (errmsg("not able to send messages to remote node \"%s\"",wdNode->nodeName),
+ errdetail("marking the node as lost")));
+ /* mark the node as lost */
+ wdNode->node_lost_reason = NODE_LOST_BY_SEND_FAILURE;
+ watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
+ }
+ else if (wdNode->missed_beacon_count > MAX_ALLOWED_BEACON_REPLY_MISS)
+ {
+ ereport(LOG,
+ (errmsg("remote node \"%s\" is not responding to our beacon messages",wdNode->nodeName),
+ errdetail("marking the node as lost")));
+ /* mark the node as lost */
+ wdNode->node_lost_reason = NODE_LOST_BY_MISSING_BEACON;
+ wdNode->missed_beacon_count = 0; /* Reset the counter */
+ watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
+ }
}
else
{
ereport(LOG,
(errmsg("remote node \"%s\" is not reachable", wdNode->nodeName),
errdetail("marking the node as lost")));
+ wdNode->node_lost_reason = NODE_LOST_BY_NOT_REACHABLE;
watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
}
}
for (i = 0; i < g_cluster.remoteNodeCount; i++)
{
WDCommandNodeResult *nodeRes = &clusterCommand->nodeResults[i];
-
- clear_command_node_result(nodeRes);
if (nodeRes->wdNode == wdNode)
{
nodeResult = nodeRes;
if (send_message_to_node(nodeResult->wdNode, pkt) == false)
{
ereport(DEBUG1,
- (errmsg("failed to send watchdog internla command packet %s", nodeResult->wdNode->nodeName),
+ (errmsg("failed to send watchdog internal command packet %s", nodeResult->wdNode->nodeName),
errdetail("saving the packet. will try to resend it if connection recovers")));
/* failed to send. May be try again later */
static bool
send_cluster_service_message(WatchdogNode * wdNode, WDPacketData * replyFor, char message)
{
+ /* Check if it's a broadcast message */
+ if (wdNode == NULL)
+ {
+ /* see if we have already broadcast a similar message recently */
+ if (message == g_cluster.last_bcast_srv_msg)
+ {
+ struct timeval currTime;
+ gettimeofday(&currTime, NULL);
+ int last_bcast_sec = WD_TIME_DIFF_SEC(currTime, g_cluster.last_bcast_srv_msg_time);
+ if (last_bcast_sec < MIN_SECS_BETWEEN_BROADCAST_SRV_MSG)
+ {
+ /*
+ * do not broadcast this message
+ * to prevent flooding
+ */
+ ereport(DEBUG4,
+ (errmsg("not broadcasting cluster service message %c to prevent flooding ",message),
+ errdetail("last time same message was sent %d seconds ago",last_bcast_sec)));
+ return true;
+ }
+ }
+ g_cluster.last_bcast_srv_msg = message;
+ gettimeofday(&g_cluster.last_bcast_srv_msg_time, NULL);
+ }
return reply_with_message(wdNode, WD_CLUSTER_SERVICE_MESSAGE, &message, 1, replyFor);
}
if (event == WD_EVENT_REMOTE_NODE_LOST)
{
- /* close all socket connections to the node */
- close_socket_connection(&wdNode->client_socket);
- close_socket_connection(&wdNode->server_socket);
if (wdNode->state == WD_SHUTDOWN)
{
wdNode->state = WD_LOST;
ereport(LOG,
(errmsg("remote node \"%s\" is lost", wdNode->nodeName)));
+ /* Inform the node that we consider it lost */
+ send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_APPEARING_LOST);
}
if (wdNode == WD_MASTER_NODE)
{
set_cluster_master_node(NULL);
}
+ /* close all socket connections to the node */
+ close_socket_connection(&wdNode->client_socket);
+ close_socket_connection(&wdNode->server_socket);
+
/* clear the wait timer on the node */
wdNode->last_sent_time.tv_sec = 0;
wdNode->last_sent_time.tv_usec = 0;
+ wdNode->sending_failures_count = 0;
node_lost_while_ipc_command(wdNode);
}
+ else if (event == WD_EVENT_REMOTE_NODE_FOUND)
+ {
+ ereport(LOG,
+ (errmsg("remote node \"%s\" became reachable again", wdNode->nodeName),
+ errdetail("requesting the node info")));
+ /*
+ * remove the lost state from the node
+ * and change it to joining for now
+ */
+ wdNode->node_lost_reason = NODE_LOST_UNKNOWN_REASON;
+ wdNode->state = WD_LOADING;
+ send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_APPEARING_FOUND);
+ }
else if (event == WD_EVENT_PACKET_RCV)
{
print_packet_node_info(pkt, wdNode, false);
if (pkt->type == WD_INFORM_I_AM_GOING_DOWN)
{
wdNode->state = WD_SHUTDOWN;
+ wdNode->node_lost_reason = NODE_LOST_SHUTDOWN;
return watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
}
if (any_interface_available() == false)
{
ereport(WARNING,
- (errmsg("network event has occured and all monitored interfaces are down"),
+ (errmsg("network event has occurred and all monitored interfaces are down"),
errdetail("changing the state to in network trouble")));
set_state(WD_IN_NW_TROUBLE);
else if (event == WD_EVENT_LOCAL_NODE_LOST)
{
ereport(WARNING,
- (errmsg("watchdog lifecheck reported, we are disconnected from the network"),
+ (errmsg("watchdog life-check reported, we are disconnected from the network"),
errdetail("changing the state to LOST")));
set_state(WD_LOST);
}
case WD_IN_NW_TROUBLE:
watchdog_state_machine_nw_error(event, wdNode, pkt, clusterCommand);
break;
+ case WD_NETWORK_ISOLATION:
+ watchdog_state_machine_nw_isolation(event, wdNode, pkt, clusterCommand);
+ break;
default:
/* Should never ever happen */
ereport(WARNING,
case WD_STAND_FOR_COORDINATOR_MESSAGE:
{
/*
- * We are loading but a note is already contesting
+ * We are loading but a node is already contesting
* for coordinator node well we can ignore it but
* then this could eventually mean a lower
* priority node can became a coordinator node. So
if (pkt->type == WD_ERROR_MESSAGE)
{
ereport(LOG,
- (errmsg("our stand for coordinator request is rejected by node \"%s\"", wdNode->nodeName)));
- set_state(WD_JOINING);
+ (errmsg("our stand for coordinator request is rejected by node \"%s\"",wdNode->nodeName),
+ errdetail("we might be in partial network isolation and cluster already have a valid master"),
+ errhint("please verify the watchdog life-check and network is working properly")));
+ set_state(WD_NETWORK_ISOLATION);
}
else if (pkt->type == WD_REJECT_MESSAGE)
{
send_cluster_command(NULL, WD_DECLARE_COORDINATOR_MESSAGE, 4);
set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
+ update_missed_beacon_count(NULL,true);
ereport(LOG,
(errmsg("I am announcing my self as master/coordinator watchdog node")));
else if (clusterCommand->commandPacket.type == WD_IAM_COORDINATOR_MESSAGE)
{
+ update_missed_beacon_count(clusterCommand,false);
+
if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED)
{
ereport(DEBUG1,
case WD_EVENT_TIMEOUT:
{
- send_cluster_command(NULL, WD_IAM_COORDINATOR_MESSAGE, 5);
+ if (check_debug_request_do_not_send_beacon() == false)
+ send_cluster_command(NULL, WD_IAM_COORDINATOR_MESSAGE, 5);
set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
}
break;
+ case WD_EVENT_I_AM_APPEARING_LOST:
+ {
+ /* The remote node has lost us; it would have already marked
+ * us as lost, so remove it from the standby list */
+ standby_node_left_cluster(wdNode);
+ }
+ break;
+
+ case WD_EVENT_I_AM_APPEARING_FOUND:
+ {
+ /* The remote node has found us again */
+ if (wdNode->wd_data_major_version >= 1 && wdNode->wd_data_minor_version >= 1)
+ {
+ /*
+ * Since data version 1.1 we support CLUSTER_NODE_REQUIRE_TO_RELOAD
+ * which makes the standby nodes re-send the join request to the master node
+ */
+ ereport(DEBUG1,
+ (errmsg("asking remote node \"%s\" to rejoin master", wdNode->nodeName),
+ errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION)));
+
+ send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_REQUIRE_TO_RELOAD);
+ }
+ else
+ {
+ /*
+ * The node is on older version
+ * So ask it to re-join the cluster
+ */
+ ereport(DEBUG1,
+ (errmsg("asking remote node \"%s\" to rejoin cluster", wdNode->nodeName),
+ errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION)));
+ send_cluster_service_message(wdNode, pkt, CLUSTER_NEEDS_ELECTION);
+ }
+ }
+ break;
+
case WD_EVENT_REMOTE_NODE_LOST:
{
standby_node_left_cluster(wdNode);
ereport(LOG,
(errmsg("remote node \"%s\" is reachable again", wdNode->nodeName),
errdetail("trying to add it back as a standby")));
+ wdNode->node_lost_reason = NODE_LOST_UNKNOWN_REASON;
/* If I am the cluster master. Ask for the node info and to re-send the join message */
send_message_of_type(wdNode, WD_REQ_INFO_MESSAGE, NULL);
if (wdNode->wd_data_major_version >= 1 && wdNode->wd_data_minor_version >= 1)
{
switch (pkt->type)
{
+ case WD_ADD_NODE_MESSAGE:
+ /* In case we received the ADD node message from
+ * one of our standby nodes, remove that standby from
+ * the list
+ */
+ standby_node_left_cluster(wdNode);
+ standard_packet_processor(wdNode, pkt);
+ break;
+
case WD_STAND_FOR_COORDINATOR_MESSAGE:
reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
break;
case WD_DECLARE_COORDINATOR_MESSAGE:
ereport(NOTICE,
- (errmsg("We are corrdinator and another node tried a coup")));
+ (errmsg("We are coordinator and another node tried a coup")));
reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
break;
case WD_JOIN_COORDINATOR_MESSAGE:
{
- reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
-
/*
- * Also get the configurations from the standby
- * node
+ * If the node is marked as lost because of
+ * life-check, do not let it join the cluster
*/
- send_message_of_type(wdNode, WD_ASK_FOR_POOL_CONFIG, NULL);
- standby_node_join_cluster(wdNode);
+ if (wdNode->state == WD_LOST && wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
+ {
+ ereport(LOG,
+ (errmsg("lost remote node \"%s\" is requesting to join the cluster",wdNode->nodeName),
+ errdetail("rejecting the request until life-check inform us that it is reachable again")));
+ reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
+ }
+ else
+ {
+ reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
+ /* Also get the configurations from the standby node */
+ send_message_of_type(wdNode,WD_ASK_FOR_POOL_CONFIG,NULL);
+ standby_node_join_cluster(wdNode);
+ }
}
break;
* watchdog node when the network becomes reachable, but there is a problem.
*
* Once the cable on the system is unplugged or when the node gets isolated from the
- * cluster there is every likelihood that the backend healthcheck of the isolated node
+ * cluster there is every likelihood that the backend health-check of the isolated node
* start reporting the backend node failure and the pgpool-II proceeds to perform
* the failover for all attached backend nodes. Since the pgpool-II is yet not
* smart enough to figure out it is because of the network failure of its own
return 0;
}
+/*
+ * We could end up in this state if we were connected to the
+ * master node as a standby and were reported lost by the master.
+ * Here we just wait for BEACON_MESSAGE_INTERVAL_SECONDS
+ * and retry to join the cluster.
+ */
+static int
+watchdog_state_machine_nw_isolation(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
+{
+ switch (event)
+ {
+ case WD_EVENT_WD_STATE_CHANGED:
+ set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
+ break;
+
+ case WD_EVENT_PACKET_RCV:
+ standard_packet_processor(wdNode, pkt);
+ break;
+
+ case WD_EVENT_REMOTE_NODE_FOUND:
+ case WD_EVENT_WD_STATE_REQUIRE_RELOAD:
+ case WD_EVENT_I_AM_APPEARING_FOUND:
+ case WD_EVENT_TIMEOUT:
+ /* fall through */
+ case WD_EVENT_NW_IP_IS_ASSIGNED:
+ ereport(LOG,
+ (errmsg("trying again to join the cluster")));
+ set_state(WD_JOINING);
+ break;
+
+ default:
+ break;
+ }
+ return 0;
+}
+
static bool
beacon_message_received_from_node(WatchdogNode * wdNode, WDPacketData * pkt)
{
(errmsg("We are in split brain, and \"%s\" node is the best candidate for master/coordinator"
,otherMasterNode->nodeName),
errdetail("re-initializing the local watchdog cluster state")));
- /* brodcast the message about I am not the true master node */
+ /* broadcast the message about I am not the true master node */
send_cluster_service_message(NULL, pkt, CLUSTER_IAM_NOT_TRUE_MASTER);
set_state(WD_JOINING);
}
while (g_cluster.de_escalation_pid > 0 && wait_secs-- > 0)
{
/*
- * de_escalation proceess was already running and we are esclating
- * again. give some time to de-escalation process to exit normaly
+ * de_escalation process was already running and we are escalating
+ * again. give some time to de-escalation process to exit normally
*/
ereport(LOG,
(errmsg("waiting for de-escalation process to exit before starting escalation")));
while (g_cluster.escalation_pid > 0 && wait_secs-- > 0)
{
/*
- * escalation proceess was already running and we are resigning from
- * it. wait for the escalation process to exit normaly
+ * escalation process was already running and we are resigning from
+ * it. wait for the escalation process to exit normally
*/
ereport(LOG,
(errmsg("waiting for escalation process to exit before starting de-escalation")));
send_cluster_command(WD_MASTER_NODE, WD_JOIN_COORDINATOR_MESSAGE, 5);
/* Also reset my priority as per the original configuration */
g_cluster.localNode->wd_priority = pool_config->wd_priority;
+ set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
break;
case WD_EVENT_TIMEOUT:
- set_timeout(5);
+ set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
break;
case WD_EVENT_WD_STATE_REQUIRE_RELOAD:
break;
case WD_EVENT_COMMAND_FINISHED:
+ {
+ if (clusterCommand->commandPacket.type == WD_JOIN_COORDINATOR_MESSAGE)
{
- if (clusterCommand->commandPacket.type == WD_JOIN_COORDINATOR_MESSAGE)
+ if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
+ clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
{
- if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
- clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
- {
- register_watchdog_state_change_interupt();
+ register_watchdog_state_change_interupt();
+ ereport(LOG,
+ (errmsg("successfully joined the watchdog cluster as standby node"),
+ errdetail("our join coordinator request is accepted by cluster leader node \"%s\"", WD_MASTER_NODE->nodeName)));
+ }
+ else
+ {
+ ereport(NOTICE,
+ (errmsg("our join coordinator is rejected by node \"%s\"", wdNode->nodeName),
+ errhint("rejoining the cluster.")));
+
+ if (WD_MASTER_NODE->has_lost_us)
+ {
ereport(LOG,
- (errmsg("successfully joined the watchdog cluster as standby node"),
- errdetail("our join coordinator request is accepted by cluster leader node \"%s\"", WD_MASTER_NODE->nodeName)));
+ (errmsg("master node \"%s\" thinks we are lost, and \"%s\" is not letting us join",WD_MASTER_NODE->nodeName,wdNode->nodeName),
+ errhint("please verify the watchdog life-check and network is working properly")));
+ set_state(WD_NETWORK_ISOLATION);
}
else
{
- ereport(NOTICE,
- (errmsg("our join coordinator is rejected by node \"%s\"", wdNode->nodeName),
- errhint("rejoining the cluster.")));
set_state(WD_JOINING);
}
}
}
+ }
+ break;
+
+ case WD_EVENT_I_AM_APPEARING_LOST:
+ {
+ /* The remote node has lost us, and if it
+ * was our coordinator we might already be
+ * removed from its standby list,
+ * so re-join the cluster
+ */
+ if (WD_MASTER_NODE == wdNode)
+ {
+ ereport(LOG,
+ (errmsg("we are lost on the master node \"%s\"",wdNode->nodeName)));
+ set_state(WD_JOINING);
+ }
+ }
break;
case WD_EVENT_REMOTE_NODE_LOST:
{
switch (pkt->type)
{
+ case WD_ADD_NODE_MESSAGE:
+ {
+ /* In case we received the ADD node message from
+ * our coordinator, reset the cluster state
+ */
+ if (wdNode == WD_MASTER_NODE)
+ {
+ ereport(LOG,
+ (errmsg("received ADD NODE message from the master node \"%s\"", wdNode->nodeName),
+ errdetail("re-joining the cluster")));
+ set_state(WD_JOINING);
+ }
+ standard_packet_processor(wdNode, pkt);
+ }
+ break;
+
case WD_FAILOVER_END:
{
register_backend_state_sync_req_interupt();
}
else
{
+ ereport(LOG,
+ (errmsg("We are connected to master node \"%s\" and another node \"%s\" is trying to become a master",WD_MASTER_NODE->nodeName, wdNode->nodeName)));
reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
- set_state(WD_JOINING);
+ /* Ask master to re-send its node info */
+ send_message_of_type(WD_MASTER_NODE, WD_REQ_INFO_MESSAGE, NULL);
}
}
break;
{
/*
* we already have a master node and we got a
- * new node trying to be master re-initialize
- * the cluster, something is wrong
+ * new node trying to be master
*/
+ ereport(LOG,
+ (errmsg("We are connected to master node \"%s\" and another node \"%s\" is trying to declare itself as a master",WD_MASTER_NODE->nodeName, wdNode->nodeName)));
reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
- }
- else
- {
- set_state(WD_JOINING);
+ /* Ask master to re-send its node info */
+ send_message_of_type(WD_MASTER_NODE, WD_REQ_INFO_MESSAGE, NULL);
+
}
}
break;
send_cluster_service_message(NULL, pkt, CLUSTER_IN_SPLIT_BRAIN);
}
- else
+ else if (check_debug_request_do_not_reply_beacon() == false)
{
send_message_of_type(wdNode, WD_INFO_MESSAGE, pkt);
beacon_message_received_from_node(wdNode, pkt);
gettimeofday(&currTime, NULL);
int last_rcv_sec = WD_TIME_DIFF_SEC(currTime, WD_MASTER_NODE->last_rcv_time);
- if (last_rcv_sec >= (2 * BEACON_MESSAGE_INTERVAL_SECONDS))
+ if (last_rcv_sec >= (3 * BEACON_MESSAGE_INTERVAL_SECONDS))
{
/* we have missed atleast two beacons from master node */
ereport(WARNING,
WD_MASTER_NODE->nodeName),
errdetail("re-initializing the cluster")));
set_state(WD_JOINING);
-
}
- else if (last_rcv_sec >= BEACON_MESSAGE_INTERVAL_SECONDS)
+ else if (last_rcv_sec >= (2 * BEACON_MESSAGE_INTERVAL_SECONDS))
{
/*
* We have not received a last becacon from master ask for the
* The function identifies the current quorum state
* quorum values:
* -1:
- * quorum is lost or does not exisits
+ * quorum is lost or does not exists
* 0:
* The quorum is on the edge. (when participating cluster is configured
- * with even number of nodes, and we have exectly 50% nodes
+ * with even number of nodes, and we have exactly 50% nodes
* 1:
* quorum exists
*/
{
int quorum_status = g_cluster.quorum_status;
- if (g_cluster.clusterMasterInfo.standby_nodes_count > get_mimimum_remote_nodes_required_for_quorum())
+ if (g_cluster.clusterMasterInfo.standby_nodes_count > get_minimum_remote_nodes_required_for_quorum())
{
g_cluster.quorum_status = 1;
}
- else if (g_cluster.clusterMasterInfo.standby_nodes_count == get_mimimum_remote_nodes_required_for_quorum())
+ else if (g_cluster.clusterMasterInfo.standby_nodes_count == get_minimum_remote_nodes_required_for_quorum())
{
if (g_cluster.remoteNodeCount % 2 != 0)
{
* returns the minimum number of remote nodes required for quorum
*/
static int
-get_mimimum_remote_nodes_required_for_quorum(void)
+get_minimum_remote_nodes_required_for_quorum(void)
{
/*
- * Even numner of remote nodes, That means total number of nodes are odd,
+ * Even number of remote nodes, that means the total number of nodes is odd,
* so minimum quorum is just remote/2.
*/
if (g_cluster.remoteNodeCount % 2 == 0)
get_minimum_votes_to_resolve_consensus(void)
{
/*
- * Since get_mimimum_remote_nodes_required_for_quorum() returns
+ * Since get_minimum_remote_nodes_required_for_quorum() returns
* the number of remote nodes required to complete the quorum
* that is always one less than the total number of nodes required
* for the cluster to build quorum or consensus, reason being
- * in get_mimimum_remote_nodes_required_for_quorum()
+ * in get_minimum_remote_nodes_required_for_quorum()
* we always consider the local node as a valid pre-casted vote.
* But when it comes to count the number of votes required to build
* consensus for any type of decision, for example for building the
* For example
* If Total nodes in cluster = 4
* remote node will be = 3
- * get_mimimum_remote_nodes_required_for_quorum() return = 1
- * Minimum number of votes required for consensu will be
+ * get_minimum_remote_nodes_required_for_quorum() return = 1
+ * Minimum number of votes required for consensus will be
*
* if(pool_config->enable_consensus_with_half_votes = true)
* (exact 50% n/2) ==> 4/2 = 2
*
*/
- int required_node_count = get_mimimum_remote_nodes_required_for_quorum() + 1;
+ int required_node_count = get_minimum_remote_nodes_required_for_quorum() + 1;
/*
* When the total number of nodes in the watchdog cluster including the
* local node are even, The number of votes required for the consensus
* depends on the enable_consensus_with_half_votes.
* So for even number of nodes when enable_consensus_with_half_votes is
- * not allowed than we would nedd one more vote than exact 50%
+ * not allowed then we would need one more vote than exact 50%
*/
if (g_cluster.remoteNodeCount % 2 != 0)
{
/*
* sets the state of local watchdog node, and fires a state change event
- * if the new and old state differes
+ * if the new and old state differs
*/
static int
set_state(WD_STATES newState)
if (json_get_int_value_for_key(rootObj, WD_IPC_SHARED_KEY, (int *) &packet_key))
{
ereport(DEBUG2,
- (errmsg("IPC json data packet does not contain shared key")));
+ (errmsg("IPC JSON data packet does not contain shared key")));
has_shared_key = false;
}
else
if (has_shared_key == false)
{
ereport(LOG,
- (errmsg("invalid json data packet"),
- errdetail("authentication shared key not found in json data")));
+ (errmsg("invalid JSON data packet"),
+ errdetail("authentication shared key not found in JSON data")));
return false;
}
/* compare if shared keys match */
if (*shared_key != packet_key)
return false;
- /* providing a valid shared key for inetenal clients is enough */
+ /* providing a valid shared key for internal clients is enough */
return true;
}
if (has_shared_key == true && *shared_key == packet_key)
return true;
- /* shared key is out of question validate the authKey valurs */
+ /* shared key is out of question validate the authKey values */
packet_auth_key = json_get_string_value_for_key(rootObj, WD_IPC_AUTH_KEY);
if (packet_auth_key == NULL)
{
ereport(DEBUG1,
- (errmsg("invalid json data packet"),
- errdetail("authentication key not found in json data")));
+ (errmsg("invalid JSON data packet"),
+ errdetail("authentication key not found in JSON data")));
return false;
}
{
if (wdNode == NULL)
ereport(LOG,
- (errmsg("unassigning the %s node \"%s\" from watchdog cluster master",
+ (errmsg("removing the %s node \"%s\" from watchdog cluster master",
(g_cluster.localNode == WD_MASTER_NODE) ? "local" : "remote",
WD_MASTER_NODE->nodeName)));
else
g_cluster.clusterMasterInfo.standby_nodes_count = 0;
g_cluster.localNode->standby_nodes_count = 0;
}
+
+/*
+ * Update the missed-beacon-reply bookkeeping of the remote nodes.
+ *
+ * When 'clear' is true every remote node's missed_beacon_count is reset
+ * to zero and ipcCommand is not consulted (so it may be NULL in that case).
+ * Otherwise the outcome of the beacon command in ipcCommand is examined:
+ * a standby node that never replied (COMMAND_STATE_SENT) gets its miss
+ * counter incremented, while a node that replied gets it reset to zero.
+ */
+static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear)
+{
+	int i;
+
+	/*
+	 * The command status is a property of the whole command, not of an
+	 * individual node, so check it once up front instead of testing the
+	 * same invariant on every loop iteration.
+	 */
+	if (!clear && ipcCommand->commandStatus == COMMAND_IN_PROGRESS)
+		return;
+
+	for (i = 0; i < g_cluster.remoteNodeCount; i++)
+	{
+		if (clear)
+		{
+			WatchdogNode* wdNode = &(g_cluster.remoteNodes[i]);
+			wdNode->missed_beacon_count = 0;
+		}
+		else
+		{
+			WDCommandNodeResult* nodeResult = &ipcCommand->nodeResults[i];
+
+			if (nodeResult->cmdState == COMMAND_STATE_SENT)
+			{
+				/* no reply came back; only standby nodes are accounted */
+				if (nodeResult->wdNode->state == WD_STANDBY)
+				{
+					nodeResult->wdNode->missed_beacon_count++;
+					if (nodeResult->wdNode->missed_beacon_count > 1)
+						ereport(LOG,
+								(errmsg("remote node \"%s\" is not replying to our beacons",nodeResult->wdNode->nodeName),
+								 errdetail("missed beacon reply count:%d",nodeResult->wdNode->missed_beacon_count)));
+				}
+				else
+					nodeResult->wdNode->missed_beacon_count = 0;
+			}
+			if (nodeResult->cmdState == COMMAND_STATE_REPLIED)
+			{
+				/* the node is answering again; report recovery and reset */
+				if (nodeResult->wdNode->missed_beacon_count > 0)
+					ereport(LOG,
+							(errmsg("remote node \"%s\" is replying again after missing %d beacons",nodeResult->wdNode->nodeName,
+									nodeResult->wdNode->missed_beacon_count)));
+				nodeResult->wdNode->missed_beacon_count = 0;
+			}
+		}
+	}
+}
+
+#ifdef WATCHDOG_DEBUG
+/*
+ * Watchdog debug request file. Each line of the file consists of one
+ * watchdog debug command. The possible commands are the same as the
+ * defines below. For example, to stop Pgpool-II from sending the reply
+ * to beacon messages from the master node, write DO_NOT_REPLY_TO_BEACON
+ * in the watchdog_debug_requests file:
+ *
+ * echo "DO_NOT_REPLY_TO_BEACON" > pgpool_logdir/watchdog_debug_requests
+ */
+
+/* maps one debug command string to its bit-flag code */
+typedef struct watchdog_debug_commands
+{
+	char command[100];
+	unsigned int code;
+} watchdog_debug_commands;
+
+/* bitmask of the debug commands currently requested via the file */
+unsigned int watchdog_debug_command = 0;
+
+
+#define WATCHDOG_DEBUG_FILE "watchdog_debug_requests"
+
+/* bit-flag codes, one per debug command, OR-ed into watchdog_debug_command */
+#define DO_NOT_REPLY_TO_BEACON 1
+#define DO_NOT_SEND_BEACON 2
+#define KILL_ALL_COMMUNICATION 4
+#define KILL_ALL_RECEIVERS 8
+#define KILL_ALL_SENDERS 16
+
+
+/* command-string to code lookup table; terminated by an empty entry */
+watchdog_debug_commands wd_debug_commands[] = {
+	{"DO_NOT_REPLY_TO_BEACON", DO_NOT_REPLY_TO_BEACON},
+	{"DO_NOT_SEND_BEACON", DO_NOT_SEND_BEACON},
+	{"KILL_ALL_COMMUNICATION", KILL_ALL_COMMUNICATION},
+	{"KILL_ALL_RECEIVERS", KILL_ALL_RECEIVERS},
+	{"KILL_ALL_SENDERS", KILL_ALL_SENDERS},
+	{"", 0}
+};
+
+/* true when KILL_ALL_COMMUNICATION was requested in the debug file */
+static bool
+check_debug_request_kill_all_communication(void)
+{
+	return (watchdog_debug_command & KILL_ALL_COMMUNICATION);
+}
+/* true when KILL_ALL_RECEIVERS was requested in the debug file */
+static bool
+check_debug_request_kill_all_receivers(void)
+{
+	return (watchdog_debug_command & KILL_ALL_RECEIVERS);
+}
+/* true when KILL_ALL_SENDERS was requested in the debug file */
+static bool
+check_debug_request_kill_all_senders(void)
+{
+	return (watchdog_debug_command & KILL_ALL_SENDERS);
+}
+
+/* true when DO_NOT_SEND_BEACON was requested in the debug file */
+static bool
+check_debug_request_do_not_send_beacon(void)
+{
+	return (watchdog_debug_command & DO_NOT_SEND_BEACON);
+}
+
+/* true when DO_NOT_REPLY_TO_BEACON was requested in the debug file */
+static bool
+check_debug_request_do_not_reply_beacon(void)
+{
+	return (watchdog_debug_command & DO_NOT_REPLY_TO_BEACON);
+}
+/*
+ * Check the watchdog debug request options file for debug commands.
+ * Each line should contain only one command; blank lines are ignored,
+ * unrecognized lines produce a WARNING.
+ *
+ * Possible commands
+ * DO_NOT_REPLY_TO_BEACON
+ * DO_NOT_SEND_BEACON
+ * KILL_ALL_COMMUNICATION
+ * KILL_ALL_RECEIVERS
+ * KILL_ALL_SENDERS
+ */
+
+static void
+load_watchdog_debug_test_option(void)
+{
+	static char wd_debug_request_file[POOLMAXPATHLEN];
+	FILE *fd;
+	int cmd;
+#define MAXLINE 128
+	char readbuf[MAXLINE];
+
+	watchdog_debug_command = 0;
+
+	/* build the file path only once; logdir does not change at run time */
+	if (wd_debug_request_file[0] == '\0')
+	{
+		snprintf(wd_debug_request_file, sizeof(wd_debug_request_file),
+				 "%s/%s", pool_config->logdir, WATCHDOG_DEBUG_FILE);
+	}
+
+	fd = fopen(wd_debug_request_file, "r");
+	if (!fd)
+	{
+		/* absence of the file simply means: no debug requests */
+		ereport(DEBUG3,
+				(errmsg("load_watchdog_debug_test_option: failed to open file %s",
+						wd_debug_request_file),
+				 errdetail("\"%s\"", strerror(errno))));
+		return;
+	}
+
+	/* fgets reads at most MAXLINE-1 chars and always NUL-terminates */
+	while (fgets(readbuf, MAXLINE, fd) != NULL)
+	{
+		bool valid_command = false;
+
+		/* skip empty lines instead of warning about them */
+		if (readbuf[0] == '\0' || readbuf[0] == '\n')
+			continue;
+
+		for (cmd = 0;; cmd++)
+		{
+			/* the lookup table is terminated by an empty entry */
+			if (strlen(wd_debug_commands[cmd].command) == 0 || wd_debug_commands[cmd].code == 0)
+				break;
+
+			if (strncasecmp(wd_debug_commands[cmd].command, readbuf, strlen(wd_debug_commands[cmd].command)) == 0)
+			{
+				ereport(DEBUG3,
+						(errmsg("Watchdog DEBUG COMMAND %d: \"%s\" request found",
+								cmd, wd_debug_commands[cmd].command)));
+
+				watchdog_debug_command |= wd_debug_commands[cmd].code;
+				valid_command = true;
+				break;
+			}
+		}
+		if (!valid_command)
+			ereport(WARNING,
+					(errmsg("%s file contains invalid command",
+							wd_debug_request_file),
+					 errdetail("\"%s\" not recognized", readbuf)));
+	}
+
+	fclose(fd);
+}
+#else
+/*
+ * Stub implementations: all these debug command checks return false
+ * when WATCHDOG_DEBUG is not enabled, so the debug hooks compile away
+ * to nothing in production builds.
+ */
+static bool
+check_debug_request_do_not_send_beacon(void)
+{return false;}
+static bool
+check_debug_request_do_not_reply_beacon(void)
+{return false;}
+static bool
+check_debug_request_kill_all_communication(void)
+{return false;}
+static bool
+check_debug_request_kill_all_receivers(void)
+{return false;}
+static bool
+check_debug_request_kill_all_senders(void)
+{return false;}
+#endif