Fix issues with FAILOVER
authorSteve Singer <ssinger@ca.afilias.info>
Mon, 10 Apr 2017 13:38:15 +0000 (09:38 -0400)
committerSteve Singer <ssinger@ca.afilias.info>
Mon, 10 Apr 2017 13:38:15 +0000 (09:38 -0400)
In particular, when doing a multi-node failover with a cascaded
node, it is possible for the result after the failover to have
nodes with a self subscription: a row in sl_subscribe with the
provider and receiver equal.

In fixing that issue we also discovered that slon was not
reloading the set origins into memory following a FAILOVER
command.  This could mean that we were processing SYNC events
from a node without realizing that the node is now the origin
for a set.

src/backend/slony1_funcs.sql
src/slon/remote_worker.c
src/slon/runtime_config.c
src/slon/slon.h

index a47e89d2cb98bcb10ed59425396e067926351685..46d9017dfa5b8e2361b904a844af8c9a7fd4675b 100644 (file)
@@ -793,7 +793,7 @@ $$ language plpgsql
 
 comment on function @NAMESPACE@.storeNode(p_no_id int4, p_no_comment text) is
 'no_id - Node ID #
-no_comment - Human-oriented comment
+no_comment - Human-oriented comment
 
 Generate the STORE_NODE event for node no_id';
 
@@ -1526,13 +1526,24 @@ begin
        -- provider for all subscriptions served
        -- by the failed node. (otherwise it
        -- wouldn't be a allowable backup node).
+--     delete from @NAMESPACE@.sl_subscribe
+--                where sub_receiver=p_backup_node;
+                  
        update @NAMESPACE@.sl_subscribe        
               set sub_provider=p_backup_node
               from @NAMESPACE@.sl_node
               where sub_provider=p_failed_node
               and sl_node.no_id=sub_receiver
-              and sl_node.no_failed=false;     
-
+              and sl_node.no_failed=false
+                  and sub_receiver<>p_backup_node;
+                  
+       update @NAMESPACE@.sl_subscribe        
+              set sub_provider=(select set_origin from
+                          @NAMESPACE@.sl_set where set_id=
+                          sub_set)
+                       where sub_provider=p_failed_node
+                       and sub_receiver=p_backup_node;
+                  
        update @NAMESPACE@.sl_node
                   set no_active=false WHERE 
                   no_id=p_failed_node;
index cde16db319991c131aa1879b090aba04964d78a9..f1c360bea34680169a1c30de2fa0c1cb8271c96c 100644 (file)
@@ -301,6 +301,8 @@ remoteWorkerThread_main(void *cdata)
        char            seqbuf[64];
        bool            event_ok;
        bool            need_reloadListen = false;
+       bool            need_reloadSets = false;
+       
        char            conn_symname[32];
 
        SlonSyncStatus sync_status = SYNC_INITIAL;
@@ -1276,8 +1278,14 @@ remoteWorkerThread_main(void *cdata)
                                                                 rtcfg_namespace,
                                                                 rtcfg_namespace,
                                                                 failed_node, node->no_id, seq_no_c);
-
+                               slon_log(SLON_INFO, "remoteWorkerThread_%d FAILOVER_NODE finished %d\n"
+                                                        ,node->no_id,
+                                                        failed_node);
+                               /**
+                                * The list of set origins has now changed.
+                                */
                                need_reloadListen = true;
+                               need_reloadSets = true;
                        }
                        else if (strcmp(event->ev_type, "SUBSCRIBE_SET") == 0)
                        {
@@ -1516,6 +1524,11 @@ remoteWorkerThread_main(void *cdata)
                                rtcfg_reloadListen(local_dbconn);
                                need_reloadListen = false;
                        }
+                       if(need_reloadSets)     /* requested by FAILOVER_NODE: set origins changed */
+                       {
+                               rtcfg_reloadSets(local_dbconn);
+                               need_reloadSets = false;        /* request served; clear it like need_reloadListen */
+                       }
                }
 
 #ifdef SLON_MEMDEBUG
index 566d5acb41a9bf1801d92b2932e98bc9abb76ef6..8877f9d90aab4a95bbdc27ec77a9d47e6f0c4ff3 100644 (file)
@@ -772,6 +772,65 @@ rtcfg_dropSet(int set_id)
        rtcfg_unlock();
 }
 
+/* ------
+ * rtcfg_reloadSets
+ *
+ * Re-reads sl_set through db and copies each row's set_origin into the
+ * entry of the in-memory set list that has the same set_id.  Sets that
+ * are not already in the list are not added.  The rtcfg lock is held for
+ * the whole operation.  If the query fails, the error is logged as FATAL
+ * and slon_retry() is called.
+ * ------
+ */
+void rtcfg_reloadSets(PGconn * db)
+{
+       SlonDString query;
+       PGresult   *res;
+       int                     i,
+                               n;
+       SlonSet    *set;
+
+       rtcfg_lock();
+
+       /*
+        * Read configuration table sl_set.
+        * NOTE(review): query was previously passed to slon_mkquery() without
+        * dstring_init(); assumed to be required as for other dstrings - confirm.
+        */
+       dstring_init(&query);
+       slon_mkquery(&query,
+                                "select set_id, set_origin, set_comment "
+                                "from %s.sl_set",
+                                rtcfg_namespace);
+       res = PQexec(db, dstring_data(&query));
+
+       /* The query text is no longer needed; free it on every path. */
+       dstring_free(&query);
+
+       if (PQresultStatus(res) != PGRES_TUPLES_OK)
+       {
+               slon_log(SLON_FATAL, "main: Cannot get set config - %s\n",
+                                PQresultErrorMessage(res));
+               PQclear(res);
+               slon_retry();
+       }
+       for (i = 0, n = PQntuples(res); i < n; i++)
+       {
+               int                     set_id = (int) strtol(PQgetvalue(res, i, 0), NULL, 10);
+               int                     set_origin = (int) strtol(PQgetvalue(res, i, 1), NULL, 10);
+
+               for (set = rtcfg_set_list_head; set; set = set->next)
+               {
+                       if (set->set_id == set_id)
+                       {
+                               set->set_origin = set_origin;
+                       }
+               }/*for set in array*/
+       }/*for tuple*/
+       PQclear(res);
+       rtcfg_unlock();
+}
+
 /* ----------
  * rtcfg_moveSet
  * ----------
index c0adf6eae89e4c56b1c4ee7caf9947179b66394b..cdc68e4be882a6afe78cf4d8c074bfdaf570112f 100644 (file)
@@ -478,6 +478,7 @@ extern void rtcfg_storeSet(int set_id, int set_origin, char *set_comment);
 extern void rtcfg_dropSet(int set_id);
 extern void rtcfg_moveSet(int set_id, int old_origin, int new_origin,
                          int sub_provider);
+extern void rtcfg_reloadSets(PGconn *db);
 
 extern void rtcfg_storeSubscribe(int sub_set, int sub_provider,
                                         char *sub_forward);