From bfc35eaf7cb80d392c9b930a04ed64f7ba1af3cc Mon Sep 17 00:00:00 2001
From: Philip O'Toole
Date: Tue, 15 Nov 2022 09:41:21 -0500
Subject: [PATCH] Simplify reap command line flags (#1118)

Simplify reap command line flags
---
 CHANGELOG.md                |   2 +-
 DOC/CLUSTER_MGMT.md         |  10 +--
 cluster/client.go           |   2 +-
 cmd/rqlited/flags.go        |  21 +++++--
 cmd/rqlited/main.go         |   1 -
 queue/queue.go              |   2 +-
 store/store.go              |  44 +++++++-------
 system_test/cluster_test.go | 118 +++++++++++++++++++++++++++++++++++-
 8 files changed, 162 insertions(+), 38 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9b87e4e8..9bb73657 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 ## 7.11.0 (unreleased)
 ### New features
-- [PR #1114](https://github.com/rqlite/rqlite/pull/1114): Support automatically removing non-reachable nodes after a configurable period. Fixes [issue #728](https://github.com/rqlite/rqlite/issues/728).
+- [PR #1114](https://github.com/rqlite/rqlite/pull/1114), [PR #1118](https://github.com/rqlite/rqlite/pull/1118): Support automatically removing non-reachable nodes after a configurable period. Fixes [issue #728](https://github.com/rqlite/rqlite/issues/728).
 - [PR #1116](https://github.com/rqlite/rqlite/pull/1116), [PR #1117](https://github.com/rqlite/rqlite/pull/1117): Support associative form for query responses. Fixes [issue #1115](https://github.com/rqlite/rqlite/issues/1115).
 
 ## 7.10.1 (November 11th 2022)
diff --git a/DOC/CLUSTER_MGMT.md b/DOC/CLUSTER_MGMT.md
index 81274a86..c4d901b9 100644
--- a/DOC/CLUSTER_MGMT.md
+++ b/DOC/CLUSTER_MGMT.md
@@ -89,14 +89,16 @@ If you cannot bring sufficient nodes back online such that the cluster can elect
 ## Automatically removing failed nodes
 > :warning: **This functionality was introduced in version 7.11.0. It does not exist in earlier releases.**
 
-rqlite supports automatically removing both voting and non-voting (read-only) nodes that have been non-reachable for a configurable period of time. A non-reachable node is defined as a node that the Leader cannot heartbeat with. To enable reaping set `-raft-reap-nodes` when launching `rqlited`. The reaping timeout for each type of node can be set independently, but defaults to 72 hours. It is recommended that this is set to at least double the maximum expected recoverable outage time for a node or network partition for nodes. Note that the timeout clock is reset if a cluster elects a new Leader.
+rqlite supports automatically removing both voting (the default type) and non-voting (read-only) nodes that have been non-reachable for a configurable period of time. A non-reachable node is defined as a node that the Leader cannot heartbeat with. To enable reaping of voting nodes set `-raft-reap-node-timeout` to a non-zero time interval. Likewise, to enable reaping of non-voting (read-only) nodes set `-raft-reap-read-only-node-timeout`.
+
+It is recommended that these values be set conservatively, especially for voting nodes. Setting them too low may mean you don't account for the normal kinds of network outages and temporary failures that can affect distributed systems such as rqlite. Note that the timeout clock is reset if a cluster elects a new Leader.
 
 ### Example configuration
-Enable reaping, instructing rqlite to reap non-reachable voting nodes after 2 days, and non-reachable read-only nodes after 4 hours.
+Instruct rqlite to reap non-reachable voting nodes after 2 days, and non-reachable read-only nodes after 30 minutes:
 ```bash
-rqlited -node-id 1 -raft-reap-nodes -raft-reap-node-timeout=48h -raft-reap-read-only-node-timeout=4h data
+rqlited -node-id 1 -raft-reap-node-timeout=48h -raft-reap-read-only-node-timeout=30m data
 ```
-For reaping to work properly you **must** set these flags on **every** voting node in the cluster -- in otherwords, every node that could potentially become the Leader. To effectively disable reaping for one type of node, but not the other, simply set the relevant timeout to a very long time.
+For reaping to work consistently you **must** set these flags on **every** voting node in the cluster -- in other words, every node that could potentially become the Leader. You can also set the flags on read-only nodes, but they will simply be silently ignored.
 
 # Dealing with failure
 It is the nature of clustered systems that nodes can fail at anytime. Depending on the size of your cluster, it will tolerate various amounts of failure. With a 3-node cluster, it can tolerate the failure of a single node, including the leader.
diff --git a/cluster/client.go b/cluster/client.go
index b85f8bb0..3c9252ae 100644
--- a/cluster/client.go
+++ b/cluster/client.go
@@ -271,7 +271,7 @@ func (c *Client) Stats() (map[string]interface{}, error) {
 	defer c.mu.RUnlock()
 
 	stats := map[string]interface{}{
-		"timeout":         c.timeout,
+		"timeout":         c.timeout.String(),
 		"local_node_addr": c.localNodeAddr,
 	}
 
diff --git a/cmd/rqlited/flags.go b/cmd/rqlited/flags.go
index b7579190..0d486e78 100644
--- a/cmd/rqlited/flags.go
+++ b/cmd/rqlited/flags.go
@@ -168,9 +168,6 @@ type Config struct {
 	// a full database re-sync during recovery.
 	RaftNoFreelistSync bool
 
-	// RaftReapNodes enables reaping of non-reachable nodes.
-	RaftReapNodes bool
-
 	// RaftReapNodeTimeout sets the duration after which a non-reachable voting node is
 	// reaped i.e. removed from the cluster.
 	RaftReapNodeTimeout time.Duration
@@ -399,9 +396,8 @@ func ParseFlags(name, desc string, build *BuildInfo) (*Config, error) {
 	flag.BoolVar(&config.RaftShutdownOnRemove, "raft-remove-shutdown", false, "Shutdown Raft if node removed")
 	flag.BoolVar(&config.RaftNoFreelistSync, "raft-no-freelist-sync", false, "Do not sync Raft log database freelist to disk")
 	flag.StringVar(&config.RaftLogLevel, "raft-log-level", "INFO", "Minimum log level for Raft module")
-	flag.BoolVar(&config.RaftReapNodes, "raft-reap-nodes", false, "Enable reaping of non-reachable nodes")
-	flag.DurationVar(&config.RaftReapNodeTimeout, "raft-reap-node-timeout", 72*time.Hour, "Time after which a nonreachable voting node will be reaped")
-	flag.DurationVar(&config.RaftReapReadOnlyNodeTimeout, "raft-reap-read-only-node-timeout", 72*time.Hour, "Time after which a non-reachable non-voting node will be reaped")
+	flag.DurationVar(&config.RaftReapNodeTimeout, "raft-reap-node-timeout", 0*time.Hour, "Time after which a non-reachable voting node will be reaped. If not set, no reaping takes place")
+	flag.DurationVar(&config.RaftReapReadOnlyNodeTimeout, "raft-reap-read-only-node-timeout", 0*time.Hour, "Time after which a non-reachable non-voting node will be reaped. If not set, no reaping takes place")
 	flag.DurationVar(&config.ClusterConnectTimeout, "cluster-connect-timeout", 30*time.Second, "Timeout for initial connection to other nodes")
 	flag.IntVar(&config.WriteQueueCap, "write-queue-capacity", 1024, "Write queue capacity")
 	flag.IntVar(&config.WriteQueueBatchSz, "write-queue-batch-size", 128, "Write queue batch size")
@@ -425,6 +421,19 @@ func ParseFlags(name, desc string, build *BuildInfo) (*Config, error) {
 		errorExit(0, msg)
 	}
 
+	// Ensure, if set explicitly, that reap times are not too low.
+	flag.Visit(func(f *flag.Flag) {
+		if f.Name == "raft-reap-node-timeout" || f.Name == "raft-reap-read-only-node-timeout" {
+			d, err := time.ParseDuration(f.Value.String())
+			if err != nil {
+				errorExit(1, fmt.Sprintf("failed to parse duration: %s", err.Error()))
+			}
+			if d <= 0 {
+				errorExit(1, fmt.Sprintf("-%s must be greater than 0", f.Name))
+			}
+		}
+	})
+
 	// Ensure the data path is set.
 	if flag.NArg() < 1 {
 		errorExit(1, "no data directory set")
diff --git a/cmd/rqlited/main.go b/cmd/rqlited/main.go
index 444b92e3..13155435 100644
--- a/cmd/rqlited/main.go
+++ b/cmd/rqlited/main.go
@@ -197,7 +197,6 @@ func createStore(cfg *Config, ln *tcp.Layer) (*store.Store, error) {
 	str.ElectionTimeout = cfg.RaftElectionTimeout
 	str.ApplyTimeout = cfg.RaftApplyTimeout
 	str.BootstrapExpect = cfg.BootstrapExpect
-	str.ReapNodes = cfg.RaftReapNodes
 	str.ReapTimeout = cfg.RaftReapNodeTimeout
 	str.ReapReadOnlyTimeout = cfg.RaftReapReadOnlyNodeTimeout
 
diff --git a/queue/queue.go b/queue/queue.go
index b5d96f9c..b1df5238 100644
--- a/queue/queue.go
+++ b/queue/queue.go
@@ -140,7 +140,7 @@ func (q *Queue) Stats() (map[string]interface{}, error) {
 	return map[string]interface{}{
 		"max_size":   q.maxSize,
 		"batch_size": q.batchSize,
-		"timeout":    q.timeout,
+		"timeout":    q.timeout.String(),
 	}, nil
 }
 
diff --git a/store/store.go b/store/store.go
index 1ba11825..6799c45b 100644
--- a/store/store.go
+++ b/store/store.go
@@ -60,7 +60,6 @@ const (
 	retainSnapshotCount = 2
 	applyTimeout        = 10 * time.Second
 	openTimeout         = 120 * time.Second
-	reapTimeout         = 72 * time.Hour
 	sqliteFile          = "db.sqlite"
 	leaderWaitDelay     = 100 * time.Millisecond
 	appliedWaitDelay    = 100 * time.Millisecond
@@ -88,6 +87,7 @@ const (
 	snapshotDBOnDiskSize    = "snapshot_db_ondisk_size"
 	leaderChangesObserved   = "leader_changes_observed"
 	leaderChangesDropped    = "leader_changes_dropped"
+	failedHeartbeatObserved = "failed_heartbeat_observed"
 	nodesReapedOK           = "nodes_reaped_ok"
 	nodesReapedFailed       = "nodes_reaped_failed"
 )
@@ -112,6 +112,7 @@ func init() {
 	stats.Add(snapshotDBOnDiskSize, 0)
 	stats.Add(leaderChangesObserved, 0)
 	stats.Add(leaderChangesDropped, 0)
+	stats.Add(failedHeartbeatObserved, 0)
 	stats.Add(nodesReapedOK, 0)
 	stats.Add(nodesReapedFailed, 0)
 }
@@ -202,7 +203,6 @@ type Store struct {
 	NoFreeListSync bool
 
 	// Node-reaping configuration
-	ReapNodes           bool
 	ReapTimeout         time.Duration
 	ReapReadOnlyTimeout time.Duration
 
@@ -244,20 +244,18 @@ func New(ln Listener, c *Config) *Store {
 	}
 
 	return &Store{
-		ln:                  ln,
-		raftDir:             c.Dir,
-		peersPath:           filepath.Join(c.Dir, peersPath),
-		peersInfoPath:       filepath.Join(c.Dir, peersInfoPath),
-		raftID:              c.ID,
-		dbConf:              c.DBConf,
-		dbPath:              dbPath,
-		leaderObservers:     make([]chan<- struct{}, 0),
-		reqMarshaller:       command.NewRequestMarshaler(),
-		logger:              logger,
-		notifyingNodes:      make(map[string]*Server),
-		ApplyTimeout:        applyTimeout,
-		ReapTimeout:         reapTimeout,
-		ReapReadOnlyTimeout: reapTimeout,
+		ln:              ln,
+		raftDir:         c.Dir,
+		peersPath:       filepath.Join(c.Dir, peersPath),
+		peersInfoPath:   filepath.Join(c.Dir, peersInfoPath),
+		raftID:          c.ID,
+		dbConf:          c.DBConf,
+		dbPath:          dbPath,
+		leaderObservers: make([]chan<- struct{}, 0),
+		reqMarshaller:   command.NewRequestMarshaler(),
+		logger:          logger,
+		notifyingNodes:  make(map[string]*Server),
+		ApplyTimeout:    applyTimeout,
 	}
 }
 
@@ -387,7 +385,7 @@ func (s *Store) Open() (retErr error) {
 	s.observer = raft.NewObserver(s.observerChan, false, func(o *raft.Observation) bool {
 		_, isLeaderChange := o.Data.(raft.LeaderObservation)
 		_, isFailedHeartBeat := o.Data.(raft.FailedHeartbeatObservation)
-		return isLeaderChange || (isFailedHeartBeat && s.ReapNodes)
+		return isLeaderChange || isFailedHeartBeat
 	})
 
 	// Register and listen for leader changes.
@@ -718,10 +716,9 @@ func (s *Store) Stats() (map[string]interface{}, error) {
 		"heartbeat_timeout":      s.HeartbeatTimeout.String(),
 		"election_timeout":       s.ElectionTimeout.String(),
 		"snapshot_threshold":     s.SnapshotThreshold,
-		"snapshot_interval":      s.SnapshotInterval,
-		"reap_nodes":             s.ReapNodes,
-		"reap_timeout":           s.ReapTimeout,
-		"reap_read_only_timeout": s.ReapReadOnlyTimeout,
+		"snapshot_interval":      s.SnapshotInterval.String(),
+		"reap_timeout":           s.ReapTimeout.String(),
+		"reap_read_only_timeout": s.ReapReadOnlyTimeout.String(),
 		"no_freelist_sync":       s.NoFreeListSync,
 		"trailing_logs":          s.numTrailingLogs,
 		"request_marshaler":      s.reqMarshaller.Stats(),
@@ -1351,6 +1348,8 @@ func (s *Store) observe() (closeCh, doneCh chan struct{}) {
 			case o := <-s.observerChan:
 				switch signal := o.Data.(type) {
 				case raft.FailedHeartbeatObservation:
+					stats.Add(failedHeartbeatObserved, 1)
+
 					nodes, err := s.Nodes()
 					if err != nil {
 						s.logger.Printf("failed to get nodes configuration during reap check: %s", err.Error())
@@ -1365,7 +1364,8 @@ func (s *Store) observe() (closeCh, doneCh chan struct{}) {
 							break
 						}
 
-						if (isReadOnly && dur > s.ReapReadOnlyTimeout) || (!isReadOnly && dur > s.ReapTimeout) {
+						if (isReadOnly && s.ReapReadOnlyTimeout > 0 && dur > s.ReapReadOnlyTimeout) ||
+							(!isReadOnly && s.ReapTimeout > 0 && dur > s.ReapTimeout) {
 							pn := "voting node"
 							if isReadOnly {
 								pn = "non-voting node"
diff --git a/system_test/cluster_test.go b/system_test/cluster_test.go
index 554a2e77..e47033ef 100644
--- a/system_test/cluster_test.go
+++ b/system_test/cluster_test.go
@@ -1507,7 +1507,6 @@ func Test_MultiNodeClusterRecoverFull(t *testing.T) {
 // Test_MultiNodeClusterReapNodes tests that unreachable nodes are reaped.
 func Test_MultiNodeClusterReapNodes(t *testing.T) {
 	cfgStoreFn := func(n *Node) {
-		n.Store.ReapNodes = true
 		n.Store.ReapTimeout = time.Second
 		n.Store.ReapReadOnlyTimeout = time.Second
 	}
@@ -1612,7 +1611,6 @@ func Test_MultiNodeClusterReapNodes(t *testing.T) {
 // its time.
 func Test_MultiNodeClusterNoReap(t *testing.T) {
 	cfgStoreFn := func(n *Node) {
-		n.Store.ReapNodes = true
 		n.Store.ReapReadOnlyTimeout = 120 * time.Second
 	}
 
@@ -1654,3 +1652,119 @@ func Test_MultiNodeClusterNoReap(t *testing.T) {
 		t.Fatalf("didn't time out waiting for node to be removed")
 	}
 }
+
+// Test_MultiNodeClusterNoReapZero tests that unreachable voting nodes are not reaped when the reap timeout is set to zero.
+func Test_MultiNodeClusterNoReapZero(t *testing.T) {
+	cfgStoreFn := func(n *Node) {
+		n.Store.ReapTimeout = 0
+	}
+
+	node1 := mustNewLeaderNode()
+	defer node1.Deprovision()
+	cfgStoreFn(node1)
+	_, err := node1.WaitForLeader()
+	if err != nil {
+		t.Fatalf("failed waiting for leader: %s", err.Error())
+	}
+
+	node2 := mustNewNode(false)
+	defer node2.Deprovision()
+	cfgStoreFn(node2)
+	if err := node2.Join(node1); err != nil {
+		t.Fatalf("node failed to join leader: %s", err.Error())
+	}
+	_, err = node2.WaitForLeader()
+	if err != nil {
+		t.Fatalf("failed waiting for leader: %s", err.Error())
+	}
+
+	// Get the new leader, in case it changed.
+	c := Cluster{node1, node2}
+	leader, err := c.Leader()
+	if err != nil {
+		t.Fatalf("failed to find cluster leader: %s", err.Error())
+	}
+
+	node3 := mustNewNode(false)
+	defer node3.Deprovision()
+	cfgStoreFn(node3)
+	if err := node3.Join(leader); err != nil {
+		t.Fatalf("node failed to join leader: %s", err.Error())
+	}
+	_, err = node3.WaitForLeader()
+	if err != nil {
+		t.Fatalf("failed waiting for leader: %s", err.Error())
+	}
+
+	// Get the new leader, in case it changed.
+	c = Cluster{node1, node2, node3}
+	leader, err = c.Leader()
+	if err != nil {
+		t.Fatalf("failed to find cluster leader: %s", err.Error())
+	}
+
+	// Confirm voting node is in the cluster config.
+	nodes, err := leader.Nodes(true)
+	if err != nil {
+		t.Fatalf("failed to get nodes: %s", err.Error())
+	}
+	if !nodes.HasAddr(node3.RaftAddr) {
+		t.Fatalf("nodes do not contain voting node")
+	}
+
+	// Kill voting node, confirm it's not reaped.
+	node3.Deprovision()
+	tFn := func() bool {
+		nodes, _ = leader.Nodes(true)
+		return !nodes.HasAddr(node3.RaftAddr)
+	}
+
+	if trueOrTimeout(tFn, 10*time.Second) {
+		t.Fatalf("didn't time out waiting for node to be removed")
+	}
+}
+
+// Test_MultiNodeClusterNoReapReadOnlyZero tests that a node is not incorrectly reaped.
+func Test_MultiNodeClusterNoReapReadOnlyZero(t *testing.T) {
+	cfgStoreFn := func(n *Node) {
+		n.Store.ReapReadOnlyTimeout = 0
+	}
+
+	node1 := mustNewLeaderNode()
+	defer node1.Deprovision()
+	cfgStoreFn(node1)
+	_, err := node1.WaitForLeader()
+	if err != nil {
+		t.Fatalf("failed waiting for leader: %s", err.Error())
+	}
+
+	nonVoter := mustNewNode(false)
+	defer nonVoter.Deprovision()
+	cfgStoreFn(nonVoter)
+	if err := nonVoter.JoinAsNonVoter(node1); err != nil {
+		t.Fatalf("non-voting node failed to join leader: %s", err.Error())
+	}
+	_, err = nonVoter.WaitForLeader()
+	if err != nil {
+		t.Fatalf("failed waiting for leader: %s", err.Error())
+	}
+
+	// Confirm non-voter node is in the cluster config.
+	nodes, err := node1.Nodes(true)
+	if err != nil {
+		t.Fatalf("failed to get nodes: %s", err.Error())
+	}
+	if !nodes.HasAddr(nonVoter.RaftAddr) {
+		t.Fatalf("nodes do not contain non-voter node")
+	}
+
+	// Kill non-voter node, confirm it's not removed.
+	nonVoter.Deprovision()
+	tFn := func() bool {
+		nodes, _ = node1.Nodes(true)
+		return !nodes.HasAddr(nonVoter.RaftAddr)
+	}
+	if trueOrTimeout(tFn, 10*time.Second) {
+		t.Fatalf("didn't time out waiting for node to be removed")
+	}
+}
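
A minimal sketch of the reaping rule the store.go change above implies, now that the `-raft-reap-nodes` boolean is gone: a node that fails heartbeats is reaped only when the timeout for its node type is non-zero and the time since last contact exceeds that timeout. The `shouldReap` helper and its argument names below are invented for illustration; this is not rqlite code.

```go
package main

import (
	"fmt"
	"time"
)

// shouldReap mirrors the condition in the patched observe() loop: a zero
// timeout (the new default) effectively disables reaping for that node type.
func shouldReap(isReadOnly bool, sinceContact, reapTimeout, reapReadOnlyTimeout time.Duration) bool {
	if isReadOnly {
		return reapReadOnlyTimeout > 0 && sinceContact > reapReadOnlyTimeout
	}
	return reapTimeout > 0 && sinceContact > reapTimeout
}

func main() {
	// Voting-node reaping disabled (timeout 0): never reaped, however long it is unreachable.
	fmt.Println(shouldReap(false, 100*time.Hour, 0, 30*time.Minute)) // false

	// Read-only node unreachable for longer than its 30m timeout: reaped.
	fmt.Println(shouldReap(true, time.Hour, 0, 30*time.Minute)) // true
}
```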