diff --git a/CHANGELOG.md b/CHANGELOG.md index 46a4256c..300df8e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ### Implementation changes and bug fixes - [PR #1097](https://github.com/rqlite/rqlite/pull/1097): Start HTTP server as soon as possible after launch. +- [PR #1098](https://github.com/rqlite/rqlite/pull/1098): Bootstrapper doesn't need to know bootstrap-expect. ## 7.10.0 (October 26th 2022) ### New features diff --git a/cluster/bootstrap.go b/cluster/bootstrap.go index 2b6c9bd3..79a4e094 100644 --- a/cluster/bootstrap.go +++ b/cluster/bootstrap.go @@ -35,7 +35,6 @@ type AddressProvider interface { // Bootstrapper performs a bootstrap of this node. type Bootstrapper struct { provider AddressProvider - expect int tlsConfig *tls.Config joiner *Joiner @@ -48,10 +47,9 @@ type Bootstrapper struct { } // NewBootstrapper returns an instance of a Bootstrapper. -func NewBootstrapper(p AddressProvider, expect int, tlsConfig *tls.Config) *Bootstrapper { +func NewBootstrapper(p AddressProvider, tlsConfig *tls.Config) *Bootstrapper { bs := &Bootstrapper{ provider: p, - expect: expect, tlsConfig: &tls.Config{InsecureSkipVerify: true}, joiner: NewJoiner("", 1, 0, tlsConfig), logger: log.New(os.Stderr, "[cluster-bootstrap] ", log.LstdFlags), @@ -87,7 +85,6 @@ func (b *Bootstrapper) Boot(id, raftAddr string, done func() bool, timeout time. tickerT := time.NewTimer(jitter(time.Millisecond)) defer tickerT.Stop() - notifySuccess := false for { select { case <-timeoutT.C: @@ -104,27 +101,31 @@ func (b *Bootstrapper) Boot(id, raftAddr string, done func() bool, timeout time. if err != nil { b.logger.Printf("provider lookup failed %s", err.Error()) } - if len(targets) < b.expect { + + if len(targets) == 0 { continue } - // Try an explicit join. + // Try an explicit join first. Joining an existing cluster is always given priority + // over trying to form a new cluster. b.joiner.SetBasicAuth(b.username, b.password) if j, err := b.joiner.Do(targets, id, raftAddr, true); err == nil { b.logger.Printf("succeeded directly joining cluster via node at %s", j) return nil } - // Join didn't work, so perhaps perform a notify if we haven't done - // one yet. - if !notifySuccess { - if err := b.notify(targets, id, raftAddr); err != nil { - b.logger.Printf("failed to notify all targets: %s (%s, will retry)", targets, - err.Error()) - } else { - b.logger.Printf("succeeded notifying all targets: %s", targets) - notifySuccess = true - } + // This is where we have to be careful. This node failed to join with any node + // in the targets list. This could be because none of the nodes are contactable, + // or none of the nodes are in a functioning cluster with a leader. That means that + // this node could be part of a set nodes that are bootstrapping to form a cluster + // de novo. For that to happen it needs to now let the otehr nodes know it is here. + // If this is a new cluster, some node will then reach the bootstrap-expect value, + // form the cluster, beating all other nodes to it. + if err := b.notify(targets, id, raftAddr); err != nil { + b.logger.Printf("failed to notify all targets: %s (%s, will retry)", targets, + err.Error()) + } else { + b.logger.Printf("succeeded notifying all targets: %s", targets) } } } diff --git a/cluster/bootstrap_test.go b/cluster/bootstrap_test.go index 36fb31f9..632ea4b4 100644 --- a/cluster/bootstrap_test.go +++ b/cluster/bootstrap_test.go @@ -24,7 +24,7 @@ func Test_AddressProviderString(t *testing.T) { } func Test_NewBootstrapper(t *testing.T) { - bs := NewBootstrapper(nil, 1, nil) + bs := NewBootstrapper(nil, nil) if bs == nil { t.Fatalf("failed to create a simple Bootstrapper") } @@ -39,7 +39,7 @@ func Test_BootstrapperBootDoneImmediately(t *testing.T) { return true } p := NewAddressProviderString([]string{ts.URL}) - bs := NewBootstrapper(p, 1, nil) + bs := NewBootstrapper(p, nil) if err := bs.Boot("node1", "192.168.1.1:1234", done, 10*time.Second); err != nil { t.Fatalf("failed to boot: %s", err) } @@ -54,7 +54,7 @@ func Test_BootstrapperBootTimeout(t *testing.T) { return false } p := NewAddressProviderString([]string{ts.URL}) - bs := NewBootstrapper(p, 1, nil) + bs := NewBootstrapper(p, nil) bs.Interval = time.Second err := bs.Boot("node1", "192.168.1.1:1234", done, 5*time.Second) if err == nil { @@ -97,7 +97,7 @@ func Test_BootstrapperBootSingleNotify(t *testing.T) { } p := NewAddressProviderString([]string{ts.URL}) - bs := NewBootstrapper(p, 1, nil) + bs := NewBootstrapper(p, nil) bs.Interval = time.Second err := bs.Boot("node1", "192.168.1.1:1234", done, 60*time.Second) @@ -145,7 +145,7 @@ func Test_BootstrapperBootSingleNotifyAuth(t *testing.T) { } p := NewAddressProviderString([]string{ts.URL}) - bs := NewBootstrapper(p, 1, nil) + bs := NewBootstrapper(p, nil) bs.SetBasicAuth("username1", "password1") bs.Interval = time.Second @@ -192,7 +192,7 @@ func Test_BootstrapperBootMultiNotify(t *testing.T) { } p := NewAddressProviderString([]string{ts1.URL, ts2.URL}) - bs := NewBootstrapper(p, 2, nil) + bs := NewBootstrapper(p, nil) bs.Interval = time.Second err := bs.Boot("node1", "192.168.1.1:1234", done, 60*time.Second) diff --git a/cmd/rqlited/main.go b/cmd/rqlited/main.go index fbe89b6f..eb99fa18 100644 --- a/cmd/rqlited/main.go +++ b/cmd/rqlited/main.go @@ -373,8 +373,7 @@ func createCluster(cfg *Config, tlsConfig *tls.Config, hasPeers bool, str *store return nil } - bs := cluster.NewBootstrapper(cluster.NewAddressProviderString(joins), - cfg.BootstrapExpect, tlsConfig) + bs := cluster.NewBootstrapper(cluster.NewAddressProviderString(joins), tlsConfig) if cfg.JoinAs != "" { pw, ok := credStr.Password(cfg.JoinAs) if !ok { @@ -424,7 +423,7 @@ func createCluster(cfg *Config, tlsConfig *tls.Config, hasPeers bool, str *store provider = dnssrv.New(dnssrvCfg) } - bs := cluster.NewBootstrapper(provider, cfg.BootstrapExpect, tlsConfig) + bs := cluster.NewBootstrapper(provider, tlsConfig) if cfg.JoinAs != "" { pw, ok := credStr.Password(cfg.JoinAs) if !ok { diff --git a/system_test/cluster_test.go b/system_test/cluster_test.go index c04a6231..5e0e6070 100644 --- a/system_test/cluster_test.go +++ b/system_test/cluster_test.go @@ -238,9 +238,9 @@ func Test_MultiNodeClusterBootstrap(t *testing.T) { provider := cluster.NewAddressProviderString( []string{node1.APIAddr, node2.APIAddr, node3.APIAddr}) - node1Bs := cluster.NewBootstrapper(provider, 3, nil) - node2Bs := cluster.NewBootstrapper(provider, 3, nil) - node3Bs := cluster.NewBootstrapper(provider, 3, nil) + node1Bs := cluster.NewBootstrapper(provider, nil) + node2Bs := cluster.NewBootstrapper(provider, nil) + node3Bs := cluster.NewBootstrapper(provider, nil) // Have all nodes start a bootstrap basically in parallel, // ensure only 1 leader actually gets elected. @@ -284,6 +284,21 @@ func Test_MultiNodeClusterBootstrap(t *testing.T) { t.Fatalf("failed to find cluster leader: %s", err.Error()) } + // Ensure each node has the same leader! + leaderAddr, err := leader.WaitForLeader() + if err != nil { + t.Fatalf("failed to find cluster leader: %s", err.Error()) + } + for i, n := range []*Node{node1, node2, node3} { + addr, err := n.WaitForLeader() + if err != nil { + t.Fatalf("failed waiting for a leader on node %d: %s", i, err.Error()) + } + if exp, got := leaderAddr, addr; exp != got { + t.Fatalf("node %d has wrong leader, exp %s, got %s", i, exp, got) + } + } + // Run queries against cluster. tests := []struct { stmt string @@ -388,11 +403,11 @@ func Test_MultiNodeClusterBootstrapLaterJoin(t *testing.T) { provider := cluster.NewAddressProviderString( []string{node1.APIAddr, node2.APIAddr, node3.APIAddr}) - node1Bs := cluster.NewBootstrapper(provider, 3, nil) + node1Bs := cluster.NewBootstrapper(provider, nil) node1Bs.Interval = time.Second - node2Bs := cluster.NewBootstrapper(provider, 3, nil) + node2Bs := cluster.NewBootstrapper(provider, nil) node2Bs.Interval = time.Second - node3Bs := cluster.NewBootstrapper(provider, 3, nil) + node3Bs := cluster.NewBootstrapper(provider, nil) node3Bs.Interval = time.Second // Have all nodes start a bootstrap basically in parallel, @@ -451,7 +466,7 @@ func Test_MultiNodeClusterBootstrapLaterJoin(t *testing.T) { node4 := mustNewNode(false) node4.Store.BootstrapExpect = 3 defer node3.Deprovision() - node4Bs := cluster.NewBootstrapper(provider, 3, nil) + node4Bs := cluster.NewBootstrapper(provider, nil) node4Bs.Interval = time.Second done := func() bool { addr, _ := node4.Store.LeaderAddr() @@ -486,11 +501,11 @@ func Test_MultiNodeClusterBootstrapLaterJoinHTTPS(t *testing.T) { provider := cluster.NewAddressProviderString( []string{node1.APIAddr, node2.APIAddr, node3.APIAddr}) - node1Bs := cluster.NewBootstrapper(provider, 3, nil) + node1Bs := cluster.NewBootstrapper(provider, nil) node1Bs.Interval = time.Second - node2Bs := cluster.NewBootstrapper(provider, 3, nil) + node2Bs := cluster.NewBootstrapper(provider, nil) node2Bs.Interval = time.Second - node3Bs := cluster.NewBootstrapper(provider, 3, nil) + node3Bs := cluster.NewBootstrapper(provider, nil) node3Bs.Interval = time.Second // Have all nodes start a bootstrap basically in parallel, @@ -549,7 +564,7 @@ func Test_MultiNodeClusterBootstrapLaterJoinHTTPS(t *testing.T) { node4 := mustNewNodeEncrypted(false, true, true) node4.Store.BootstrapExpect = 3 defer node3.Deprovision() - node4Bs := cluster.NewBootstrapper(provider, 3, nil) + node4Bs := cluster.NewBootstrapper(provider, nil) node4Bs.Interval = time.Second done := func() bool { addr, _ := node4.Store.LeaderAddr()