diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4e46004d..acc6adc1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -13,7 +13,7 @@ If you open a pull request, please ensure the commit history is clean. Squash th Please avoid using libaries other than those available in the standard library, unless absolutely necessary. This requirement is relaxed somewhat for software other than rqlite node software itself. To understand why this approach is taken, check out this [post](https://blog.gopheracademy.com/advent-2014/case-against-3pl/). ## Building rqlite -*Building rqlite requires Go 1.8 or later. [gvm](https://github.com/moovweb/gvm) is a great tool for installing and managing your versions of Go.* +*Building rqlite requires Go 1.9 or later. [gvm](https://github.com/moovweb/gvm) is a great tool for installing and managing your versions of Go.* Download, build, and run rqlite like so (tested on 64-bit Kubuntu 14.04 and OSX): diff --git a/circle.yml b/circle.yml index 60020f87..cd5f47cd 100644 --- a/circle.yml +++ b/circle.yml @@ -1,6 +1,6 @@ machine: environment: - GODIST: "go1.8.1.linux-amd64.tar.gz" + GODIST: "go1.9.1.linux-amd64.tar.gz" post: - mkdir -p download diff --git a/vagrant_setup.sh b/vagrant_setup.sh index 1f148fd8..5e140672 100755 --- a/vagrant_setup.sh +++ b/vagrant_setup.sh @@ -9,8 +9,8 @@ apt-get install -y curl git bison make mercurial # Go bash < <(curl -s -S -L https://raw.githubusercontent.com/moovweb/gvm/master/binscripts/gvm-installer) source ~/.gvm/scripts/gvm -gvm install go1.8.1 -gvm use go1.8.1 +gvm install go1.9.1 +gvm use go1.9.1 # rqlite mkdir -p rqlite diff --git a/vendor/github.com/hashicorp/raft-boltdb/.travis.yml b/vendor/github.com/hashicorp/raft-boltdb/.travis.yml new file mode 100644 index 00000000..58357418 --- /dev/null +++ b/vendor/github.com/hashicorp/raft-boltdb/.travis.yml @@ -0,0 +1,10 @@ +language: go + +go: + - 1.6 + - 1.7 + - tip + +install: make deps +script: + - make test diff --git a/vendor/github.com/hashicorp/raft-boltdb/LICENSE b/vendor/github.com/hashicorp/raft-boltdb/LICENSE new file mode 100644 index 00000000..f0e5c79e --- /dev/null +++ b/vendor/github.com/hashicorp/raft-boltdb/LICENSE @@ -0,0 +1,362 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. "Contributor" + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. "Contributor Version" + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the terms of + a Secondary License. + +1.6. "Executable Form" + + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + + means a work that combines Covered Software with other material, in a + separate file or files, that is not Covered Software. + +1.8. "License" + + means this document. + +1.9. 
"Licensable" + + means having the right to grant, to the maximum extent possible, whether + at the time of the initial grant or subsequently, any and all of the + rights conveyed by this License. + +1.10. "Modifications" + + means any of the following: + + a. any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. "Patent Claims" of a Contributor + + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the License, + by the making, using, selling, offering for sale, having made, import, + or transfer of either its Contributions or its Contributor Version. + +1.12. "Secondary License" + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. "Source Code Form" + + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that controls, is + controlled by, or is under common control with You. For purposes of this + definition, "control" means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + + b. under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution + become effective for each Contribution on the date the Contributor first + distributes such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under + this License. No additional rights or licenses will be implied from the + distribution or licensing of Covered Software under this License. + Notwithstanding Section 2.1(b) above, no patent license is granted by a + Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of + its Contributions. + + This License does not grant any rights in the trademarks, service marks, + or logos of any Contributor (except as may be necessary to comply with + the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this + License (see Section 10.2) or under the terms of a Secondary License (if + permitted under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its + Contributions are its original creation(s) or it has sufficient rights to + grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + + This License is not intended to limit any rights You have under + applicable copyright doctrines of fair use, fair dealing, or other + equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under + the terms of this License. You must inform recipients that the Source + Code Form of the Covered Software is governed by the terms of this + License, and how they can obtain a copy of this License. You may not + attempt to alter or restrict the recipients' rights in the Source Code + Form. + +3.2. Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter the + recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for + the Covered Software. If the Larger Work is a combination of Covered + Software with a work governed by one or more Secondary Licenses, and the + Covered Software is not Incompatible With Secondary Licenses, this + License permits You to additionally distribute such Covered Software + under the terms of such Secondary License(s), so that the recipient of + the Larger Work may, at their option, further distribute the Covered + Software under the terms of either this License or such Secondary + License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices + (including copyright notices, patent notices, disclaimers of warranty, or + limitations of liability) contained within the Source Code Form of the + Covered Software, except that You may alter any license notices to the + extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. However, You may do so only on Your own behalf, and not on + behalf of any Contributor. 
You must make it absolutely clear that any + such warranty, support, indemnity, or liability obligation is offered by + You alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, + judicial order, or regulation then You must: (a) comply with the terms of + this License to the maximum extent possible; and (b) describe the + limitations and the code they affect. Such description must be placed in a + text file included with all distributions of the Covered Software under + this License. Except to the extent prohibited by statute or regulation, + such description must be sufficiently detailed for a recipient of ordinary + skill to be able to understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing + basis, if such Contributor fails to notify You of the non-compliance by + some reasonable means prior to 60 days after You have come back into + compliance. Moreover, Your grants from a particular Contributor are + reinstated on an ongoing basis if such Contributor notifies You of the + non-compliance by some reasonable means, this is the first time You have + received notice of non-compliance with this License from such + Contributor, and You become compliant prior to 30 days after Your receipt + of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, + counter-claims, and cross-claims) alleging that a Contributor Version + directly or indirectly infringes any patent, then the rights granted to + You by any and all Contributors for the Covered Software under Section + 2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. Disclaimer of Warranty + + Covered Software is provided under this License on an "as is" basis, + without warranty of any kind, either expressed, implied, or statutory, + including, without limitation, warranties that the Covered Software is free + of defects, merchantable, fit for a particular purpose or non-infringing. + The entire risk as to the quality and performance of the Covered Software + is with You. Should any Covered Software prove defective in any respect, + You (not any Contributor) assume the cost of any necessary servicing, + repair, or correction. This disclaimer of warranty constitutes an essential + part of this License. No use of any Covered Software is authorized under + this License except under this disclaimer. + +7. 
Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from + such party's negligence to the extent applicable law prohibits such + limitation. Some jurisdictions do not allow the exclusion or limitation of + incidental or consequential damages, so this exclusion and limitation may + not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts + of a jurisdiction where the defendant maintains its principal place of + business and such litigation shall be governed by laws of that + jurisdiction, without reference to its conflict-of-law provisions. Nothing + in this Section shall prevent a party's ability to bring cross-claims or + counter-claims. + +9. Miscellaneous + + This License represents the complete agreement concerning the subject + matter hereof. If any provision of this License is held to be + unenforceable, such provision shall be reformed only to the extent + necessary to make it enforceable. Any law or regulation which provides that + the language of a contract shall be construed against the drafter shall not + be used to construe this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version + of the License under which You originally received the Covered Software, + or under the terms of any subsequent version published by the license + steward. + +10.3. Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a + modified version of this License if you rename the license and remove + any references to the name of the license steward (except to note that + such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary + Licenses If You choose to distribute Source Code Form that is + Incompatible With Secondary Licenses under the terms of this version of + the License, the notice described in Exhibit B of this License must be + attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, +then You may include the notice in a location (such as a LICENSE file in a +relevant directory) where a recipient would be likely to look for such a +notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice + + This Source Code Form is "Incompatible + With Secondary Licenses", as defined by + the Mozilla Public License, v. 2.0. \ No newline at end of file diff --git a/vendor/github.com/hashicorp/raft-boltdb/Makefile b/vendor/github.com/hashicorp/raft-boltdb/Makefile new file mode 100644 index 00000000..bc5c6cc0 --- /dev/null +++ b/vendor/github.com/hashicorp/raft-boltdb/Makefile @@ -0,0 +1,11 @@ +DEPS = $(go list -f '{{range .TestImports}}{{.}} {{end}}' ./...) + +.PHONY: test deps + +test: + go test -timeout=30s ./... + +deps: + go get -d -v ./... + echo $(DEPS) | xargs -n1 go get -d + diff --git a/vendor/github.com/hashicorp/raft-boltdb/README.md b/vendor/github.com/hashicorp/raft-boltdb/README.md new file mode 100644 index 00000000..5d7180ab --- /dev/null +++ b/vendor/github.com/hashicorp/raft-boltdb/README.md @@ -0,0 +1,11 @@ +raft-boltdb +=========== + +This repository provides the `raftboltdb` package. The package exports the +`BoltStore` which is an implementation of both a `LogStore` and `StableStore`. + +It is meant to be used as a backend for the `raft` [package +here](https://github.com/hashicorp/raft). + +This implementation uses [BoltDB](https://github.com/boltdb/bolt). BoltDB is +a simple key/value store implemented in pure Go, and inspired by LMDB. diff --git a/vendor/github.com/hashicorp/raft-boltdb/bench_test.go b/vendor/github.com/hashicorp/raft-boltdb/bench_test.go new file mode 100644 index 00000000..b860706f --- /dev/null +++ b/vendor/github.com/hashicorp/raft-boltdb/bench_test.go @@ -0,0 +1,88 @@ +package raftboltdb + +import ( + "os" + "testing" + + "github.com/hashicorp/raft/bench" +) + +func BenchmarkBoltStore_FirstIndex(b *testing.B) { + store := testBoltStore(b) + defer store.Close() + defer os.Remove(store.path) + + raftbench.FirstIndex(b, store) +} + +func BenchmarkBoltStore_LastIndex(b *testing.B) { + store := testBoltStore(b) + defer store.Close() + defer os.Remove(store.path) + + raftbench.LastIndex(b, store) +} + +func BenchmarkBoltStore_GetLog(b *testing.B) { + store := testBoltStore(b) + defer store.Close() + defer os.Remove(store.path) + + raftbench.GetLog(b, store) +} + +func BenchmarkBoltStore_StoreLog(b *testing.B) { + store := testBoltStore(b) + defer store.Close() + defer os.Remove(store.path) + + raftbench.StoreLog(b, store) +} + +func BenchmarkBoltStore_StoreLogs(b *testing.B) { + store := testBoltStore(b) + defer store.Close() + defer os.Remove(store.path) + + raftbench.StoreLogs(b, store) +} + +func BenchmarkBoltStore_DeleteRange(b *testing.B) { + store := testBoltStore(b) + defer store.Close() + defer os.Remove(store.path) + + raftbench.DeleteRange(b, store) +} + +func BenchmarkBoltStore_Set(b *testing.B) { + store := testBoltStore(b) + defer store.Close() + defer os.Remove(store.path) + + raftbench.Set(b, store) +} + +func BenchmarkBoltStore_Get(b *testing.B) { + store := testBoltStore(b) + defer store.Close() + defer os.Remove(store.path) + + raftbench.Get(b, store) +} + +func BenchmarkBoltStore_SetUint64(b *testing.B) { + store := testBoltStore(b) + defer store.Close() + defer os.Remove(store.path) + + raftbench.SetUint64(b, store) +} + +func BenchmarkBoltStore_GetUint64(b *testing.B) { + store := testBoltStore(b) + defer store.Close() + defer os.Remove(store.path) + + raftbench.GetUint64(b, store) +} diff --git a/vendor/github.com/hashicorp/raft-boltdb/bolt_store.go 
b/vendor/github.com/hashicorp/raft-boltdb/bolt_store.go new file mode 100644 index 00000000..109a7989 --- /dev/null +++ b/vendor/github.com/hashicorp/raft-boltdb/bolt_store.go @@ -0,0 +1,255 @@ +package raftboltdb + +import ( + "errors" + + "github.com/boltdb/bolt" + "github.com/hashicorp/raft" +) + +const ( + // Permissions to use on the db file. This is only used if the + // database file does not exist and needs to be created. + dbFileMode = 0600 +) + +var ( + // Bucket names we perform transactions in + dbLogs = []byte("logs") + dbConf = []byte("conf") + + // An error indicating a given key does not exist + ErrKeyNotFound = errors.New("not found") +) + +// BoltStore provides access to BoltDB for Raft to store and retrieve +// log entries. It also provides key/value storage, and can be used as +// a LogStore and StableStore. +type BoltStore struct { + // conn is the underlying handle to the db. + conn *bolt.DB + + // The path to the Bolt database file + path string +} + +// Options contains all the configuraiton used to open the BoltDB +type Options struct { + // Path is the file path to the BoltDB to use + Path string + + // BoltOptions contains any specific BoltDB options you might + // want to specify [e.g. open timeout] + BoltOptions *bolt.Options +} + +// readOnly returns true if the contained bolt options say to open +// the DB in readOnly mode [this can be useful to tools that want +// to examine the log] +func (o *Options) readOnly() bool { + return o != nil && o.BoltOptions != nil && o.BoltOptions.ReadOnly +} + +// NewBoltStore takes a file path and returns a connected Raft backend. +func NewBoltStore(path string) (*BoltStore, error) { + return New(Options{Path: path}) +} + +// New uses the supplied options to open the BoltDB and prepare it for use as a raft backend. +func New(options Options) (*BoltStore, error) { + // Try to connect + handle, err := bolt.Open(options.Path, dbFileMode, options.BoltOptions) + if err != nil { + return nil, err + } + + // Create the new store + store := &BoltStore{ + conn: handle, + path: options.Path, + } + + // If the store was opened read-only, don't try and create buckets + if !options.readOnly() { + // Set up our buckets + if err := store.initialize(); err != nil { + store.Close() + return nil, err + } + } + return store, nil +} + +// initialize is used to set up all of the buckets. +func (b *BoltStore) initialize() error { + tx, err := b.conn.Begin(true) + if err != nil { + return err + } + defer tx.Rollback() + + // Create all the buckets + if _, err := tx.CreateBucketIfNotExists(dbLogs); err != nil { + return err + } + if _, err := tx.CreateBucketIfNotExists(dbConf); err != nil { + return err + } + + return tx.Commit() +} + +// Close is used to gracefully close the DB connection. +func (b *BoltStore) Close() error { + return b.conn.Close() +} + +// FirstIndex returns the first known index from the Raft log. +func (b *BoltStore) FirstIndex() (uint64, error) { + tx, err := b.conn.Begin(false) + if err != nil { + return 0, err + } + defer tx.Rollback() + + curs := tx.Bucket(dbLogs).Cursor() + if first, _ := curs.First(); first == nil { + return 0, nil + } else { + return bytesToUint64(first), nil + } +} + +// LastIndex returns the last known index from the Raft log. 
+func (b *BoltStore) LastIndex() (uint64, error) { + tx, err := b.conn.Begin(false) + if err != nil { + return 0, err + } + defer tx.Rollback() + + curs := tx.Bucket(dbLogs).Cursor() + if last, _ := curs.Last(); last == nil { + return 0, nil + } else { + return bytesToUint64(last), nil + } +} + +// GetLog is used to retrieve a log from BoltDB at a given index. +func (b *BoltStore) GetLog(idx uint64, log *raft.Log) error { + tx, err := b.conn.Begin(false) + if err != nil { + return err + } + defer tx.Rollback() + + bucket := tx.Bucket(dbLogs) + val := bucket.Get(uint64ToBytes(idx)) + + if val == nil { + return raft.ErrLogNotFound + } + return decodeMsgPack(val, log) +} + +// StoreLog is used to store a single raft log +func (b *BoltStore) StoreLog(log *raft.Log) error { + return b.StoreLogs([]*raft.Log{log}) +} + +// StoreLogs is used to store a set of raft logs +func (b *BoltStore) StoreLogs(logs []*raft.Log) error { + tx, err := b.conn.Begin(true) + if err != nil { + return err + } + defer tx.Rollback() + + for _, log := range logs { + key := uint64ToBytes(log.Index) + val, err := encodeMsgPack(log) + if err != nil { + return err + } + bucket := tx.Bucket(dbLogs) + if err := bucket.Put(key, val.Bytes()); err != nil { + return err + } + } + + return tx.Commit() +} + +// DeleteRange is used to delete logs within a given range inclusively. +func (b *BoltStore) DeleteRange(min, max uint64) error { + minKey := uint64ToBytes(min) + + tx, err := b.conn.Begin(true) + if err != nil { + return err + } + defer tx.Rollback() + + curs := tx.Bucket(dbLogs).Cursor() + for k, _ := curs.Seek(minKey); k != nil; k, _ = curs.Next() { + // Handle out-of-range log index + if bytesToUint64(k) > max { + break + } + + // Delete in-range log index + if err := curs.Delete(); err != nil { + return err + } + } + + return tx.Commit() +} + +// Set is used to set a key/value set outside of the raft log +func (b *BoltStore) Set(k, v []byte) error { + tx, err := b.conn.Begin(true) + if err != nil { + return err + } + defer tx.Rollback() + + bucket := tx.Bucket(dbConf) + if err := bucket.Put(k, v); err != nil { + return err + } + + return tx.Commit() +} + +// Get is used to retrieve a value from the k/v store by key +func (b *BoltStore) Get(k []byte) ([]byte, error) { + tx, err := b.conn.Begin(false) + if err != nil { + return nil, err + } + defer tx.Rollback() + + bucket := tx.Bucket(dbConf) + val := bucket.Get(k) + + if val == nil { + return nil, ErrKeyNotFound + } + return append([]byte(nil), val...), nil +} + +// SetUint64 is like Set, but handles uint64 values +func (b *BoltStore) SetUint64(key []byte, val uint64) error { + return b.Set(key, uint64ToBytes(val)) +} + +// GetUint64 is like Get, but handles uint64 values +func (b *BoltStore) GetUint64(key []byte) (uint64, error) { + val, err := b.Get(key) + if err != nil { + return 0, err + } + return bytesToUint64(val), nil +} diff --git a/vendor/github.com/hashicorp/raft-boltdb/bolt_store_test.go b/vendor/github.com/hashicorp/raft-boltdb/bolt_store_test.go new file mode 100644 index 00000000..12b09b21 --- /dev/null +++ b/vendor/github.com/hashicorp/raft-boltdb/bolt_store_test.go @@ -0,0 +1,416 @@ +package raftboltdb + +import ( + "bytes" + "io/ioutil" + "os" + "reflect" + "testing" + "time" + + "github.com/boltdb/bolt" + "github.com/hashicorp/raft" +) + +func testBoltStore(t testing.TB) *BoltStore { + fh, err := ioutil.TempFile("", "bolt") + if err != nil { + t.Fatalf("err: %s", err) + } + os.Remove(fh.Name()) + + // Successfully creates and returns a store + store, 
err := NewBoltStore(fh.Name()) + if err != nil { + t.Fatalf("err: %s", err) + } + + return store +} + +func testRaftLog(idx uint64, data string) *raft.Log { + return &raft.Log{ + Data: []byte(data), + Index: idx, + } +} + +func TestBoltStore_Implements(t *testing.T) { + var store interface{} = &BoltStore{} + if _, ok := store.(raft.StableStore); !ok { + t.Fatalf("BoltStore does not implement raft.StableStore") + } + if _, ok := store.(raft.LogStore); !ok { + t.Fatalf("BoltStore does not implement raft.LogStore") + } +} + +func TestBoltOptionsTimeout(t *testing.T) { + fh, err := ioutil.TempFile("", "bolt") + if err != nil { + t.Fatalf("err: %s", err) + } + os.Remove(fh.Name()) + defer os.Remove(fh.Name()) + options := Options{ + Path: fh.Name(), + BoltOptions: &bolt.Options{ + Timeout: time.Second / 10, + }, + } + store, err := New(options) + if err != nil { + t.Fatalf("err: %v", err) + } + defer store.Close() + // trying to open it again should timeout + doneCh := make(chan error, 1) + go func() { + _, err := New(options) + doneCh <- err + }() + select { + case err := <-doneCh: + if err == nil || err.Error() != "timeout" { + t.Errorf("Expected timeout error but got %v", err) + } + case <-time.After(5 * time.Second): + t.Errorf("Gave up waiting for timeout response") + } +} + +func TestBoltOptionsReadOnly(t *testing.T) { + fh, err := ioutil.TempFile("", "bolt") + if err != nil { + t.Fatalf("err: %s", err) + } + defer os.Remove(fh.Name()) + store, err := NewBoltStore(fh.Name()) + if err != nil { + t.Fatalf("err: %s", err) + } + // Create the log + log := &raft.Log{ + Data: []byte("log1"), + Index: 1, + } + // Attempt to store the log + if err := store.StoreLog(log); err != nil { + t.Fatalf("err: %s", err) + } + + store.Close() + options := Options{ + Path: fh.Name(), + BoltOptions: &bolt.Options{ + Timeout: time.Second / 10, + ReadOnly: true, + }, + } + roStore, err := New(options) + if err != nil { + t.Fatalf("err: %s", err) + } + defer roStore.Close() + result := new(raft.Log) + if err := roStore.GetLog(1, result); err != nil { + t.Fatalf("err: %s", err) + } + + // Ensure the log comes back the same + if !reflect.DeepEqual(log, result) { + t.Errorf("bad: %v", result) + } + // Attempt to store the log, should fail on a read-only store + err = roStore.StoreLog(log) + if err != bolt.ErrDatabaseReadOnly { + t.Errorf("expecting error %v, but got %v", bolt.ErrDatabaseReadOnly, err) + } +} + +func TestNewBoltStore(t *testing.T) { + fh, err := ioutil.TempFile("", "bolt") + if err != nil { + t.Fatalf("err: %s", err) + } + os.Remove(fh.Name()) + defer os.Remove(fh.Name()) + + // Successfully creates and returns a store + store, err := NewBoltStore(fh.Name()) + if err != nil { + t.Fatalf("err: %s", err) + } + + // Ensure the file was created + if store.path != fh.Name() { + t.Fatalf("unexpected file path %q", store.path) + } + if _, err := os.Stat(fh.Name()); err != nil { + t.Fatalf("err: %s", err) + } + + // Close the store so we can open again + if err := store.Close(); err != nil { + t.Fatalf("err: %s", err) + } + + // Ensure our tables were created + db, err := bolt.Open(fh.Name(), dbFileMode, nil) + if err != nil { + t.Fatalf("err: %s", err) + } + tx, err := db.Begin(true) + if err != nil { + t.Fatalf("err: %s", err) + } + if _, err := tx.CreateBucket([]byte(dbLogs)); err != bolt.ErrBucketExists { + t.Fatalf("bad: %v", err) + } + if _, err := tx.CreateBucket([]byte(dbConf)); err != bolt.ErrBucketExists { + t.Fatalf("bad: %v", err) + } +} + +func TestBoltStore_FirstIndex(t *testing.T) { + store 
:= testBoltStore(t) + defer store.Close() + defer os.Remove(store.path) + + // Should get 0 index on empty log + idx, err := store.FirstIndex() + if err != nil { + t.Fatalf("err: %s", err) + } + if idx != 0 { + t.Fatalf("bad: %v", idx) + } + + // Set a mock raft log + logs := []*raft.Log{ + testRaftLog(1, "log1"), + testRaftLog(2, "log2"), + testRaftLog(3, "log3"), + } + if err := store.StoreLogs(logs); err != nil { + t.Fatalf("bad: %s", err) + } + + // Fetch the first Raft index + idx, err = store.FirstIndex() + if err != nil { + t.Fatalf("err: %s", err) + } + if idx != 1 { + t.Fatalf("bad: %d", idx) + } +} + +func TestBoltStore_LastIndex(t *testing.T) { + store := testBoltStore(t) + defer store.Close() + defer os.Remove(store.path) + + // Should get 0 index on empty log + idx, err := store.LastIndex() + if err != nil { + t.Fatalf("err: %s", err) + } + if idx != 0 { + t.Fatalf("bad: %v", idx) + } + + // Set a mock raft log + logs := []*raft.Log{ + testRaftLog(1, "log1"), + testRaftLog(2, "log2"), + testRaftLog(3, "log3"), + } + if err := store.StoreLogs(logs); err != nil { + t.Fatalf("bad: %s", err) + } + + // Fetch the last Raft index + idx, err = store.LastIndex() + if err != nil { + t.Fatalf("err: %s", err) + } + if idx != 3 { + t.Fatalf("bad: %d", idx) + } +} + +func TestBoltStore_GetLog(t *testing.T) { + store := testBoltStore(t) + defer store.Close() + defer os.Remove(store.path) + + log := new(raft.Log) + + // Should return an error on non-existent log + if err := store.GetLog(1, log); err != raft.ErrLogNotFound { + t.Fatalf("expected raft log not found error, got: %v", err) + } + + // Set a mock raft log + logs := []*raft.Log{ + testRaftLog(1, "log1"), + testRaftLog(2, "log2"), + testRaftLog(3, "log3"), + } + if err := store.StoreLogs(logs); err != nil { + t.Fatalf("bad: %s", err) + } + + // Should return the proper log + if err := store.GetLog(2, log); err != nil { + t.Fatalf("err: %s", err) + } + if !reflect.DeepEqual(log, logs[1]) { + t.Fatalf("bad: %#v", log) + } +} + +func TestBoltStore_SetLog(t *testing.T) { + store := testBoltStore(t) + defer store.Close() + defer os.Remove(store.path) + + // Create the log + log := &raft.Log{ + Data: []byte("log1"), + Index: 1, + } + + // Attempt to store the log + if err := store.StoreLog(log); err != nil { + t.Fatalf("err: %s", err) + } + + // Retrieve the log again + result := new(raft.Log) + if err := store.GetLog(1, result); err != nil { + t.Fatalf("err: %s", err) + } + + // Ensure the log comes back the same + if !reflect.DeepEqual(log, result) { + t.Fatalf("bad: %v", result) + } +} + +func TestBoltStore_SetLogs(t *testing.T) { + store := testBoltStore(t) + defer store.Close() + defer os.Remove(store.path) + + // Create a set of logs + logs := []*raft.Log{ + testRaftLog(1, "log1"), + testRaftLog(2, "log2"), + } + + // Attempt to store the logs + if err := store.StoreLogs(logs); err != nil { + t.Fatalf("err: %s", err) + } + + // Ensure we stored them all + result1, result2 := new(raft.Log), new(raft.Log) + if err := store.GetLog(1, result1); err != nil { + t.Fatalf("err: %s", err) + } + if !reflect.DeepEqual(logs[0], result1) { + t.Fatalf("bad: %#v", result1) + } + if err := store.GetLog(2, result2); err != nil { + t.Fatalf("err: %s", err) + } + if !reflect.DeepEqual(logs[1], result2) { + t.Fatalf("bad: %#v", result2) + } +} + +func TestBoltStore_DeleteRange(t *testing.T) { + store := testBoltStore(t) + defer store.Close() + defer os.Remove(store.path) + + // Create a set of logs + log1 := testRaftLog(1, "log1") + log2 := 
testRaftLog(2, "log2") + log3 := testRaftLog(3, "log3") + logs := []*raft.Log{log1, log2, log3} + + // Attempt to store the logs + if err := store.StoreLogs(logs); err != nil { + t.Fatalf("err: %s", err) + } + + // Attempt to delete a range of logs + if err := store.DeleteRange(1, 2); err != nil { + t.Fatalf("err: %s", err) + } + + // Ensure the logs were deleted + if err := store.GetLog(1, new(raft.Log)); err != raft.ErrLogNotFound { + t.Fatalf("should have deleted log1") + } + if err := store.GetLog(2, new(raft.Log)); err != raft.ErrLogNotFound { + t.Fatalf("should have deleted log2") + } +} + +func TestBoltStore_Set_Get(t *testing.T) { + store := testBoltStore(t) + defer store.Close() + defer os.Remove(store.path) + + // Returns error on non-existent key + if _, err := store.Get([]byte("bad")); err != ErrKeyNotFound { + t.Fatalf("expected not found error, got: %q", err) + } + + k, v := []byte("hello"), []byte("world") + + // Try to set a k/v pair + if err := store.Set(k, v); err != nil { + t.Fatalf("err: %s", err) + } + + // Try to read it back + val, err := store.Get(k) + if err != nil { + t.Fatalf("err: %s", err) + } + if !bytes.Equal(val, v) { + t.Fatalf("bad: %v", val) + } +} + +func TestBoltStore_SetUint64_GetUint64(t *testing.T) { + store := testBoltStore(t) + defer store.Close() + defer os.Remove(store.path) + + // Returns error on non-existent key + if _, err := store.GetUint64([]byte("bad")); err != ErrKeyNotFound { + t.Fatalf("expected not found error, got: %q", err) + } + + k, v := []byte("abc"), uint64(123) + + // Attempt to set the k/v pair + if err := store.SetUint64(k, v); err != nil { + t.Fatalf("err: %s", err) + } + + // Read back the value + val, err := store.GetUint64(k) + if err != nil { + t.Fatalf("err: %s", err) + } + if val != v { + t.Fatalf("bad: %v", val) + } +} diff --git a/vendor/github.com/hashicorp/raft-boltdb/util.go b/vendor/github.com/hashicorp/raft-boltdb/util.go new file mode 100644 index 00000000..68dd786b --- /dev/null +++ b/vendor/github.com/hashicorp/raft-boltdb/util.go @@ -0,0 +1,37 @@ +package raftboltdb + +import ( + "bytes" + "encoding/binary" + + "github.com/hashicorp/go-msgpack/codec" +) + +// Decode reverses the encode operation on a byte slice input +func decodeMsgPack(buf []byte, out interface{}) error { + r := bytes.NewBuffer(buf) + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(r, &hd) + return dec.Decode(out) +} + +// Encode writes an encoded object to a new bytes buffer +func encodeMsgPack(in interface{}) (*bytes.Buffer, error) { + buf := bytes.NewBuffer(nil) + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(buf, &hd) + err := enc.Encode(in) + return buf, err +} + +// Converts bytes to an integer +func bytesToUint64(b []byte) uint64 { + return binary.BigEndian.Uint64(b) +} + +// Converts a uint to a byte slice +func uint64ToBytes(u uint64) []byte { + buf := make([]byte, 8) + binary.BigEndian.PutUint64(buf, u) + return buf +} diff --git a/vendor/github.com/hashicorp/raft/.gitignore b/vendor/github.com/hashicorp/raft/.gitignore new file mode 100644 index 00000000..83656241 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/.gitignore @@ -0,0 +1,23 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe +*.test diff --git a/vendor/github.com/hashicorp/raft/.travis.yml 
b/vendor/github.com/hashicorp/raft/.travis.yml new file mode 100644 index 00000000..94eb8668 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/.travis.yml @@ -0,0 +1,16 @@ +language: go + +go: + - 1.4 + - 1.5 + - 1.6 + - tip + +install: make deps +script: + - make integ + +notifications: + flowdock: + secure: fZrcf9rlh2IrQrlch1sHkn3YI7SKvjGnAl/zyV5D6NROe1Bbr6d3QRMuCXWWdhJHzjKmXk5rIzbqJhUc0PNF7YjxGNKSzqWMQ56KcvN1k8DzlqxpqkcA3Jbs6fXCWo2fssRtZ7hj/wOP1f5n6cc7kzHDt9dgaYJ6nO2fqNPJiTc= + diff --git a/vendor/github.com/hashicorp/raft/LICENSE b/vendor/github.com/hashicorp/raft/LICENSE new file mode 100644 index 00000000..c33dcc7c --- /dev/null +++ b/vendor/github.com/hashicorp/raft/LICENSE @@ -0,0 +1,354 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. “Contributor” + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. “Contributor Version” + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor’s Contribution. + +1.3. “Contribution” + + means Covered Software of a particular Contributor. + +1.4. “Covered Software” + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. “Incompatible With Secondary Licenses” + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of version + 1.1 or earlier of the License, but not also under the terms of a + Secondary License. + +1.6. “Executable Form” + + means any form of the work other than Source Code Form. + +1.7. “Larger Work” + + means a work that combines Covered Software with other material, in a separate + file or files, that is not Covered Software. + +1.8. “License” + + means this document. + +1.9. “Licensable” + + means having the right to grant, to the maximum extent possible, whether at the + time of the initial grant or subsequently, any and all of the rights conveyed by + this License. + +1.10. “Modifications” + + means any of the following: + + a. any file in Source Code Form that results from an addition to, deletion + from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. “Patent Claims” of a Contributor + + means any patent claim(s), including without limitation, method, process, + and apparatus claims, in any patent Licensable by such Contributor that + would be infringed, but for the grant of the License, by the making, + using, selling, offering for sale, having made, import, or transfer of + either its Contributions or its Contributor Version. + +1.12. “Secondary License” + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. “Source Code Form” + + means the form of the work preferred for making modifications. + +1.14. “You” (or “Your”) + + means an individual or a legal entity exercising rights under this + License. For legal entities, “You” includes any entity that controls, is + controlled by, or is under common control with You. 
For purposes of this + definition, “control” means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or as + part of a Larger Work; and + + b. under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its Contributions + or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution become + effective for each Contribution on the date the Contributor first distributes + such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under this + License. No additional rights or licenses will be implied from the distribution + or licensing of Covered Software under this License. Notwithstanding Section + 2.1(b) above, no patent license is granted by a Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party’s + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of its + Contributions. + + This License does not grant any rights in the trademarks, service marks, or + logos of any Contributor (except as may be necessary to comply with the + notice requirements in Section 3.4). + +2.4. Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this License + (see Section 10.2) or under the terms of a Secondary License (if permitted + under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its Contributions + are its original creation(s) or it has sufficient rights to grant the + rights to its Contributions conveyed by this License. + +2.6. Fair Use + + This License is not intended to limit any rights You have under applicable + copyright doctrines of fair use, fair dealing, or other equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under the + terms of this License. You must inform recipients that the Source Code Form + of the Covered Software is governed by the terms of this License, and how + they can obtain a copy of this License. You may not attempt to alter or + restrict the recipients’ rights in the Source Code Form. + +3.2. Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. 
such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. You may distribute such Executable Form under the terms of this License, + or sublicense it under different terms, provided that the license for + the Executable Form does not attempt to limit or alter the recipients’ + rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for the + Covered Software. If the Larger Work is a combination of Covered Software + with a work governed by one or more Secondary Licenses, and the Covered + Software is not Incompatible With Secondary Licenses, this License permits + You to additionally distribute such Covered Software under the terms of + such Secondary License(s), so that the recipient of the Larger Work may, at + their option, further distribute the Covered Software under the terms of + either this License or such Secondary License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices (including + copyright notices, patent notices, disclaimers of warranty, or limitations + of liability) contained within the Source Code Form of the Covered + Software, except that You may alter any license notices to the extent + required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. However, You may do so only on Your own behalf, and not on behalf + of any Contributor. You must make it absolutely clear that any such + warranty, support, indemnity, or liability obligation is offered by You + alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, judicial + order, or regulation then You must: (a) comply with the terms of this License + to the maximum extent possible; and (b) describe the limitations and the code + they affect. Such description must be placed in a text file included with all + distributions of the Covered Software under this License. Except to the + extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to + understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. 
However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing basis, + if such Contributor fails to notify You of the non-compliance by some + reasonable means prior to 60 days after You have come back into compliance. + Moreover, Your grants from a particular Contributor are reinstated on an + ongoing basis if such Contributor notifies You of the non-compliance by + some reasonable means, this is the first time You have received notice of + non-compliance with this License from such Contributor, and You become + compliant prior to 30 days after Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, counter-claims, + and cross-claims) alleging that a Contributor Version directly or + indirectly infringes any patent, then the rights granted to You by any and + all Contributors for the Covered Software under Section 2.1 of this License + shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. Disclaimer of Warranty + + Covered Software is provided under this License on an “as is” basis, without + warranty of any kind, either expressed, implied, or statutory, including, + without limitation, warranties that the Covered Software is free of defects, + merchantable, fit for a particular purpose or non-infringing. The entire + risk as to the quality and performance of the Covered Software is with You. + Should any Covered Software prove defective in any respect, You (not any + Contributor) assume the cost of any necessary servicing, repair, or + correction. This disclaimer of warranty constitutes an essential part of this + License. No use of any Covered Software is authorized under this License + except under this disclaimer. + +7. Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from such + party’s negligence to the extent applicable law prohibits such limitation. + Some jurisdictions do not allow the exclusion or limitation of incidental or + consequential damages, so this exclusion and limitation may not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts of + a jurisdiction where the defendant maintains its principal place of business + and such litigation shall be governed by laws of that jurisdiction, without + reference to its conflict-of-law provisions. Nothing in this Section shall + prevent a party’s ability to bring cross-claims or counter-claims. + +9. 
Miscellaneous + + This License represents the complete agreement concerning the subject matter + hereof. If any provision of this License is held to be unenforceable, such + provision shall be reformed only to the extent necessary to make it + enforceable. Any law or regulation which provides that the language of a + contract shall be construed against the drafter shall not be used to construe + this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version of + the License under which You originally received the Covered Software, or + under the terms of any subsequent version published by the license + steward. + +10.3. Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a modified + version of this License if you rename the license and remove any + references to the name of the license steward (except to note that such + modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses + If You choose to distribute Source Code Form that is Incompatible With + Secondary Licenses under the terms of this version of the License, the + notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, then +You may include the notice in a location (such as a LICENSE file in a relevant +directory) where a recipient would be likely to look for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - “Incompatible With Secondary Licenses” Notice + + This Source Code Form is “Incompatible + With Secondary Licenses”, as defined by + the Mozilla Public License, v. 2.0. + diff --git a/vendor/github.com/hashicorp/raft/Makefile b/vendor/github.com/hashicorp/raft/Makefile new file mode 100644 index 00000000..92a0c0b4 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/Makefile @@ -0,0 +1,17 @@ +DEPS = $(go list -f '{{range .TestImports}}{{.}} {{end}}' ./...) + +test: + go test -timeout=30s ./... + +integ: test + INTEG_TESTS=yes go test -timeout=23s -run=Integ ./... + +deps: + go get -d -v ./... + echo $(DEPS) | xargs -n1 go get -d + +cov: + INTEG_TESTS=yes gocov test github.com/hashicorp/raft | gocov-html > /tmp/coverage.html + open /tmp/coverage.html + +.PHONY: test cov integ deps diff --git a/vendor/github.com/hashicorp/raft/README.md b/vendor/github.com/hashicorp/raft/README.md new file mode 100644 index 00000000..a70ec8a0 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/README.md @@ -0,0 +1,107 @@ +raft [![Build Status](https://travis-ci.org/hashicorp/raft.png)](https://travis-ci.org/hashicorp/raft) +==== + +raft is a [Go](http://www.golang.org) library that manages a replicated +log and can be used with an FSM to manage replicated state machines. 
It +is a library for providing [consensus](http://en.wikipedia.org/wiki/Consensus_(computer_science)). + +The use cases for such a library are far-reaching as replicated state +machines are a key component of many distributed systems. They enable +building Consistent, Partition Tolerant (CP) systems, with limited +fault tolerance as well. + +## Building + +If you wish to build raft you'll need Go version 1.2+ installed. + +Please check your installation with: + +``` +go version +``` + +## Documentation + +For complete documentation, see the associated [Godoc](http://godoc.org/github.com/hashicorp/raft). + +To prevent complications with cgo, the primary backend `MDBStore` is in a separate repository, +called [raft-mdb](http://github.com/hashicorp/raft-mdb). That is the recommended implementation +for the `LogStore` and `StableStore`. + +A pure Go backend using [BoltDB](https://github.com/boltdb/bolt) is also available called +[raft-boltdb](https://github.com/hashicorp/raft-boltdb). It can also be used as a `LogStore` +and `StableStore`. + +## Tagged Releases + +As of September 2017, Hashicorp will start using tags for this library to clearly indicate +major version updates. We recommend you vendor your application's dependency on this library. + +* v0.1.0 is the original stable version of the library that was in master and has been maintained +with no breaking API changes. This was in use by Consul prior to version 0.7.0. + +* v1.0.0 takes the changes that were staged in the library-v2-stage-one branch. This version +manages server identities using a UUID, so introduces some breaking API changes. It also versions +the Raft protocol, and requires some special steps when interoperating with Raft servers running +older versions of the library (see the detailed comment in config.go about version compatibility). +You can reference https://github.com/hashicorp/consul/pull/2222 for an idea of what was required +to port Consul to these new interfaces. + + This version includes some new features as well, including non voting servers, a new address + provider abstraction in the transport layer, and more resilient snapshots. + +## Protocol + +raft is based on ["Raft: In Search of an Understandable Consensus Algorithm"](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf) + +A high level overview of the Raft protocol is described below, but for details please read the full +[Raft paper](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf) +followed by the raft source. Any questions about the raft protocol should be sent to the +[raft-dev mailing list](https://groups.google.com/forum/#!forum/raft-dev). + +### Protocol Description + +Raft nodes are always in one of three states: follower, candidate or leader. All +nodes initially start out as a follower. In this state, nodes can accept log entries +from a leader and cast votes. If no entries are received for some time, nodes +self-promote to the candidate state. In the candidate state nodes request votes from +their peers. If a candidate receives a quorum of votes, then it is promoted to a leader. +The leader must accept new log entries and replicate to all the other followers. +In addition, if stale reads are not acceptable, all queries must also be performed on +the leader. + +Once a cluster has a leader, it is able to accept new log entries. A client can +request that a leader append a new log entry, which is an opaque binary blob to +Raft. 
The leader then writes the entry to durable storage and attempts to replicate +to a quorum of followers. Once the log entry is considered *committed*, it can be +*applied* to a finite state machine. The finite state machine is application specific, +and is implemented using an interface. + +An obvious question relates to the unbounded nature of a replicated log. Raft provides +a mechanism by which the current state is snapshotted, and the log is compacted. Because +of the FSM abstraction, restoring the state of the FSM must result in the same state +as a replay of old logs. This allows Raft to capture the FSM state at a point in time, +and then remove all the logs that were used to reach that state. This is performed automatically +without user intervention, and prevents unbounded disk usage as well as minimizing +time spent replaying logs. + +Lastly, there is the issue of updating the peer set when new servers are joining +or existing servers are leaving. As long as a quorum of nodes is available, this +is not an issue as Raft provides mechanisms to dynamically update the peer set. +If a quorum of nodes is unavailable, then this becomes a very challenging issue. +For example, suppose there are only 2 peers, A and B. The quorum size is also +2, meaning both nodes must agree to commit a log entry. If either A or B fails, +it is now impossible to reach quorum. This means the cluster is unable to add, +or remove a node, or commit any additional log entries. This results in *unavailability*. +At this point, manual intervention would be required to remove either A or B, +and to restart the remaining node in bootstrap mode. + +A Raft cluster of 3 nodes can tolerate a single node failure, while a cluster +of 5 can tolerate 2 node failures. The recommended configuration is to either +run 3 or 5 raft servers. This maximizes availability without +greatly sacrificing performance. + +In terms of performance, Raft is comparable to Paxos. Assuming stable leadership, +committing a log entry requires a single round trip to half of the cluster. +Thus performance is bound by disk I/O and network latency. + diff --git a/vendor/github.com/hashicorp/raft/bench/bench.go b/vendor/github.com/hashicorp/raft/bench/bench.go new file mode 100644 index 00000000..d7a58f45 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/bench/bench.go @@ -0,0 +1,171 @@ +package raftbench + +// raftbench provides common benchmarking functions which can be used by +// anything which implements the raft.LogStore and raft.StableStore interfaces. +// All functions accept these interfaces and perform benchmarking. This +// makes comparing backend performance easier by sharing the tests. 
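+//
+// As an illustrative sketch (the backend type and constructor below are
+// assumptions, not part of this package), a LogStore implementation would
+// typically reuse these helpers from its own benchmark functions:
+//
+//    func BenchmarkMyStore_FirstIndex(b *testing.B) {
+//        store := newTestMyStore(b) // hypothetical helper that builds the backend under test
+//        raftbench.FirstIndex(b, store)
+//    }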
+ +import ( + "github.com/hashicorp/raft" + "testing" +) + +func FirstIndex(b *testing.B, store raft.LogStore) { + // Create some fake data + var logs []*raft.Log + for i := 1; i < 10; i++ { + logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) + } + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + b.ResetTimer() + + // Run FirstIndex a number of times + for n := 0; n < b.N; n++ { + store.FirstIndex() + } +} + +func LastIndex(b *testing.B, store raft.LogStore) { + // Create some fake data + var logs []*raft.Log + for i := 1; i < 10; i++ { + logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) + } + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + b.ResetTimer() + + // Run LastIndex a number of times + for n := 0; n < b.N; n++ { + store.LastIndex() + } +} + +func GetLog(b *testing.B, store raft.LogStore) { + // Create some fake data + var logs []*raft.Log + for i := 1; i < 10; i++ { + logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) + } + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + b.ResetTimer() + + // Run GetLog a number of times + for n := 0; n < b.N; n++ { + if err := store.GetLog(5, new(raft.Log)); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func StoreLog(b *testing.B, store raft.LogStore) { + // Run StoreLog a number of times + for n := 0; n < b.N; n++ { + log := &raft.Log{Index: uint64(n), Data: []byte("data")} + if err := store.StoreLog(log); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func StoreLogs(b *testing.B, store raft.LogStore) { + // Run StoreLogs a number of times. We want to set multiple logs each + // run, so we create 3 logs with incrementing indexes for each iteration. + for n := 0; n < b.N; n++ { + b.StopTimer() + offset := 3 * (n + 1) + logs := []*raft.Log{ + &raft.Log{Index: uint64(offset - 2), Data: []byte("data")}, + &raft.Log{Index: uint64(offset - 1), Data: []byte("data")}, + &raft.Log{Index: uint64(offset), Data: []byte("data")}, + } + b.StartTimer() + + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func DeleteRange(b *testing.B, store raft.LogStore) { + // Create some fake data. In this case, we create 3 new log entries for each + // test case, and separate them by index in multiples of 10. This allows + // some room so that we can test deleting ranges with "extra" logs to + // to ensure we stop going to the database once our max index is hit. 
+ var logs []*raft.Log + for n := 0; n < b.N; n++ { + offset := 10 * n + for i := offset; i < offset+3; i++ { + logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) + } + } + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + b.ResetTimer() + + // Delete a range of the data + for n := 0; n < b.N; n++ { + offset := 10 * n + if err := store.DeleteRange(uint64(offset), uint64(offset+9)); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func Set(b *testing.B, store raft.StableStore) { + // Run Set a number of times + for n := 0; n < b.N; n++ { + if err := store.Set([]byte{byte(n)}, []byte("val")); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func Get(b *testing.B, store raft.StableStore) { + // Create some fake data + for i := 1; i < 10; i++ { + if err := store.Set([]byte{byte(i)}, []byte("val")); err != nil { + b.Fatalf("err: %s", err) + } + } + b.ResetTimer() + + // Run Get a number of times + for n := 0; n < b.N; n++ { + if _, err := store.Get([]byte{0x05}); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func SetUint64(b *testing.B, store raft.StableStore) { + // Run SetUint64 a number of times + for n := 0; n < b.N; n++ { + if err := store.SetUint64([]byte{byte(n)}, uint64(n)); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func GetUint64(b *testing.B, store raft.StableStore) { + // Create some fake data + for i := 0; i < 10; i++ { + if err := store.SetUint64([]byte{byte(i)}, uint64(i)); err != nil { + b.Fatalf("err: %s", err) + } + } + b.ResetTimer() + + // Run GetUint64 a number of times + for n := 0; n < b.N; n++ { + if _, err := store.Get([]byte{0x05}); err != nil { + b.Fatalf("err: %s", err) + } + } +} diff --git a/vendor/github.com/hashicorp/raft/commands.go b/vendor/github.com/hashicorp/raft/commands.go new file mode 100644 index 00000000..739775b3 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/commands.go @@ -0,0 +1,84 @@ +package raft + +// AppendEntriesRequest is the command used to append entries to the +// replicated log. +type AppendEntriesRequest struct { + // Provide the current term and leader + Term uint64 + Leader []byte + + // Provide the previous entries for integrity checking + PrevLogEntry uint64 + PrevLogTerm uint64 + + // New entries to commit + Entries []*Log + + // Commit index on the leader + LeaderCommitIndex uint64 +} + +// AppendEntriesResponse is the response returned from an +// AppendEntriesRequest. +type AppendEntriesResponse struct { + // Newer term if leader is out of date + Term uint64 + + // Last Log is a hint to help accelerate rebuilding slow nodes + LastLog uint64 + + // We may not succeed if we have a conflicting entry + Success bool + + // There are scenarios where this request didn't succeed + // but there's no need to wait/back-off the next attempt. + NoRetryBackoff bool +} + +// RequestVoteRequest is the command used by a candidate to ask a Raft peer +// for a vote in an election. +type RequestVoteRequest struct { + // Provide the term and our id + Term uint64 + Candidate []byte + + // Used to ensure safety + LastLogIndex uint64 + LastLogTerm uint64 +} + +// RequestVoteResponse is the response returned from a RequestVoteRequest. 
+type RequestVoteResponse struct { + // Newer term if leader is out of date + Term uint64 + + // Return the peers, so that a node can shutdown on removal + Peers []byte + + // Is the vote granted + Granted bool +} + +// InstallSnapshotRequest is the command sent to a Raft peer to bootstrap its +// log (and state machine) from a snapshot on another peer. +type InstallSnapshotRequest struct { + Term uint64 + Leader []byte + + // These are the last index/term included in the snapshot + LastLogIndex uint64 + LastLogTerm uint64 + + // Peer Set in the snapshot + Peers []byte + + // Size of the snapshot + Size int64 +} + +// InstallSnapshotResponse is the response returned from an +// InstallSnapshotRequest. +type InstallSnapshotResponse struct { + Term uint64 + Success bool +} diff --git a/vendor/github.com/hashicorp/raft/config.go b/vendor/github.com/hashicorp/raft/config.go new file mode 100644 index 00000000..2dbd5e60 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/config.go @@ -0,0 +1,136 @@ +package raft + +import ( + "fmt" + "io" + "log" + "time" +) + +// Config provides any necessary configuration to +// the Raft server +type Config struct { + // HeartbeatTimeout specifies the time in follower state without + // a leader before we attempt an election. + HeartbeatTimeout time.Duration + + // ElectionTimeout specifies the time in candidate state without + // a leader before we attempt an election. + ElectionTimeout time.Duration + + // CommitTimeout controls the time without an Apply() operation + // before we heartbeat to ensure a timely commit. Due to random + // staggering, may be delayed as much as 2x this value. + CommitTimeout time.Duration + + // MaxAppendEntries controls the maximum number of append entries + // to send at once. We want to strike a balance between efficiency + // and avoiding waste if the follower is going to reject because of + // an inconsistent log. + MaxAppendEntries int + + // If we are a member of a cluster, and RemovePeer is invoked for the + // local node, then we forget all peers and transition into the follower state. + // If ShutdownOnRemove is is set, we additional shutdown Raft. Otherwise, + // we can become a leader of a cluster containing only this node. + ShutdownOnRemove bool + + // DisableBootstrapAfterElect is used to turn off EnableSingleNode + // after the node is elected. This is used to prevent self-election + // if the node is removed from the Raft cluster via RemovePeer. Setting + // it to false will keep the bootstrap mode, allowing the node to self-elect + // and potentially bootstrap a separate cluster. + DisableBootstrapAfterElect bool + + // TrailingLogs controls how many logs we leave after a snapshot. This is + // used so that we can quickly replay logs on a follower instead of being + // forced to send an entire snapshot. + TrailingLogs uint64 + + // SnapshotInterval controls how often we check if we should perform a snapshot. + // We randomly stagger between this value and 2x this value to avoid the entire + // cluster from performing a snapshot at once. + SnapshotInterval time.Duration + + // SnapshotThreshold controls how many outstanding logs there must be before + // we perform a snapshot. This is to prevent excessive snapshots when we can + // just replay a small set of logs. + SnapshotThreshold uint64 + + // EnableSingleNode allows for a single node mode of operation. This + // is false by default, which prevents a lone node from electing itself. + // leader. 
+ EnableSingleNode bool + + // LeaderLeaseTimeout is used to control how long the "lease" lasts + // for being the leader without being able to contact a quorum + // of nodes. If we reach this interval without contact, we will + // step down as leader. + LeaderLeaseTimeout time.Duration + + // StartAsLeader forces Raft to start in the leader state. This should + // never be used except for testing purposes, as it can cause a split-brain. + StartAsLeader bool + + // NotifyCh is used to provide a channel that will be notified of leadership + // changes. Raft will block writing to this channel, so it should either be + // buffered or aggressively consumed. + NotifyCh chan<- bool + + // LogOutput is used as a sink for logs, unless Logger is specified. + // Defaults to os.Stderr. + LogOutput io.Writer + + // Logger is a user-provided logger. If nil, a logger writing to LogOutput + // is used. + Logger *log.Logger +} + +// DefaultConfig returns a Config with usable defaults. +func DefaultConfig() *Config { + return &Config{ + HeartbeatTimeout: 1000 * time.Millisecond, + ElectionTimeout: 1000 * time.Millisecond, + CommitTimeout: 50 * time.Millisecond, + MaxAppendEntries: 64, + ShutdownOnRemove: true, + DisableBootstrapAfterElect: true, + TrailingLogs: 10240, + SnapshotInterval: 120 * time.Second, + SnapshotThreshold: 8192, + EnableSingleNode: false, + LeaderLeaseTimeout: 500 * time.Millisecond, + } +} + +// ValidateConfig is used to validate a sane configuration +func ValidateConfig(config *Config) error { + if config.HeartbeatTimeout < 5*time.Millisecond { + return fmt.Errorf("Heartbeat timeout is too low") + } + if config.ElectionTimeout < 5*time.Millisecond { + return fmt.Errorf("Election timeout is too low") + } + if config.CommitTimeout < time.Millisecond { + return fmt.Errorf("Commit timeout is too low") + } + if config.MaxAppendEntries <= 0 { + return fmt.Errorf("MaxAppendEntries must be positive") + } + if config.MaxAppendEntries > 1024 { + return fmt.Errorf("MaxAppendEntries is too large") + } + if config.SnapshotInterval < 5*time.Millisecond { + return fmt.Errorf("Snapshot interval is too low") + } + if config.LeaderLeaseTimeout < 5*time.Millisecond { + return fmt.Errorf("Leader lease timeout is too low") + } + if config.LeaderLeaseTimeout > config.HeartbeatTimeout { + return fmt.Errorf("Leader lease timeout cannot be larger than heartbeat timeout") + } + if config.ElectionTimeout < config.HeartbeatTimeout { + return fmt.Errorf("Election timeout must be equal or greater than Heartbeat Timeout") + } + return nil +} diff --git a/vendor/github.com/hashicorp/raft/discard_snapshot.go b/vendor/github.com/hashicorp/raft/discard_snapshot.go new file mode 100644 index 00000000..1b4611d5 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/discard_snapshot.go @@ -0,0 +1,48 @@ +package raft + +import ( + "fmt" + "io" +) + +// DiscardSnapshotStore is used to successfully snapshot while +// always discarding the snapshot. This is useful for when the +// log should be truncated but no snapshot should be retained. +// This should never be used for production use, and is only +// suitable for testing. +type DiscardSnapshotStore struct{} + +type DiscardSnapshotSink struct{} + +// NewDiscardSnapshotStore is used to create a new DiscardSnapshotStore. 
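+// Because it satisfies the SnapshotStore interface, it can be dropped in
+// wherever a real store would be used during tests; a minimal sketch:
+//
+//    var snaps SnapshotStore = NewDiscardSnapshotStore()
+//    sink, _ := snaps.Create(10, 3, nil) // index, term, peers
+//    _, _ = sink.Write([]byte("ignored"))
+//    _ = sink.Close()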
+func NewDiscardSnapshotStore() *DiscardSnapshotStore { + return &DiscardSnapshotStore{} +} + +func (d *DiscardSnapshotStore) Create(index, term uint64, peers []byte) (SnapshotSink, error) { + return &DiscardSnapshotSink{}, nil +} + +func (d *DiscardSnapshotStore) List() ([]*SnapshotMeta, error) { + return nil, nil +} + +func (d *DiscardSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) { + return nil, nil, fmt.Errorf("open is not supported") +} + +func (d *DiscardSnapshotSink) Write(b []byte) (int, error) { + return len(b), nil +} + +func (d *DiscardSnapshotSink) Close() error { + return nil +} + +func (d *DiscardSnapshotSink) ID() string { + return "discard" +} + +func (d *DiscardSnapshotSink) Cancel() error { + return nil +} diff --git a/vendor/github.com/hashicorp/raft/discard_snapshot_test.go b/vendor/github.com/hashicorp/raft/discard_snapshot_test.go new file mode 100644 index 00000000..5abedfe2 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/discard_snapshot_test.go @@ -0,0 +1,17 @@ +package raft + +import "testing" + +func TestDiscardSnapshotStoreImpl(t *testing.T) { + var impl interface{} = &DiscardSnapshotStore{} + if _, ok := impl.(SnapshotStore); !ok { + t.Fatalf("DiscardSnapshotStore not a SnapshotStore") + } +} + +func TestDiscardSnapshotSinkImpl(t *testing.T) { + var impl interface{} = &DiscardSnapshotSink{} + if _, ok := impl.(SnapshotSink); !ok { + t.Fatalf("DiscardSnapshotSink not a SnapshotSink") + } +} diff --git a/vendor/github.com/hashicorp/raft/file_snapshot.go b/vendor/github.com/hashicorp/raft/file_snapshot.go new file mode 100644 index 00000000..5b6ccc4e --- /dev/null +++ b/vendor/github.com/hashicorp/raft/file_snapshot.go @@ -0,0 +1,513 @@ +package raft + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "hash" + "hash/crc64" + "io" + "io/ioutil" + "log" + "os" + "path/filepath" + "runtime" + "sort" + "strings" + "time" +) + +const ( + testPath = "permTest" + snapPath = "snapshots" + metaFilePath = "meta.json" + stateFilePath = "state.bin" + tmpSuffix = ".tmp" +) + +// FileSnapshotStore implements the SnapshotStore interface and allows +// snapshots to be made on the local disk. +type FileSnapshotStore struct { + path string + retain int + logger *log.Logger +} + +type snapMetaSlice []*fileSnapshotMeta + +// FileSnapshotSink implements SnapshotSink with a file. +type FileSnapshotSink struct { + store *FileSnapshotStore + logger *log.Logger + dir string + parentDir string + meta fileSnapshotMeta + + stateFile *os.File + stateHash hash.Hash64 + buffered *bufio.Writer + + closed bool +} + +// fileSnapshotMeta is stored on disk. We also put a CRC +// on disk so that we can verify the snapshot. +type fileSnapshotMeta struct { + SnapshotMeta + CRC []byte +} + +// bufferedFile is returned when we open a snapshot. This way +// reads are buffered and the file still gets closed. +type bufferedFile struct { + bh *bufio.Reader + fh *os.File +} + +func (b *bufferedFile) Read(p []byte) (n int, err error) { + return b.bh.Read(p) +} + +func (b *bufferedFile) Close() error { + return b.fh.Close() +} + +// NewFileSnapshotStoreWithLogger creates a new FileSnapshotStore based +// on a base directory. The `retain` parameter controls how many +// snapshots are retained. Must be at least 1. 
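+// A typical construction (the path and retain count here are illustrative):
+//
+//    logger := log.New(os.Stderr, "[snapshot] ", log.LstdFlags)
+//    store, err := NewFileSnapshotStoreWithLogger("/var/lib/raft", 3, logger)
+//    if err != nil {
+//        // handle a missing or unwritable base directory
+//    }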
+func NewFileSnapshotStoreWithLogger(base string, retain int, logger *log.Logger) (*FileSnapshotStore, error) { + if retain < 1 { + return nil, fmt.Errorf("must retain at least one snapshot") + } + if logger == nil { + logger = log.New(os.Stderr, "", log.LstdFlags) + } + + // Ensure our path exists + path := filepath.Join(base, snapPath) + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return nil, fmt.Errorf("snapshot path not accessible: %v", err) + } + + // Setup the store + store := &FileSnapshotStore{ + path: path, + retain: retain, + logger: logger, + } + + // Do a permissions test + if err := store.testPermissions(); err != nil { + return nil, fmt.Errorf("permissions test failed: %v", err) + } + return store, nil +} + +// NewFileSnapshotStore creates a new FileSnapshotStore based +// on a base directory. The `retain` parameter controls how many +// snapshots are retained. Must be at least 1. +func NewFileSnapshotStore(base string, retain int, logOutput io.Writer) (*FileSnapshotStore, error) { + if logOutput == nil { + logOutput = os.Stderr + } + return NewFileSnapshotStoreWithLogger(base, retain, log.New(logOutput, "", log.LstdFlags)) +} + +// testPermissions tries to touch a file in our path to see if it works. +func (f *FileSnapshotStore) testPermissions() error { + path := filepath.Join(f.path, testPath) + fh, err := os.Create(path) + if err != nil { + return err + } + + if err = fh.Close(); err != nil { + return err + } + + if err = os.Remove(path); err != nil { + return err + } + return nil +} + +// snapshotName generates a name for the snapshot. +func snapshotName(term, index uint64) string { + now := time.Now() + msec := now.UnixNano() / int64(time.Millisecond) + return fmt.Sprintf("%d-%d-%d", term, index, msec) +} + +// Create is used to start a new snapshot +func (f *FileSnapshotStore) Create(index, term uint64, peers []byte) (SnapshotSink, error) { + // Create a new path + name := snapshotName(term, index) + path := filepath.Join(f.path, name+tmpSuffix) + f.logger.Printf("[INFO] snapshot: Creating new snapshot at %s", path) + + // Make the directory + if err := os.MkdirAll(path, 0755); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to make snapshot directory: %v", err) + return nil, err + } + + // Create the sink + sink := &FileSnapshotSink{ + store: f, + logger: f.logger, + dir: path, + parentDir: f.path, + meta: fileSnapshotMeta{ + SnapshotMeta: SnapshotMeta{ + ID: name, + Index: index, + Term: term, + Peers: peers, + }, + CRC: nil, + }, + } + + // Write out the meta data + if err := sink.writeMeta(); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err) + return nil, err + } + + // Open the state file + statePath := filepath.Join(path, stateFilePath) + fh, err := os.Create(statePath) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to create state file: %v", err) + return nil, err + } + sink.stateFile = fh + + // Create a CRC64 hash + sink.stateHash = crc64.New(crc64.MakeTable(crc64.ECMA)) + + // Wrap both the hash and file in a MultiWriter with buffering + multi := io.MultiWriter(sink.stateFile, sink.stateHash) + sink.buffered = bufio.NewWriter(multi) + + // Done + return sink, nil +} + +// List returns available snapshots in the store. 
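+// Snapshots are returned newest first and at most `retain` entries are
+// included. An illustrative listing loop:
+//
+//    metas, err := store.List()
+//    if err == nil {
+//        for _, m := range metas {
+//            fmt.Println(m.ID, m.Index, m.Term, m.Size)
+//        }
+//    }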
+func (f *FileSnapshotStore) List() ([]*SnapshotMeta, error) { + // Get the eligible snapshots + snapshots, err := f.getSnapshots() + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err) + return nil, err + } + + var snapMeta []*SnapshotMeta + for _, meta := range snapshots { + snapMeta = append(snapMeta, &meta.SnapshotMeta) + if len(snapMeta) == f.retain { + break + } + } + return snapMeta, nil +} + +// getSnapshots returns all the known snapshots. +func (f *FileSnapshotStore) getSnapshots() ([]*fileSnapshotMeta, error) { + // Get the eligible snapshots + snapshots, err := ioutil.ReadDir(f.path) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to scan snapshot dir: %v", err) + return nil, err + } + + // Populate the metadata + var snapMeta []*fileSnapshotMeta + for _, snap := range snapshots { + // Ignore any files + if !snap.IsDir() { + continue + } + + // Ignore any temporary snapshots + dirName := snap.Name() + if strings.HasSuffix(dirName, tmpSuffix) { + f.logger.Printf("[WARN] snapshot: Found temporary snapshot: %v", dirName) + continue + } + + // Try to read the meta data + meta, err := f.readMeta(dirName) + if err != nil { + f.logger.Printf("[WARN] snapshot: Failed to read metadata for %v: %v", dirName, err) + continue + } + + // Append, but only return up to the retain count + snapMeta = append(snapMeta, meta) + } + + // Sort the snapshot, reverse so we get new -> old + sort.Sort(sort.Reverse(snapMetaSlice(snapMeta))) + + return snapMeta, nil +} + +// readMeta is used to read the meta data for a given named backup +func (f *FileSnapshotStore) readMeta(name string) (*fileSnapshotMeta, error) { + // Open the meta file + metaPath := filepath.Join(f.path, name, metaFilePath) + fh, err := os.Open(metaPath) + if err != nil { + return nil, err + } + defer fh.Close() + + // Buffer the file IO + buffered := bufio.NewReader(fh) + + // Read in the JSON + meta := &fileSnapshotMeta{} + dec := json.NewDecoder(buffered) + if err := dec.Decode(meta); err != nil { + return nil, err + } + return meta, nil +} + +// Open takes a snapshot ID and returns a ReadCloser for that snapshot. 
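+// The state file's CRC64 checksum is verified before the reader is returned,
+// so corruption is detected here rather than during restore. Illustrative use
+// (dst is any io.Writer supplied by the caller):
+//
+//    meta, rc, err := store.Open(metas[0].ID)
+//    if err == nil {
+//        defer rc.Close()
+//        _, _ = io.Copy(dst, rc)
+//        _ = meta // metadata for the snapshot being read
+//    }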
+func (f *FileSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) { + // Get the metadata + meta, err := f.readMeta(id) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get meta data to open snapshot: %v", err) + return nil, nil, err + } + + // Open the state file + statePath := filepath.Join(f.path, id, stateFilePath) + fh, err := os.Open(statePath) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to open state file: %v", err) + return nil, nil, err + } + + // Create a CRC64 hash + stateHash := crc64.New(crc64.MakeTable(crc64.ECMA)) + + // Compute the hash + _, err = io.Copy(stateHash, fh) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to read state file: %v", err) + fh.Close() + return nil, nil, err + } + + // Verify the hash + computed := stateHash.Sum(nil) + if bytes.Compare(meta.CRC, computed) != 0 { + f.logger.Printf("[ERR] snapshot: CRC checksum failed (stored: %v computed: %v)", + meta.CRC, computed) + fh.Close() + return nil, nil, fmt.Errorf("CRC mismatch") + } + + // Seek to the start + if _, err := fh.Seek(0, 0); err != nil { + f.logger.Printf("[ERR] snapshot: State file seek failed: %v", err) + fh.Close() + return nil, nil, err + } + + // Return a buffered file + buffered := &bufferedFile{ + bh: bufio.NewReader(fh), + fh: fh, + } + + return &meta.SnapshotMeta, buffered, nil +} + +// ReapSnapshots reaps any snapshots beyond the retain count. +func (f *FileSnapshotStore) ReapSnapshots() error { + snapshots, err := f.getSnapshots() + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err) + return err + } + + for i := f.retain; i < len(snapshots); i++ { + path := filepath.Join(f.path, snapshots[i].ID) + f.logger.Printf("[INFO] snapshot: reaping snapshot %v", path) + if err := os.RemoveAll(path); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to reap snapshot %v: %v", path, err) + return err + } + } + return nil +} + +// ID returns the ID of the snapshot, can be used with Open() +// after the snapshot is finalized. +func (s *FileSnapshotSink) ID() string { + return s.meta.ID +} + +// Write is used to append to the state file. We write to the +// buffered IO object to reduce the amount of context switches. +func (s *FileSnapshotSink) Write(b []byte) (int, error) { + return s.buffered.Write(b) +} + +// Close is used to indicate a successful end. 
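+// Closing flushes and fsyncs the state file, writes the final metadata
+// (including size and CRC), renames the temporary directory into place, and
+// reaps old snapshots. A sketch of the full sink lifecycle, where index, term,
+// peers and stateBytes are the caller's values:
+//
+//    sink, _ := store.Create(index, term, peers)
+//    if _, err := sink.Write(stateBytes); err != nil {
+//        _ = sink.Cancel() // discard the partial snapshot on error
+//    } else {
+//        _ = sink.Close() // finalize and publish the snapshot
+//    }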
+func (s *FileSnapshotSink) Close() error { + // Make sure close is idempotent + if s.closed { + return nil + } + s.closed = true + + // Close the open handles + if err := s.finalize(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err) + if delErr := os.RemoveAll(s.dir); delErr != nil { + s.logger.Printf("[ERR] snapshot: Failed to delete temporary snapshot at path %v: %v", s.dir, delErr) + return delErr + } + return err + } + + // Write out the meta data + if err := s.writeMeta(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err) + return err + } + + // Move the directory into place + newPath := strings.TrimSuffix(s.dir, tmpSuffix) + if err := os.Rename(s.dir, newPath); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to move snapshot into place: %v", err) + return err + } + + if runtime.GOOS != "windows" { //skipping fsync for directory entry edits on Windows, only needed for *nix style file systems + parentFH, err := os.Open(s.parentDir) + defer parentFH.Close() + if err != nil { + s.logger.Printf("[ERR] snapshot: Failed to open snapshot parent directory %v, error: %v", s.parentDir, err) + return err + } + + if err = parentFH.Sync(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed syncing parent directory %v, error: %v", s.parentDir, err) + return err + } + } + + // Reap any old snapshots + if err := s.store.ReapSnapshots(); err != nil { + return err + } + + return nil +} + +// Cancel is used to indicate an unsuccessful end. +func (s *FileSnapshotSink) Cancel() error { + // Make sure close is idempotent + if s.closed { + return nil + } + s.closed = true + + // Close the open handles + if err := s.finalize(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err) + return err + } + + // Attempt to remove all artifacts + return os.RemoveAll(s.dir) +} + +// finalize is used to close all of our resources. +func (s *FileSnapshotSink) finalize() error { + // Flush any remaining data + if err := s.buffered.Flush(); err != nil { + return err + } + + // Sync to force fsync to disk + if err := s.stateFile.Sync(); err != nil { + return err + } + + // Get the file size + stat, statErr := s.stateFile.Stat() + + // Close the file + if err := s.stateFile.Close(); err != nil { + return err + } + + // Set the file size, check after we close + if statErr != nil { + return statErr + } + s.meta.Size = stat.Size() + + // Set the CRC + s.meta.CRC = s.stateHash.Sum(nil) + return nil +} + +// writeMeta is used to write out the metadata we have. +func (s *FileSnapshotSink) writeMeta() error { + // Open the meta file + metaPath := filepath.Join(s.dir, metaFilePath) + fh, err := os.Create(metaPath) + if err != nil { + return err + } + defer fh.Close() + + // Buffer the file IO + buffered := bufio.NewWriter(fh) + + // Write out as JSON + enc := json.NewEncoder(buffered) + if err := enc.Encode(&s.meta); err != nil { + return err + } + + if err = buffered.Flush(); err != nil { + return err + } + + if err = fh.Sync(); err != nil { + return err + } + + return nil +} + +// Implement the sort interface for []*fileSnapshotMeta. 
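+// Ordering is ascending by Term, then Index, then ID; getSnapshots wraps the
+// slice in sort.Reverse so the newest snapshot sorts first:
+//
+//    sort.Sort(sort.Reverse(snapMetaSlice(snapMeta)))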
+func (s snapMetaSlice) Len() int { + return len(s) +} + +func (s snapMetaSlice) Less(i, j int) bool { + if s[i].Term != s[j].Term { + return s[i].Term < s[j].Term + } + if s[i].Index != s[j].Index { + return s[i].Index < s[j].Index + } + return s[i].ID < s[j].ID +} + +func (s snapMetaSlice) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} diff --git a/vendor/github.com/hashicorp/raft/file_snapshot_test.go b/vendor/github.com/hashicorp/raft/file_snapshot_test.go new file mode 100644 index 00000000..fcd2ef4b --- /dev/null +++ b/vendor/github.com/hashicorp/raft/file_snapshot_test.go @@ -0,0 +1,343 @@ +package raft + +import ( + "bytes" + "io" + "io/ioutil" + "os" + "runtime" + "testing" +) + +func FileSnapTest(t *testing.T) (string, *FileSnapshotStore) { + // Create a test dir + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + + snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + return dir, snap +} + +func TestFileSnapshotStoreImpl(t *testing.T) { + var impl interface{} = &FileSnapshotStore{} + if _, ok := impl.(SnapshotStore); !ok { + t.Fatalf("FileSnapshotStore not a SnapshotStore") + } +} + +func TestFileSnapshotSinkImpl(t *testing.T) { + var impl interface{} = &FileSnapshotSink{} + if _, ok := impl.(SnapshotSink); !ok { + t.Fatalf("FileSnapshotSink not a SnapshotSink") + } +} + +func TestFileSS_CreateSnapshotMissingParentDir(t *testing.T) { + parent, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(parent) + + dir, err := ioutil.TempDir(parent, "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + + snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + + os.RemoveAll(parent) + peers := []byte("all my lovely friends") + _, err = snap.Create(10, 3, peers) + if err != nil { + t.Fatalf("should not fail when using non existing parent") + } + +} +func TestFileSS_CreateSnapshot(t *testing.T) { + // Create a test dir + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(dir) + + snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Check no snapshots + snaps, err := snap.List() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(snaps) != 0 { + t.Fatalf("did not expect any snapshots: %v", snaps) + } + + // Create a new sink + peers := []byte("all my lovely friends") + sink, err := snap.Create(10, 3, peers) + if err != nil { + t.Fatalf("err: %v", err) + } + + // The sink is not done, should not be in a list! + snaps, err = snap.List() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(snaps) != 0 { + t.Fatalf("did not expect any snapshots: %v", snaps) + } + + // Write to the sink + _, err = sink.Write([]byte("first\n")) + if err != nil { + t.Fatalf("err: %v", err) + } + _, err = sink.Write([]byte("second\n")) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Done! + err = sink.Close() + if err != nil { + t.Fatalf("err: %v", err) + } + + // Should have a snapshot! 
+ snaps, err = snap.List() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(snaps) != 1 { + t.Fatalf("expect a snapshots: %v", snaps) + } + + // Check the latest + latest := snaps[0] + if latest.Index != 10 { + t.Fatalf("bad snapshot: %v", *latest) + } + if latest.Term != 3 { + t.Fatalf("bad snapshot: %v", *latest) + } + if bytes.Compare(latest.Peers, peers) != 0 { + t.Fatalf("bad snapshot: %v", *latest) + } + if latest.Size != 13 { + t.Fatalf("bad snapshot: %v", *latest) + } + + // Read the snapshot + _, r, err := snap.Open(latest.ID) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Read out everything + var buf bytes.Buffer + if _, err := io.Copy(&buf, r); err != nil { + t.Fatalf("err: %v", err) + } + if err := r.Close(); err != nil { + t.Fatalf("err: %v", err) + } + + // Ensure a match + if bytes.Compare(buf.Bytes(), []byte("first\nsecond\n")) != 0 { + t.Fatalf("content mismatch") + } +} + +func TestFileSS_CancelSnapshot(t *testing.T) { + // Create a test dir + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(dir) + + snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Create a new sink + peers := []byte("all my lovely friends") + sink, err := snap.Create(10, 3, peers) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Cancel the snapshot! Should delete + err = sink.Cancel() + if err != nil { + t.Fatalf("err: %v", err) + } + + // The sink is canceled, should not be in a list! + snaps, err := snap.List() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(snaps) != 0 { + t.Fatalf("did not expect any snapshots: %v", snaps) + } +} + +func TestFileSS_Retention(t *testing.T) { + // Create a test dir + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(dir) + + snap, err := NewFileSnapshotStoreWithLogger(dir, 2, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Create a new sink + peers := []byte("all my lovely friends") + + // Create a few snapshots + for i := 10; i < 15; i++ { + sink, err := snap.Create(uint64(i), 3, peers) + if err != nil { + t.Fatalf("err: %v", err) + } + err = sink.Close() + if err != nil { + t.Fatalf("err: %v", err) + } + } + + // Should only have 2 listed! 
+ snaps, err := snap.List() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(snaps) != 2 { + t.Fatalf("expect 2 snapshots: %v", snaps) + } + + // Check they are the latest + if snaps[0].Index != 14 { + t.Fatalf("bad snap: %#v", *snaps[0]) + } + if snaps[1].Index != 13 { + t.Fatalf("bad snap: %#v", *snaps[1]) + } +} + +func TestFileSS_BadPerm(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("skipping file permission test on windows") + } + + // Create a temp dir + dir1, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %s", err) + } + defer os.RemoveAll(dir1) + + // Create a sub dir and remove all permissions + dir2, err := ioutil.TempDir(dir1, "badperm") + if err != nil { + t.Fatalf("err: %s", err) + } + if err := os.Chmod(dir2, 000); err != nil { + t.Fatalf("err: %s", err) + } + defer os.Chmod(dir2, 777) // Set perms back for delete + + // Should fail + if _, err := NewFileSnapshotStore(dir2, 3, nil); err == nil { + t.Fatalf("should fail to use dir with bad perms") + } +} + +func TestFileSS_MissingParentDir(t *testing.T) { + parent, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(parent) + + dir, err := ioutil.TempDir(parent, "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + + os.RemoveAll(parent) + _, err = NewFileSnapshotStore(dir, 3, nil) + if err != nil { + t.Fatalf("should not fail when using non existing parent") + } +} + +func TestFileSS_Ordering(t *testing.T) { + // Create a test dir + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(dir) + + snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Create a new sink + peers := []byte("all my lovely friends") + + sink, err := snap.Create(130350, 5, peers) + if err != nil { + t.Fatalf("err: %v", err) + } + err = sink.Close() + if err != nil { + t.Fatalf("err: %v", err) + } + + sink, err = snap.Create(204917, 36, peers) + if err != nil { + t.Fatalf("err: %v", err) + } + err = sink.Close() + if err != nil { + t.Fatalf("err: %v", err) + } + + // Should only have 2 listed! + snaps, err := snap.List() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(snaps) != 2 { + t.Fatalf("expect 2 snapshots: %v", snaps) + } + + // Check they are ordered + if snaps[0].Term != 36 { + t.Fatalf("bad snap: %#v", *snaps[0]) + } + if snaps[1].Term != 5 { + t.Fatalf("bad snap: %#v", *snaps[1]) + } +} diff --git a/vendor/github.com/hashicorp/raft/fsm.go b/vendor/github.com/hashicorp/raft/fsm.go new file mode 100644 index 00000000..ae52e9a7 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/fsm.go @@ -0,0 +1,40 @@ +package raft + +import ( + "io" +) + +// FSM provides an interface that can be implemented by +// clients to make use of the replicated log. +type FSM interface { + // Apply log is invoked once a log entry is committed. + // It returns a value which will be made available in the + // ApplyFuture returned by Raft.Apply method if that + // method was called on the same Raft node as the FSM. + Apply(*Log) interface{} + + // Snapshot is used to support log compaction. This call should + // return an FSMSnapshot which can be used to save a point-in-time + // snapshot of the FSM. Apply and Snapshot are not called in multiple + // threads, but Apply will be called concurrently with Persist. 
This means + // the FSM should be implemented in a fashion that allows for concurrent + // updates while a snapshot is happening. + Snapshot() (FSMSnapshot, error) + + // Restore is used to restore an FSM from a snapshot. It is not called + // concurrently with any other command. The FSM must discard all previous + // state. + Restore(io.ReadCloser) error +} + +// FSMSnapshot is returned by an FSM in response to a Snapshot +// It must be safe to invoke FSMSnapshot methods with concurrent +// calls to Apply. +type FSMSnapshot interface { + // Persist should dump all necessary state to the WriteCloser 'sink', + // and call sink.Close() when finished or call sink.Cancel() on error. + Persist(sink SnapshotSink) error + + // Release is invoked when we are finished with the snapshot. + Release() +} diff --git a/vendor/github.com/hashicorp/raft/future.go b/vendor/github.com/hashicorp/raft/future.go new file mode 100644 index 00000000..177ef834 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/future.go @@ -0,0 +1,203 @@ +package raft + +import ( + "sync" + "time" +) + +// Future is used to represent an action that may occur in the future. +type Future interface { + // Error blocks until the future arrives and then + // returns the error status of the future. + // This may be called any number of times - all + // calls will return the same value. + // Note that it is not OK to call this method + // twice concurrently on the same Future instance. + Error() error +} + +// ApplyFuture is used for Apply() and may return the FSM response. +type ApplyFuture interface { + Future + + // Response returns the FSM response as returned + // by the FSM.Apply method. This must not be called + // until after the Error method has returned. + Response() interface{} + + // Index holds the index of the newly applied log entry. + // This must not be called + // until after the Error method has returned. + Index() uint64 +} + +// errorFuture is used to return a static error. +type errorFuture struct { + err error +} + +func (e errorFuture) Error() error { + return e.err +} + +func (e errorFuture) Response() interface{} { + return nil +} + +func (e errorFuture) Index() uint64 { + return 0 +} + +// deferError can be embedded to allow a future +// to provide an error in the future. +type deferError struct { + err error + errCh chan error + responded bool +} + +func (d *deferError) init() { + d.errCh = make(chan error, 1) +} + +func (d *deferError) Error() error { + if d.err != nil { + // Note that when we've received a nil error, this + // won't trigger, but the channel is closed after + // send so we'll still return nil below. + return d.err + } + if d.errCh == nil { + panic("waiting for response on nil channel") + } + d.err = <-d.errCh + return d.err +} + +func (d *deferError) respond(err error) { + if d.errCh == nil { + return + } + if d.responded { + return + } + d.errCh <- err + close(d.errCh) + d.responded = true +} + +// logFuture is used to apply a log entry and waits until +// the log is considered committed. 
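+// From a client's perspective the same pattern applies to any Future: block on
+// Error(), then (for an ApplyFuture) read the value the FSM returned. An
+// illustrative sketch, assuming r is a *Raft and Apply has the usual
+// (cmd []byte, timeout time.Duration) signature:
+//
+//    f := r.Apply(cmd, 5*time.Second)
+//    if err := f.Error(); err == nil {
+//        resp := f.Response() // whatever FSM.Apply returned for this entry
+//        _ = resp
+//    }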
+type logFuture struct { + deferError + log Log + policy quorumPolicy + response interface{} + dispatch time.Time +} + +func (l *logFuture) Response() interface{} { + return l.response +} + +func (l *logFuture) Index() uint64 { + return l.log.Index +} + +type peerFuture struct { + deferError + peers []string +} + +type shutdownFuture struct { + raft *Raft +} + +func (s *shutdownFuture) Error() error { + if s.raft == nil { + return nil + } + s.raft.waitShutdown() + if closeable, ok := s.raft.trans.(WithClose); ok { + closeable.Close() + } + return nil +} + +// snapshotFuture is used for waiting on a snapshot to complete. +type snapshotFuture struct { + deferError +} + +// reqSnapshotFuture is used for requesting a snapshot start. +// It is only used internally. +type reqSnapshotFuture struct { + deferError + + // snapshot details provided by the FSM runner before responding + index uint64 + term uint64 + peers []string + snapshot FSMSnapshot +} + +// restoreFuture is used for requesting an FSM to perform a +// snapshot restore. Used internally only. +type restoreFuture struct { + deferError + ID string +} + +// verifyFuture is used to verify the current node is still +// the leader. This is to prevent a stale read. +type verifyFuture struct { + deferError + notifyCh chan *verifyFuture + quorumSize int + votes int + voteLock sync.Mutex +} + +// vote is used to respond to a verifyFuture. +// This may block when responding on the notifyCh. +func (v *verifyFuture) vote(leader bool) { + v.voteLock.Lock() + defer v.voteLock.Unlock() + + // Guard against having notified already + if v.notifyCh == nil { + return + } + + if leader { + v.votes++ + if v.votes >= v.quorumSize { + v.notifyCh <- v + v.notifyCh = nil + } + } else { + v.notifyCh <- v + v.notifyCh = nil + } +} + +// appendFuture is used for waiting on a pipelined append +// entries RPC. +type appendFuture struct { + deferError + start time.Time + args *AppendEntriesRequest + resp *AppendEntriesResponse +} + +func (a *appendFuture) Start() time.Time { + return a.start +} + +func (a *appendFuture) Request() *AppendEntriesRequest { + return a.args +} + +func (a *appendFuture) Response() *AppendEntriesResponse { + return a.resp +} diff --git a/vendor/github.com/hashicorp/raft/future_test.go b/vendor/github.com/hashicorp/raft/future_test.go new file mode 100644 index 00000000..8bb95832 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/future_test.go @@ -0,0 +1,42 @@ +package raft + +import ( + "errors" + "testing" +) + +func TestDeferFutureSuccess(t *testing.T) { + var f deferError + f.init() + f.respond(nil) + if err := f.Error(); err != nil { + t.Fatalf("unexpected error result; got %#v want nil", err) + } + if err := f.Error(); err != nil { + t.Fatalf("unexpected error result; got %#v want nil", err) + } +} + +func TestDeferFutureError(t *testing.T) { + want := errors.New("x") + var f deferError + f.init() + f.respond(want) + if got := f.Error(); got != want { + t.Fatalf("unexpected error result; got %#v want %#v", got, want) + } + if got := f.Error(); got != want { + t.Fatalf("unexpected error result; got %#v want %#v", got, want) + } +} + +func TestDeferFutureConcurrent(t *testing.T) { + // Food for the race detector. 
+ want := errors.New("x") + var f deferError + f.init() + go f.respond(want) + if got := f.Error(); got != want { + t.Errorf("unexpected error result; got %#v want %#v", got, want) + } +} diff --git a/vendor/github.com/hashicorp/raft/inflight.go b/vendor/github.com/hashicorp/raft/inflight.go new file mode 100644 index 00000000..7014ff50 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/inflight.go @@ -0,0 +1,213 @@ +package raft + +import ( + "container/list" + "sync" +) + +// QuorumPolicy allows individual logFutures to have different +// commitment rules while still using the inflight mechanism. +type quorumPolicy interface { + // Checks if a commit from a given peer is enough to + // satisfy the commitment rules + Commit() bool + + // Checks if a commit is committed + IsCommitted() bool +} + +// MajorityQuorum is used by Apply transactions and requires +// a simple majority of nodes. +type majorityQuorum struct { + count int + votesNeeded int +} + +func newMajorityQuorum(clusterSize int) *majorityQuorum { + votesNeeded := (clusterSize / 2) + 1 + return &majorityQuorum{count: 0, votesNeeded: votesNeeded} +} + +func (m *majorityQuorum) Commit() bool { + m.count++ + return m.count >= m.votesNeeded +} + +func (m *majorityQuorum) IsCommitted() bool { + return m.count >= m.votesNeeded +} + +// Inflight is used to track operations that are still in-flight. +type inflight struct { + sync.Mutex + committed *list.List + commitCh chan struct{} + minCommit uint64 + maxCommit uint64 + operations map[uint64]*logFuture + stopCh chan struct{} +} + +// NewInflight returns an inflight struct that notifies +// the provided channel when logs are finished committing. +func newInflight(commitCh chan struct{}) *inflight { + return &inflight{ + committed: list.New(), + commitCh: commitCh, + minCommit: 0, + maxCommit: 0, + operations: make(map[uint64]*logFuture), + stopCh: make(chan struct{}), + } +} + +// Start is used to mark a logFuture as being inflight. It +// also commits the entry, as it is assumed the leader is +// starting. +func (i *inflight) Start(l *logFuture) { + i.Lock() + defer i.Unlock() + i.start(l) +} + +// StartAll is used to mark a list of logFuture's as being +// inflight. It also commits each entry as the leader is +// assumed to be starting. +func (i *inflight) StartAll(logs []*logFuture) { + i.Lock() + defer i.Unlock() + for _, l := range logs { + i.start(l) + } +} + +// start is used to mark a single entry as inflight, +// must be invoked with the lock held. +func (i *inflight) start(l *logFuture) { + idx := l.log.Index + i.operations[idx] = l + + if idx > i.maxCommit { + i.maxCommit = idx + } + if i.minCommit == 0 { + i.minCommit = idx + } + i.commit(idx) +} + +// Cancel is used to cancel all in-flight operations. +// This is done when the leader steps down, and all futures +// are sent the given error. 
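+// Every tracked logFuture observes the error through its Error() method; the
+// TestInflight_Cancel test later in this diff exercises this path, roughly:
+//
+//    in := newInflight(make(chan struct{}, 1))
+//    in.Start(l)                      // l is a *logFuture with init() called
+//    in.Cancel(fmt.Errorf("error 1")) // l.Error() now returns this error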
+func (i *inflight) Cancel(err error) { + // Close the channel first to unblock any pending commits + close(i.stopCh) + + // Lock after close to avoid deadlock + i.Lock() + defer i.Unlock() + + // Respond to all inflight operations + for _, op := range i.operations { + op.respond(err) + } + + // Clear all the committed but not processed + for e := i.committed.Front(); e != nil; e = e.Next() { + e.Value.(*logFuture).respond(err) + } + + // Clear the map + i.operations = make(map[uint64]*logFuture) + + // Clear the list of committed + i.committed = list.New() + + // Close the commmitCh + close(i.commitCh) + + // Reset indexes + i.minCommit = 0 + i.maxCommit = 0 +} + +// Committed returns all the committed operations in order. +func (i *inflight) Committed() (l *list.List) { + i.Lock() + l, i.committed = i.committed, list.New() + i.Unlock() + return l +} + +// Commit is used by leader replication routines to indicate that +// a follower was finished committing a log to disk. +func (i *inflight) Commit(index uint64) { + i.Lock() + defer i.Unlock() + i.commit(index) +} + +// CommitRange is used to commit a range of indexes inclusively. +// It is optimized to avoid commits for indexes that are not tracked. +func (i *inflight) CommitRange(minIndex, maxIndex uint64) { + i.Lock() + defer i.Unlock() + + // Update the minimum index + minIndex = max(i.minCommit, minIndex) + + // Commit each index + for idx := minIndex; idx <= maxIndex; idx++ { + i.commit(idx) + } +} + +// commit is used to commit a single index. Must be called with the lock held. +func (i *inflight) commit(index uint64) { + op, ok := i.operations[index] + if !ok { + // Ignore if not in the map, as it may be committed already + return + } + + // Check if we've satisfied the commit + if !op.policy.Commit() { + return + } + + // Cannot commit if this is not the minimum inflight. This can happen + // if the quorum size changes, meaning a previous commit requires a larger + // quorum that this commit. We MUST block until the previous log is committed, + // otherwise logs will be applied out of order. 
+ if index != i.minCommit { + return + } + +NOTIFY: + // Add the operation to the committed list + i.committed.PushBack(op) + + // Stop tracking since it is committed + delete(i.operations, index) + + // Update the indexes + if index == i.maxCommit { + i.minCommit = 0 + i.maxCommit = 0 + + } else { + i.minCommit++ + } + + // Check if the next in-flight operation is ready + if i.minCommit != 0 { + op = i.operations[i.minCommit] + if op.policy.IsCommitted() { + index = i.minCommit + goto NOTIFY + } + } + + // Async notify of ready operations + asyncNotifyCh(i.commitCh) +} diff --git a/vendor/github.com/hashicorp/raft/inflight_test.go b/vendor/github.com/hashicorp/raft/inflight_test.go new file mode 100644 index 00000000..a9f57d6e --- /dev/null +++ b/vendor/github.com/hashicorp/raft/inflight_test.go @@ -0,0 +1,150 @@ +package raft + +import ( + "fmt" + "testing" +) + +func TestInflight_StartCommit(t *testing.T) { + commitCh := make(chan struct{}, 1) + in := newInflight(commitCh) + + // Commit a transaction as being in flight + l := &logFuture{log: Log{Index: 1}} + l.policy = newMajorityQuorum(5) + in.Start(l) + + // Commit 3 times + in.Commit(1) + if in.Committed().Len() != 0 { + t.Fatalf("should not be commited") + } + + in.Commit(1) + if in.Committed().Len() != 1 { + t.Fatalf("should be commited") + } + + // Already committed but should work anyways + in.Commit(1) +} + +func TestInflight_Cancel(t *testing.T) { + commitCh := make(chan struct{}, 1) + in := newInflight(commitCh) + + // Commit a transaction as being in flight + l := &logFuture{ + log: Log{Index: 1}, + } + l.init() + l.policy = newMajorityQuorum(3) + in.Start(l) + + // Cancel with an error + err := fmt.Errorf("error 1") + in.Cancel(err) + + // Should get an error return + if l.Error() != err { + t.Fatalf("expected error") + } +} + +func TestInflight_StartAll(t *testing.T) { + commitCh := make(chan struct{}, 1) + in := newInflight(commitCh) + + // Commit a few transaction as being in flight + l1 := &logFuture{log: Log{Index: 2}} + l1.policy = newMajorityQuorum(5) + l2 := &logFuture{log: Log{Index: 3}} + l2.policy = newMajorityQuorum(5) + l3 := &logFuture{log: Log{Index: 4}} + l3.policy = newMajorityQuorum(5) + + // Start all the entries + in.StartAll([]*logFuture{l1, l2, l3}) + + // Commit ranges + in.CommitRange(1, 5) + in.CommitRange(1, 4) + in.CommitRange(1, 10) + + // Should get 3 back + if in.Committed().Len() != 3 { + t.Fatalf("expected all 3 to commit") + } +} + +func TestInflight_CommitRange(t *testing.T) { + commitCh := make(chan struct{}, 1) + in := newInflight(commitCh) + + // Commit a few transaction as being in flight + l1 := &logFuture{log: Log{Index: 2}} + l1.policy = newMajorityQuorum(5) + in.Start(l1) + + l2 := &logFuture{log: Log{Index: 3}} + l2.policy = newMajorityQuorum(5) + in.Start(l2) + + l3 := &logFuture{log: Log{Index: 4}} + l3.policy = newMajorityQuorum(5) + in.Start(l3) + + // Commit ranges + in.CommitRange(1, 5) + in.CommitRange(1, 4) + in.CommitRange(1, 10) + + // Should get 3 back + if in.Committed().Len() != 3 { + t.Fatalf("expected all 3 to commit") + } +} + +// Should panic if we commit non contiguously! 
+func TestInflight_NonContiguous(t *testing.T) { + commitCh := make(chan struct{}, 1) + in := newInflight(commitCh) + + // Commit a few transaction as being in flight + l1 := &logFuture{log: Log{Index: 2}} + l1.policy = newMajorityQuorum(5) + in.Start(l1) + + l2 := &logFuture{log: Log{Index: 3}} + l2.policy = newMajorityQuorum(5) + in.Start(l2) + + in.Commit(3) + in.Commit(3) + in.Commit(3) // panic! + + if in.Committed().Len() != 0 { + t.Fatalf("should not commit") + } + + in.Commit(2) + in.Commit(2) + in.Commit(2) // panic! + + committed := in.Committed() + if committed.Len() != 2 { + t.Fatalf("should commit both") + } + + current := committed.Front() + l := current.Value.(*logFuture) + if l.log.Index != 2 { + t.Fatalf("bad: %v", *l) + } + + current = current.Next() + l = current.Value.(*logFuture) + if l.log.Index != 3 { + t.Fatalf("bad: %v", *l) + } +} diff --git a/vendor/github.com/hashicorp/raft/inmem_store.go b/vendor/github.com/hashicorp/raft/inmem_store.go new file mode 100644 index 00000000..6e4dfd02 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/inmem_store.go @@ -0,0 +1,116 @@ +package raft + +import ( + "sync" +) + +// InmemStore implements the LogStore and StableStore interface. +// It should NOT EVER be used for production. It is used only for +// unit tests. Use the MDBStore implementation instead. +type InmemStore struct { + l sync.RWMutex + lowIndex uint64 + highIndex uint64 + logs map[uint64]*Log + kv map[string][]byte + kvInt map[string]uint64 +} + +// NewInmemStore returns a new in-memory backend. Do not ever +// use for production. Only for testing. +func NewInmemStore() *InmemStore { + i := &InmemStore{ + logs: make(map[uint64]*Log), + kv: make(map[string][]byte), + kvInt: make(map[string]uint64), + } + return i +} + +// FirstIndex implements the LogStore interface. +func (i *InmemStore) FirstIndex() (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.lowIndex, nil +} + +// LastIndex implements the LogStore interface. +func (i *InmemStore) LastIndex() (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.highIndex, nil +} + +// GetLog implements the LogStore interface. +func (i *InmemStore) GetLog(index uint64, log *Log) error { + i.l.RLock() + defer i.l.RUnlock() + l, ok := i.logs[index] + if !ok { + return ErrLogNotFound + } + *log = *l + return nil +} + +// StoreLog implements the LogStore interface. +func (i *InmemStore) StoreLog(log *Log) error { + return i.StoreLogs([]*Log{log}) +} + +// StoreLogs implements the LogStore interface. +func (i *InmemStore) StoreLogs(logs []*Log) error { + i.l.Lock() + defer i.l.Unlock() + for _, l := range logs { + i.logs[l.Index] = l + if i.lowIndex == 0 { + i.lowIndex = l.Index + } + if l.Index > i.highIndex { + i.highIndex = l.Index + } + } + return nil +} + +// DeleteRange implements the LogStore interface. +func (i *InmemStore) DeleteRange(min, max uint64) error { + i.l.Lock() + defer i.l.Unlock() + for j := min; j <= max; j++ { + delete(i.logs, j) + } + i.lowIndex = max + 1 + return nil +} + +// Set implements the StableStore interface. +func (i *InmemStore) Set(key []byte, val []byte) error { + i.l.Lock() + defer i.l.Unlock() + i.kv[string(key)] = val + return nil +} + +// Get implements the StableStore interface. +func (i *InmemStore) Get(key []byte) ([]byte, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.kv[string(key)], nil +} + +// SetUint64 implements the StableStore interface. 
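+// Together with GetUint64 this covers the small persisted integers Raft keeps
+// in its StableStore, such as the current term. A test-only sketch (the key
+// name is illustrative):
+//
+//    s := NewInmemStore()
+//    _ = s.SetUint64([]byte("CurrentTerm"), 7)
+//    term, _ := s.GetUint64([]byte("CurrentTerm"))
+//    _ = term // 7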
+func (i *InmemStore) SetUint64(key []byte, val uint64) error { + i.l.Lock() + defer i.l.Unlock() + i.kvInt[string(key)] = val + return nil +} + +// GetUint64 implements the StableStore interface. +func (i *InmemStore) GetUint64(key []byte) (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.kvInt[string(key)], nil +} diff --git a/vendor/github.com/hashicorp/raft/inmem_transport.go b/vendor/github.com/hashicorp/raft/inmem_transport.go new file mode 100644 index 00000000..2d5f3190 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/inmem_transport.go @@ -0,0 +1,324 @@ +package raft + +import ( + "fmt" + "io" + "sync" + "time" +) + +// NewInmemAddr returns a new in-memory addr with +// a randomly generate UUID as the ID. +func NewInmemAddr() string { + return generateUUID() +} + +// inmemPipeline is used to pipeline requests for the in-mem transport. +type inmemPipeline struct { + trans *InmemTransport + peer *InmemTransport + peerAddr string + + doneCh chan AppendFuture + inprogressCh chan *inmemPipelineInflight + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex +} + +type inmemPipelineInflight struct { + future *appendFuture + respCh <-chan RPCResponse +} + +// InmemTransport Implements the Transport interface, to allow Raft to be +// tested in-memory without going over a network. +type InmemTransport struct { + sync.RWMutex + consumerCh chan RPC + localAddr string + peers map[string]*InmemTransport + pipelines []*inmemPipeline + timeout time.Duration +} + +// NewInmemTransport is used to initialize a new transport +// and generates a random local address if none is specified +func NewInmemTransport(addr string) (string, *InmemTransport) { + if addr == "" { + addr = NewInmemAddr() + } + trans := &InmemTransport{ + consumerCh: make(chan RPC, 16), + localAddr: addr, + peers: make(map[string]*InmemTransport), + timeout: 50 * time.Millisecond, + } + return addr, trans +} + +// SetHeartbeatHandler is used to set optional fast-path for +// heartbeats, not supported for this transport. +func (i *InmemTransport) SetHeartbeatHandler(cb func(RPC)) { +} + +// Consumer implements the Transport interface. +func (i *InmemTransport) Consumer() <-chan RPC { + return i.consumerCh +} + +// LocalAddr implements the Transport interface. +func (i *InmemTransport) LocalAddr() string { + return i.localAddr +} + +// AppendEntriesPipeline returns an interface that can be used to pipeline +// AppendEntries requests. +func (i *InmemTransport) AppendEntriesPipeline(target string) (AppendPipeline, error) { + i.RLock() + peer, ok := i.peers[target] + i.RUnlock() + if !ok { + return nil, fmt.Errorf("failed to connect to peer: %v", target) + } + pipeline := newInmemPipeline(i, peer, target) + i.Lock() + i.pipelines = append(i.pipelines, pipeline) + i.Unlock() + return pipeline, nil +} + +// AppendEntries implements the Transport interface. +func (i *InmemTransport) AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error { + rpcResp, err := i.makeRPC(target, args, nil, i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*AppendEntriesResponse) + *resp = *out + return nil +} + +// RequestVote implements the Transport interface. 
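+// Like AppendEntries above, the request is routed with makeRPC to a peer that
+// was previously registered via Connect. A minimal two-node wiring for tests
+// (illustrative):
+//
+//    addr1, t1 := NewInmemTransport("")
+//    addr2, t2 := NewInmemTransport("")
+//    t1.Connect(addr2, t2)
+//    t2.Connect(addr1, t1)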
+func (i *InmemTransport) RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error { + rpcResp, err := i.makeRPC(target, args, nil, i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*RequestVoteResponse) + *resp = *out + return nil +} + +// InstallSnapshot implements the Transport interface. +func (i *InmemTransport) InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error { + rpcResp, err := i.makeRPC(target, args, data, 10*i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*InstallSnapshotResponse) + *resp = *out + return nil +} + +func (i *InmemTransport) makeRPC(target string, args interface{}, r io.Reader, timeout time.Duration) (rpcResp RPCResponse, err error) { + i.RLock() + peer, ok := i.peers[target] + i.RUnlock() + + if !ok { + err = fmt.Errorf("failed to connect to peer: %v", target) + return + } + + // Send the RPC over + respCh := make(chan RPCResponse) + peer.consumerCh <- RPC{ + Command: args, + Reader: r, + RespChan: respCh, + } + + // Wait for a response + select { + case rpcResp = <-respCh: + if rpcResp.Error != nil { + err = rpcResp.Error + } + case <-time.After(timeout): + err = fmt.Errorf("command timed out") + } + return +} + +// EncodePeer implements the Transport interface. It uses the UUID as the +// address directly. +func (i *InmemTransport) EncodePeer(p string) []byte { + return []byte(p) +} + +// DecodePeer implements the Transport interface. It wraps the UUID in an +// InmemAddr. +func (i *InmemTransport) DecodePeer(buf []byte) string { + return string(buf) +} + +// Connect is used to connect this transport to another transport for +// a given peer name. This allows for local routing. +func (i *InmemTransport) Connect(peer string, t Transport) { + trans := t.(*InmemTransport) + i.Lock() + defer i.Unlock() + i.peers[peer] = trans +} + +// Disconnect is used to remove the ability to route to a given peer. +func (i *InmemTransport) Disconnect(peer string) { + i.Lock() + defer i.Unlock() + delete(i.peers, peer) + + // Disconnect any pipelines + n := len(i.pipelines) + for idx := 0; idx < n; idx++ { + if i.pipelines[idx].peerAddr == peer { + i.pipelines[idx].Close() + i.pipelines[idx], i.pipelines[n-1] = i.pipelines[n-1], nil + idx-- + n-- + } + } + i.pipelines = i.pipelines[:n] +} + +// DisconnectAll is used to remove all routes to peers. 
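+// Open pipelines are closed as part of this, mirroring Disconnect above; the
+// transport itself remains usable and peers can be re-added later via Connect.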
+func (i *InmemTransport) DisconnectAll() { + i.Lock() + defer i.Unlock() + i.peers = make(map[string]*InmemTransport) + + // Handle pipelines + for _, pipeline := range i.pipelines { + pipeline.Close() + } + i.pipelines = nil +} + +// Close is used to permanently disable the transport +func (i *InmemTransport) Close() error { + i.DisconnectAll() + return nil +} + +func newInmemPipeline(trans *InmemTransport, peer *InmemTransport, addr string) *inmemPipeline { + i := &inmemPipeline{ + trans: trans, + peer: peer, + peerAddr: addr, + doneCh: make(chan AppendFuture, 16), + inprogressCh: make(chan *inmemPipelineInflight, 16), + shutdownCh: make(chan struct{}), + } + go i.decodeResponses() + return i +} + +func (i *inmemPipeline) decodeResponses() { + timeout := i.trans.timeout + for { + select { + case inp := <-i.inprogressCh: + var timeoutCh <-chan time.Time + if timeout > 0 { + timeoutCh = time.After(timeout) + } + + select { + case rpcResp := <-inp.respCh: + // Copy the result back + *inp.future.resp = *rpcResp.Response.(*AppendEntriesResponse) + inp.future.respond(rpcResp.Error) + + select { + case i.doneCh <- inp.future: + case <-i.shutdownCh: + return + } + + case <-timeoutCh: + inp.future.respond(fmt.Errorf("command timed out")) + select { + case i.doneCh <- inp.future: + case <-i.shutdownCh: + return + } + + case <-i.shutdownCh: + return + } + case <-i.shutdownCh: + return + } + } +} + +func (i *inmemPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) { + // Create a new future + future := &appendFuture{ + start: time.Now(), + args: args, + resp: resp, + } + future.init() + + // Handle a timeout + var timeout <-chan time.Time + if i.trans.timeout > 0 { + timeout = time.After(i.trans.timeout) + } + + // Send the RPC over + respCh := make(chan RPCResponse, 1) + rpc := RPC{ + Command: args, + RespChan: respCh, + } + select { + case i.peer.consumerCh <- rpc: + case <-timeout: + return nil, fmt.Errorf("command enqueue timeout") + case <-i.shutdownCh: + return nil, ErrPipelineShutdown + } + + // Send to be decoded + select { + case i.inprogressCh <- &inmemPipelineInflight{future, respCh}: + return future, nil + case <-i.shutdownCh: + return nil, ErrPipelineShutdown + } +} + +func (i *inmemPipeline) Consumer() <-chan AppendFuture { + return i.doneCh +} + +func (i *inmemPipeline) Close() error { + i.shutdownLock.Lock() + defer i.shutdownLock.Unlock() + if i.shutdown { + return nil + } + + i.shutdown = true + close(i.shutdownCh) + return nil +} diff --git a/vendor/github.com/hashicorp/raft/inmem_transport_test.go b/vendor/github.com/hashicorp/raft/inmem_transport_test.go new file mode 100644 index 00000000..82c95348 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/inmem_transport_test.go @@ -0,0 +1,18 @@ +package raft + +import ( + "testing" +) + +func TestInmemTransportImpl(t *testing.T) { + var inm interface{} = &InmemTransport{} + if _, ok := inm.(Transport); !ok { + t.Fatalf("InmemTransport is not a Transport") + } + if _, ok := inm.(LoopbackTransport); !ok { + t.Fatalf("InmemTransport is not a Loopback Transport") + } + if _, ok := inm.(WithPeers); !ok { + t.Fatalf("InmemTransport is not a WithPeers Transport") + } +} diff --git a/vendor/github.com/hashicorp/raft/integ_test.go b/vendor/github.com/hashicorp/raft/integ_test.go new file mode 100644 index 00000000..66654be4 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/integ_test.go @@ -0,0 +1,336 @@ +package raft + +import ( + "bytes" + "fmt" + "io/ioutil" + "log" + "os" + 
"testing" + "time" +) + +// CheckInteg will skip a test if integration testing is not enabled. +func CheckInteg(t *testing.T) { + if !IsInteg() { + t.SkipNow() + } +} + +// IsInteg returns a boolean telling you if we're in integ testing mode. +func IsInteg() bool { + return os.Getenv("INTEG_TESTS") != "" +} + +type RaftEnv struct { + dir string + conf *Config + fsm *MockFSM + store *InmemStore + snapshot *FileSnapshotStore + peers *JSONPeers + trans *NetworkTransport + raft *Raft + logger *log.Logger +} + +// Release shuts down and cleans up any stored data, its not restartable after this +func (r *RaftEnv) Release() { + r.Shutdown() + os.RemoveAll(r.dir) +} + +// Shutdown shuts down raft & transport, but keeps track of its data, its restartable +// after a Shutdown() by calling Start() +func (r *RaftEnv) Shutdown() { + r.logger.Printf("[WARN] Shutdown node at %v", r.raft.localAddr) + f := r.raft.Shutdown() + if err := f.Error(); err != nil { + panic(err) + } + r.trans.Close() +} + +// Restart will start a raft node that was previously Shutdown() +func (r *RaftEnv) Restart(t *testing.T) { + trans, err := NewTCPTransport(r.raft.localAddr, nil, 2, time.Second, nil) + if err != nil { + t.Fatalf("err: %v", err) + } + r.trans = trans + r.logger.Printf("[INFO] Starting node at %v", trans.LocalAddr()) + raft, err := NewRaft(r.conf, r.fsm, r.store, r.store, r.snapshot, r.peers, r.trans) + if err != nil { + t.Fatalf("err: %v", err) + } + r.raft = raft +} + +func MakeRaft(t *testing.T, conf *Config) *RaftEnv { + // Set the config + if conf == nil { + conf = inmemConfig(t) + } + + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + + stable := NewInmemStore() + + snap, err := NewFileSnapshotStore(dir, 3, nil) + if err != nil { + t.Fatalf("err: %v", err) + } + + env := &RaftEnv{ + conf: conf, + dir: dir, + store: stable, + snapshot: snap, + fsm: &MockFSM{}, + } + + trans, err := NewTCPTransport("127.0.0.1:0", nil, 2, time.Second, nil) + if err != nil { + t.Fatalf("err: %v", err) + } + env.logger = log.New(os.Stdout, trans.LocalAddr()+" :", log.Lmicroseconds) + env.trans = trans + + env.peers = NewJSONPeers(dir, trans) + + env.logger.Printf("[INFO] Starting node at %v", trans.LocalAddr()) + conf.Logger = env.logger + raft, err := NewRaft(conf, env.fsm, stable, stable, snap, env.peers, trans) + if err != nil { + t.Fatalf("err: %v", err) + } + env.raft = raft + return env +} + +func WaitFor(env *RaftEnv, state RaftState) error { + limit := time.Now().Add(200 * time.Millisecond) + for env.raft.State() != state { + if time.Now().Before(limit) { + time.Sleep(10 * time.Millisecond) + } else { + return fmt.Errorf("failed to transition to state %v", state) + } + } + return nil +} + +func WaitForAny(state RaftState, envs []*RaftEnv) (*RaftEnv, error) { + limit := time.Now().Add(200 * time.Millisecond) +CHECK: + for _, env := range envs { + if env.raft.State() == state { + return env, nil + } + } + if time.Now().Before(limit) { + goto WAIT + } + return nil, fmt.Errorf("failed to find node in %v state", state) +WAIT: + time.Sleep(10 * time.Millisecond) + goto CHECK +} + +func WaitFuture(f Future, t *testing.T) error { + timer := time.AfterFunc(200*time.Millisecond, func() { + panic(fmt.Errorf("timeout waiting for future %v", f)) + }) + defer timer.Stop() + return f.Error() +} + +func NoErr(err error, t *testing.T) { + if err != nil { + t.Fatalf("err: %v", err) + } +} + +func CheckConsistent(envs []*RaftEnv, t *testing.T) { + limit := time.Now().Add(400 * time.Millisecond) + 
first := envs[0] + first.fsm.Lock() + defer first.fsm.Unlock() + var err error +CHECK: + l1 := len(first.fsm.logs) + for i := 1; i < len(envs); i++ { + env := envs[i] + env.fsm.Lock() + l2 := len(env.fsm.logs) + if l1 != l2 { + err = fmt.Errorf("log length mismatch %d %d", l1, l2) + env.fsm.Unlock() + goto ERR + } + for idx, log := range first.fsm.logs { + other := env.fsm.logs[idx] + if bytes.Compare(log, other) != 0 { + err = fmt.Errorf("log entry %d mismatch between %s/%s : '%s' / '%s'", idx, first.raft.localAddr, env.raft.localAddr, log, other) + env.fsm.Unlock() + goto ERR + } + } + env.fsm.Unlock() + } + return +ERR: + if time.Now().After(limit) { + t.Fatalf("%v", err) + } + first.fsm.Unlock() + time.Sleep(20 * time.Millisecond) + first.fsm.Lock() + goto CHECK +} + +// return a log entry that's at least sz long that has the prefix 'test i ' +func logBytes(i, sz int) []byte { + var logBuffer bytes.Buffer + fmt.Fprintf(&logBuffer, "test %d ", i) + for logBuffer.Len() < sz { + logBuffer.WriteByte('x') + } + return logBuffer.Bytes() + +} + +// Tests Raft by creating a cluster, growing it to 5 nodes while +// causing various stressful conditions +func TestRaft_Integ(t *testing.T) { + CheckInteg(t) + conf := DefaultConfig() + conf.HeartbeatTimeout = 50 * time.Millisecond + conf.ElectionTimeout = 50 * time.Millisecond + conf.LeaderLeaseTimeout = 50 * time.Millisecond + conf.CommitTimeout = 5 * time.Millisecond + conf.SnapshotThreshold = 100 + conf.TrailingLogs = 10 + conf.EnableSingleNode = true + + // Create a single node + env1 := MakeRaft(t, conf) + NoErr(WaitFor(env1, Leader), t) + + totalApplied := 0 + applyAndWait := func(leader *RaftEnv, n int, sz int) { + // Do some commits + var futures []ApplyFuture + for i := 0; i < n; i++ { + futures = append(futures, leader.raft.Apply(logBytes(i, sz), 0)) + } + for _, f := range futures { + NoErr(WaitFuture(f, t), t) + leader.logger.Printf("[DEBUG] Applied at %d, size %d", f.Index(), sz) + } + totalApplied += n + } + // Do some commits + applyAndWait(env1, 100, 10) + + // Do a snapshot + NoErr(WaitFuture(env1.raft.Snapshot(), t), t) + + // Join a few nodes! 
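+	// (Each AddPeer below is a membership change; WaitFuture ensures one
+	// change completes before the next node is added.)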
+ var envs []*RaftEnv + for i := 0; i < 4; i++ { + env := MakeRaft(t, conf) + addr := env.trans.LocalAddr() + NoErr(WaitFuture(env1.raft.AddPeer(addr), t), t) + envs = append(envs, env) + } + + // Wait for a leader + leader, err := WaitForAny(Leader, append([]*RaftEnv{env1}, envs...)) + NoErr(err, t) + + // Do some more commits + applyAndWait(leader, 100, 10) + + // snapshot the leader + NoErr(WaitFuture(leader.raft.Snapshot(), t), t) + + CheckConsistent(append([]*RaftEnv{env1}, envs...), t) + + // shutdown a follower + disconnected := envs[len(envs)-1] + disconnected.Shutdown() + + // Do some more commits [make sure the resulting snapshot will be a reasonable size] + applyAndWait(leader, 100, 10000) + + // snapshot the leader [leaders log should be compacted past the disconnected follower log now] + NoErr(WaitFuture(leader.raft.Snapshot(), t), t) + + // Unfortuantly we need to wait for the leader to start backing off RPCs to the down follower + // such that when the follower comes back up it'll run an election before it gets an rpc from + // the leader + time.Sleep(time.Second * 5) + + // start the now out of date follower back up + disconnected.Restart(t) + + // wait for it to get caught up + timeout := time.Now().Add(time.Second * 10) + for disconnected.raft.getLastApplied() < leader.raft.getLastApplied() { + time.Sleep(time.Millisecond) + if time.Now().After(timeout) { + t.Fatalf("Gave up waiting for follower to get caught up to leader") + } + } + + CheckConsistent(append([]*RaftEnv{env1}, envs...), t) + + // Shoot two nodes in the head! + rm1, rm2 := envs[0], envs[1] + rm1.Release() + rm2.Release() + envs = envs[2:] + time.Sleep(10 * time.Millisecond) + + // Wait for a leader + leader, err = WaitForAny(Leader, append([]*RaftEnv{env1}, envs...)) + NoErr(err, t) + + // Do some more commits + applyAndWait(leader, 100, 10) + + // Join a few new nodes! + for i := 0; i < 2; i++ { + env := MakeRaft(t, conf) + addr := env.trans.LocalAddr() + NoErr(WaitFuture(leader.raft.AddPeer(addr), t), t) + envs = append(envs, env) + } + + // Remove the old nodes + NoErr(WaitFuture(leader.raft.RemovePeer(rm1.raft.localAddr), t), t) + NoErr(WaitFuture(leader.raft.RemovePeer(rm2.raft.localAddr), t), t) + + // Shoot the leader + env1.Release() + time.Sleep(3 * conf.HeartbeatTimeout) + + // Wait for a leader + leader, err = WaitForAny(Leader, envs) + NoErr(err, t) + + allEnvs := append([]*RaftEnv{env1}, envs...) + CheckConsistent(allEnvs, t) + + if len(env1.fsm.logs) != totalApplied { + t.Fatalf("should apply %d logs! %d", totalApplied, len(env1.fsm.logs)) + } + + for _, e := range envs { + e.Release() + } +} diff --git a/vendor/github.com/hashicorp/raft/log.go b/vendor/github.com/hashicorp/raft/log.go new file mode 100644 index 00000000..9399154a --- /dev/null +++ b/vendor/github.com/hashicorp/raft/log.go @@ -0,0 +1,67 @@ +package raft + +// LogType describes various types of log entries. +type LogType uint8 + +const ( + // LogCommand is applied to a user FSM. + LogCommand LogType = iota + + // LogNoop is used to assert leadership. + LogNoop + + // LogAddPeer is used to add a new peer. + LogAddPeer + + // LogRemovePeer is used to remove an existing peer. + LogRemovePeer + + // LogBarrier is used to ensure all preceding operations have been + // applied to the FSM. It is similar to LogNoop, but instead of returning + // once committed, it only returns once the FSM manager acks it. Otherwise + // it is possible there are operations committed but not yet applied to + // the FSM. 
+ LogBarrier +) + +// Log entries are replicated to all members of the Raft cluster +// and form the heart of the replicated state machine. +type Log struct { + // Index holds the index of the log entry. + Index uint64 + + // Term holds the election term of the log entry. + Term uint64 + + // Type holds the type of the log entry. + Type LogType + + // Data holds the log entry's type-specific data. + Data []byte + + // peer is not exported since it is not transmitted, only used + // internally to construct the Data field. + peer string +} + +// LogStore is used to provide an interface for storing +// and retrieving logs in a durable fashion. +type LogStore interface { + // FirstIndex returns the first index written. 0 for no entries. + FirstIndex() (uint64, error) + + // LastIndex returns the last index written. 0 for no entries. + LastIndex() (uint64, error) + + // GetLog gets a log entry at a given index. + GetLog(index uint64, log *Log) error + + // StoreLog stores a log entry. + StoreLog(log *Log) error + + // StoreLogs stores multiple log entries. + StoreLogs(logs []*Log) error + + // DeleteRange deletes a range of log entries. The range is inclusive. + DeleteRange(min, max uint64) error +} diff --git a/vendor/github.com/hashicorp/raft/log_cache.go b/vendor/github.com/hashicorp/raft/log_cache.go new file mode 100644 index 00000000..952e98c2 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/log_cache.go @@ -0,0 +1,79 @@ +package raft + +import ( + "fmt" + "sync" +) + +// LogCache wraps any LogStore implementation to provide an +// in-memory ring buffer. This is used to cache access to +// the recently written entries. For implementations that do not +// cache themselves, this can provide a substantial boost by +// avoiding disk I/O on recent entries. +type LogCache struct { + store LogStore + + cache []*Log + l sync.RWMutex +} + +// NewLogCache is used to create a new LogCache with the +// given capacity and backend store. 
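+//
+// A minimal sketch of wrapping an existing store (the capacity of 512 is an
+// arbitrary example; it only bounds the in-memory ring, not the backing store):
+//
+//	store := NewInmemStore()
+//	cached, err := NewLogCache(512, store)
+//	if err != nil {
+//		panic(err)
+//	}
+//	// cached implements LogStore and can be used wherever store could.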
+func NewLogCache(capacity int, store LogStore) (*LogCache, error) { + if capacity <= 0 { + return nil, fmt.Errorf("capacity must be positive") + } + c := &LogCache{ + store: store, + cache: make([]*Log, capacity), + } + return c, nil +} + +func (c *LogCache) GetLog(idx uint64, log *Log) error { + // Check the buffer for an entry + c.l.RLock() + cached := c.cache[idx%uint64(len(c.cache))] + c.l.RUnlock() + + // Check if entry is valid + if cached != nil && cached.Index == idx { + *log = *cached + return nil + } + + // Forward request on cache miss + return c.store.GetLog(idx, log) +} + +func (c *LogCache) StoreLog(log *Log) error { + return c.StoreLogs([]*Log{log}) +} + +func (c *LogCache) StoreLogs(logs []*Log) error { + // Insert the logs into the ring buffer + c.l.Lock() + for _, l := range logs { + c.cache[l.Index%uint64(len(c.cache))] = l + } + c.l.Unlock() + + return c.store.StoreLogs(logs) +} + +func (c *LogCache) FirstIndex() (uint64, error) { + return c.store.FirstIndex() +} + +func (c *LogCache) LastIndex() (uint64, error) { + return c.store.LastIndex() +} + +func (c *LogCache) DeleteRange(min, max uint64) error { + // Invalidate the cache on deletes + c.l.Lock() + c.cache = make([]*Log, len(c.cache)) + c.l.Unlock() + + return c.store.DeleteRange(min, max) +} diff --git a/vendor/github.com/hashicorp/raft/log_cache_test.go b/vendor/github.com/hashicorp/raft/log_cache_test.go new file mode 100644 index 00000000..7569e78e --- /dev/null +++ b/vendor/github.com/hashicorp/raft/log_cache_test.go @@ -0,0 +1,88 @@ +package raft + +import ( + "testing" +) + +func TestLogCache(t *testing.T) { + store := NewInmemStore() + c, _ := NewLogCache(16, store) + + // Insert into the in-mem store + for i := 0; i < 32; i++ { + log := &Log{Index: uint64(i) + 1} + store.StoreLog(log) + } + + // Check the indexes + if idx, _ := c.FirstIndex(); idx != 1 { + t.Fatalf("bad: %d", idx) + } + if idx, _ := c.LastIndex(); idx != 32 { + t.Fatalf("bad: %d", idx) + } + + // Try get log with a miss + var out Log + err := c.GetLog(1, &out) + if err != nil { + t.Fatalf("err: %v", err) + } + if out.Index != 1 { + t.Fatalf("bad: %#v", out) + } + + // Store logs + l1 := &Log{Index: 33} + l2 := &Log{Index: 34} + err = c.StoreLogs([]*Log{l1, l2}) + if err != nil { + t.Fatalf("err: %v", err) + } + + if idx, _ := c.LastIndex(); idx != 34 { + t.Fatalf("bad: %d", idx) + } + + // Check that it wrote-through + err = store.GetLog(33, &out) + if err != nil { + t.Fatalf("err: %v", err) + } + err = store.GetLog(34, &out) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Delete in the backend + err = store.DeleteRange(33, 34) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Should be in the ring buffer + err = c.GetLog(33, &out) + if err != nil { + t.Fatalf("err: %v", err) + } + err = c.GetLog(34, &out) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Purge the ring buffer + err = c.DeleteRange(33, 34) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Should not be in the ring buffer + err = c.GetLog(33, &out) + if err != ErrLogNotFound { + t.Fatalf("err: %v", err) + } + err = c.GetLog(34, &out) + if err != ErrLogNotFound { + t.Fatalf("err: %v", err) + } +} diff --git a/vendor/github.com/hashicorp/raft/net_transport.go b/vendor/github.com/hashicorp/raft/net_transport.go new file mode 100644 index 00000000..3de2a694 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/net_transport.go @@ -0,0 +1,622 @@ +package raft + +import ( + "bufio" + "errors" + "fmt" + "io" + "log" + "net" + "os" + "sync" + "time" + + 
"github.com/hashicorp/go-msgpack/codec" +) + +const ( + rpcAppendEntries uint8 = iota + rpcRequestVote + rpcInstallSnapshot + + // DefaultTimeoutScale is the default TimeoutScale in a NetworkTransport. + DefaultTimeoutScale = 256 * 1024 // 256KB + + // rpcMaxPipeline controls the maximum number of outstanding + // AppendEntries RPC calls. + rpcMaxPipeline = 128 +) + +var ( + // ErrTransportShutdown is returned when operations on a transport are + // invoked after it's been terminated. + ErrTransportShutdown = errors.New("transport shutdown") + + // ErrPipelineShutdown is returned when the pipeline is closed. + ErrPipelineShutdown = errors.New("append pipeline closed") +) + +/* + +NetworkTransport provides a network based transport that can be +used to communicate with Raft on remote machines. It requires +an underlying stream layer to provide a stream abstraction, which can +be simple TCP, TLS, etc. + +This transport is very simple and lightweight. Each RPC request is +framed by sending a byte that indicates the message type, followed +by the MsgPack encoded request. + +The response is an error string followed by the response object, +both are encoded using MsgPack. + +InstallSnapshot is special, in that after the RPC request we stream +the entire state. That socket is not re-used as the connection state +is not known if there is an error. + +*/ +type NetworkTransport struct { + connPool map[string][]*netConn + connPoolLock sync.Mutex + + consumeCh chan RPC + + heartbeatFn func(RPC) + heartbeatFnLock sync.Mutex + + logger *log.Logger + + maxPool int + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex + + stream StreamLayer + + timeout time.Duration + TimeoutScale int +} + +// StreamLayer is used with the NetworkTransport to provide +// the low level stream abstraction. +type StreamLayer interface { + net.Listener + + // Dial is used to create a new outgoing connection + Dial(address string, timeout time.Duration) (net.Conn, error) +} + +type netConn struct { + target string + conn net.Conn + r *bufio.Reader + w *bufio.Writer + dec *codec.Decoder + enc *codec.Encoder +} + +func (n *netConn) Release() error { + return n.conn.Close() +} + +type netPipeline struct { + conn *netConn + trans *NetworkTransport + + doneCh chan AppendFuture + inprogressCh chan *appendFuture + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex +} + +// NewNetworkTransport creates a new network transport with the given dialer +// and listener. The maxPool controls how many connections we will pool. The +// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply +// the timeout by (SnapshotSize / TimeoutScale). +func NewNetworkTransport( + stream StreamLayer, + maxPool int, + timeout time.Duration, + logOutput io.Writer, +) *NetworkTransport { + if logOutput == nil { + logOutput = os.Stderr + } + return NewNetworkTransportWithLogger(stream, maxPool, timeout, log.New(logOutput, "", log.LstdFlags)) +} + +// NewNetworkTransportWithLogger creates a new network transport with the given dialer +// and listener. The maxPool controls how many connections we will pool. The +// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply +// the timeout by (SnapshotSize / TimeoutScale). 
+func NewNetworkTransportWithLogger( + stream StreamLayer, + maxPool int, + timeout time.Duration, + logger *log.Logger, +) *NetworkTransport { + if logger == nil { + logger = log.New(os.Stderr, "", log.LstdFlags) + } + trans := &NetworkTransport{ + connPool: make(map[string][]*netConn), + consumeCh: make(chan RPC), + logger: logger, + maxPool: maxPool, + shutdownCh: make(chan struct{}), + stream: stream, + timeout: timeout, + TimeoutScale: DefaultTimeoutScale, + } + go trans.listen() + return trans +} + +// SetHeartbeatHandler is used to setup a heartbeat handler +// as a fast-pass. This is to avoid head-of-line blocking from +// disk IO. +func (n *NetworkTransport) SetHeartbeatHandler(cb func(rpc RPC)) { + n.heartbeatFnLock.Lock() + defer n.heartbeatFnLock.Unlock() + n.heartbeatFn = cb +} + +// Close is used to stop the network transport. +func (n *NetworkTransport) Close() error { + n.shutdownLock.Lock() + defer n.shutdownLock.Unlock() + + if !n.shutdown { + close(n.shutdownCh) + n.stream.Close() + n.shutdown = true + } + return nil +} + +// Consumer implements the Transport interface. +func (n *NetworkTransport) Consumer() <-chan RPC { + return n.consumeCh +} + +// LocalAddr implements the Transport interface. +func (n *NetworkTransport) LocalAddr() string { + return n.stream.Addr().String() +} + +// IsShutdown is used to check if the transport is shutdown. +func (n *NetworkTransport) IsShutdown() bool { + select { + case <-n.shutdownCh: + return true + default: + return false + } +} + +// getExistingConn is used to grab a pooled connection. +func (n *NetworkTransport) getPooledConn(target string) *netConn { + n.connPoolLock.Lock() + defer n.connPoolLock.Unlock() + + conns, ok := n.connPool[target] + if !ok || len(conns) == 0 { + return nil + } + + var conn *netConn + num := len(conns) + conn, conns[num-1] = conns[num-1], nil + n.connPool[target] = conns[:num-1] + return conn +} + +// getConn is used to get a connection from the pool. +func (n *NetworkTransport) getConn(target string) (*netConn, error) { + // Check for a pooled conn + if conn := n.getPooledConn(target); conn != nil { + return conn, nil + } + + // Dial a new connection + conn, err := n.stream.Dial(target, n.timeout) + if err != nil { + return nil, err + } + + // Wrap the conn + netConn := &netConn{ + target: target, + conn: conn, + r: bufio.NewReader(conn), + w: bufio.NewWriter(conn), + } + + // Setup encoder/decoders + netConn.dec = codec.NewDecoder(netConn.r, &codec.MsgpackHandle{}) + netConn.enc = codec.NewEncoder(netConn.w, &codec.MsgpackHandle{}) + + // Done + return netConn, nil +} + +// returnConn returns a connection back to the pool. +func (n *NetworkTransport) returnConn(conn *netConn) { + n.connPoolLock.Lock() + defer n.connPoolLock.Unlock() + + key := conn.target + conns, _ := n.connPool[key] + + if !n.IsShutdown() && len(conns) < n.maxPool { + n.connPool[key] = append(conns, conn) + } else { + conn.Release() + } +} + +// AppendEntriesPipeline returns an interface that can be used to pipeline +// AppendEntries requests. +func (n *NetworkTransport) AppendEntriesPipeline(target string) (AppendPipeline, error) { + // Get a connection + conn, err := n.getConn(target) + if err != nil { + return nil, err + } + + // Create the pipeline + return newNetPipeline(n, conn), nil +} + +// AppendEntries implements the Transport interface. 
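+// Each call draws an idle connection to the target from the pool (dialing a
+// new one only on a miss) and returns it to the pool once the response has
+// been decoded successfully.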
+func (n *NetworkTransport) AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error { + return n.genericRPC(target, rpcAppendEntries, args, resp) +} + +// RequestVote implements the Transport interface. +func (n *NetworkTransport) RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error { + return n.genericRPC(target, rpcRequestVote, args, resp) +} + +// genericRPC handles a simple request/response RPC. +func (n *NetworkTransport) genericRPC(target string, rpcType uint8, args interface{}, resp interface{}) error { + // Get a conn + conn, err := n.getConn(target) + if err != nil { + return err + } + + // Set a deadline + if n.timeout > 0 { + conn.conn.SetDeadline(time.Now().Add(n.timeout)) + } + + // Send the RPC + if err = sendRPC(conn, rpcType, args); err != nil { + return err + } + + // Decode the response + canReturn, err := decodeResponse(conn, resp) + if canReturn { + n.returnConn(conn) + } + return err +} + +// InstallSnapshot implements the Transport interface. +func (n *NetworkTransport) InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error { + // Get a conn, always close for InstallSnapshot + conn, err := n.getConn(target) + if err != nil { + return err + } + defer conn.Release() + + // Set a deadline, scaled by request size + if n.timeout > 0 { + timeout := n.timeout * time.Duration(args.Size/int64(n.TimeoutScale)) + if timeout < n.timeout { + timeout = n.timeout + } + conn.conn.SetDeadline(time.Now().Add(timeout)) + } + + // Send the RPC + if err = sendRPC(conn, rpcInstallSnapshot, args); err != nil { + return err + } + + // Stream the state + if _, err = io.Copy(conn.w, data); err != nil { + return err + } + + // Flush + if err = conn.w.Flush(); err != nil { + return err + } + + // Decode the response, do not return conn + _, err = decodeResponse(conn, resp) + return err +} + +// EncodePeer implements the Transport interface. +func (n *NetworkTransport) EncodePeer(p string) []byte { + return []byte(p) +} + +// DecodePeer implements the Transport interface. +func (n *NetworkTransport) DecodePeer(buf []byte) string { + return string(buf) +} + +// listen is used to handling incoming connections. +func (n *NetworkTransport) listen() { + for { + // Accept incoming connections + conn, err := n.stream.Accept() + if err != nil { + if n.IsShutdown() { + return + } + n.logger.Printf("[ERR] raft-net: Failed to accept connection: %v", err) + continue + } + n.logger.Printf("[DEBUG] raft-net: %v accepted connection from: %v", n.LocalAddr(), conn.RemoteAddr()) + + // Handle the connection in dedicated routine + go n.handleConn(conn) + } +} + +// handleConn is used to handle an inbound connection for its lifespan. +func (n *NetworkTransport) handleConn(conn net.Conn) { + defer conn.Close() + r := bufio.NewReader(conn) + w := bufio.NewWriter(conn) + dec := codec.NewDecoder(r, &codec.MsgpackHandle{}) + enc := codec.NewEncoder(w, &codec.MsgpackHandle{}) + + for { + if err := n.handleCommand(r, dec, enc); err != nil { + if err != io.EOF { + n.logger.Printf("[ERR] raft-net: Failed to decode incoming command: %v", err) + } + return + } + if err := w.Flush(); err != nil { + n.logger.Printf("[ERR] raft-net: Failed to flush response: %v", err) + return + } + } +} + +// handleCommand is used to decode and dispatch a single command. 
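+// The wire format mirrors sendRPC/decodeResponse on the client side: a single
+// type byte, then the MsgPack-encoded request; the reply is a MsgPack-encoded
+// error string (empty on success) followed by the MsgPack-encoded response.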
+func (n *NetworkTransport) handleCommand(r *bufio.Reader, dec *codec.Decoder, enc *codec.Encoder) error { + // Get the rpc type + rpcType, err := r.ReadByte() + if err != nil { + return err + } + + // Create the RPC object + respCh := make(chan RPCResponse, 1) + rpc := RPC{ + RespChan: respCh, + } + + // Decode the command + isHeartbeat := false + switch rpcType { + case rpcAppendEntries: + var req AppendEntriesRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + + // Check if this is a heartbeat + if req.Term != 0 && req.Leader != nil && + req.PrevLogEntry == 0 && req.PrevLogTerm == 0 && + len(req.Entries) == 0 && req.LeaderCommitIndex == 0 { + isHeartbeat = true + } + + case rpcRequestVote: + var req RequestVoteRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + + case rpcInstallSnapshot: + var req InstallSnapshotRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + rpc.Reader = io.LimitReader(r, req.Size) + + default: + return fmt.Errorf("unknown rpc type %d", rpcType) + } + + // Check for heartbeat fast-path + if isHeartbeat { + n.heartbeatFnLock.Lock() + fn := n.heartbeatFn + n.heartbeatFnLock.Unlock() + if fn != nil { + fn(rpc) + goto RESP + } + } + + // Dispatch the RPC + select { + case n.consumeCh <- rpc: + case <-n.shutdownCh: + return ErrTransportShutdown + } + + // Wait for response +RESP: + select { + case resp := <-respCh: + // Send the error first + respErr := "" + if resp.Error != nil { + respErr = resp.Error.Error() + } + if err := enc.Encode(respErr); err != nil { + return err + } + + // Send the response + if err := enc.Encode(resp.Response); err != nil { + return err + } + case <-n.shutdownCh: + return ErrTransportShutdown + } + return nil +} + +// decodeResponse is used to decode an RPC response and reports whether +// the connection can be reused. +func decodeResponse(conn *netConn, resp interface{}) (bool, error) { + // Decode the error if any + var rpcError string + if err := conn.dec.Decode(&rpcError); err != nil { + conn.Release() + return false, err + } + + // Decode the response + if err := conn.dec.Decode(resp); err != nil { + conn.Release() + return false, err + } + + // Format an error if any + if rpcError != "" { + return true, fmt.Errorf(rpcError) + } + return true, nil +} + +// sendRPC is used to encode and send the RPC. +func sendRPC(conn *netConn, rpcType uint8, args interface{}) error { + // Write the request type + if err := conn.w.WriteByte(rpcType); err != nil { + conn.Release() + return err + } + + // Send the request + if err := conn.enc.Encode(args); err != nil { + conn.Release() + return err + } + + // Flush + if err := conn.w.Flush(); err != nil { + conn.Release() + return err + } + return nil +} + +// newNetPipeline is used to construct a netPipeline from a given +// transport and connection. +func newNetPipeline(trans *NetworkTransport, conn *netConn) *netPipeline { + n := &netPipeline{ + conn: conn, + trans: trans, + doneCh: make(chan AppendFuture, rpcMaxPipeline), + inprogressCh: make(chan *appendFuture, rpcMaxPipeline), + shutdownCh: make(chan struct{}), + } + go n.decodeResponses() + return n +} + +// decodeResponses is a long running routine that decodes the responses +// sent on the connection. 
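+// Because inprogressCh is a FIFO channel, responses are decoded in the same
+// order the requests were written, which is what keeps pipelining over a
+// single connection safe.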
+func (n *netPipeline) decodeResponses() { + timeout := n.trans.timeout + for { + select { + case future := <-n.inprogressCh: + if timeout > 0 { + n.conn.conn.SetReadDeadline(time.Now().Add(timeout)) + } + + _, err := decodeResponse(n.conn, future.resp) + future.respond(err) + select { + case n.doneCh <- future: + case <-n.shutdownCh: + return + } + case <-n.shutdownCh: + return + } + } +} + +// AppendEntries is used to pipeline a new append entries request. +func (n *netPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) { + // Create a new future + future := &appendFuture{ + start: time.Now(), + args: args, + resp: resp, + } + future.init() + + // Add a send timeout + if timeout := n.trans.timeout; timeout > 0 { + n.conn.conn.SetWriteDeadline(time.Now().Add(timeout)) + } + + // Send the RPC + if err := sendRPC(n.conn, rpcAppendEntries, future.args); err != nil { + return nil, err + } + + // Hand-off for decoding, this can also cause back-pressure + // to prevent too many inflight requests + select { + case n.inprogressCh <- future: + return future, nil + case <-n.shutdownCh: + return nil, ErrPipelineShutdown + } +} + +// Consumer returns a channel that can be used to consume complete futures. +func (n *netPipeline) Consumer() <-chan AppendFuture { + return n.doneCh +} + +// Closed is used to shutdown the pipeline connection. +func (n *netPipeline) Close() error { + n.shutdownLock.Lock() + defer n.shutdownLock.Unlock() + if n.shutdown { + return nil + } + + // Release the connection + n.conn.Release() + + n.shutdown = true + close(n.shutdownCh) + return nil +} diff --git a/vendor/github.com/hashicorp/raft/net_transport_test.go b/vendor/github.com/hashicorp/raft/net_transport_test.go new file mode 100644 index 00000000..ca92c897 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/net_transport_test.go @@ -0,0 +1,449 @@ +package raft + +import ( + "bytes" + "reflect" + "sync" + "testing" + "time" +) + +func TestNetworkTransport_StartStop(t *testing.T) { + trans, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + trans.Close() +} + +func TestNetworkTransport_Heartbeat_FastPath(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + + // Make the RPC request + args := AppendEntriesRequest{ + Term: 10, + Leader: []byte("cartman"), + } + resp := AppendEntriesResponse{ + Term: 4, + LastLog: 90, + Success: true, + } + + invoked := false + fastpath := func(rpc RPC) { + // Verify the command + req := rpc.Command.(*AppendEntriesRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + + rpc.Respond(&resp, nil) + invoked = true + } + trans1.SetHeartbeatHandler(fastpath) + + // Transport 2 makes outbound request + trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans2.Close() + + var out AppendEntriesResponse + if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } + + // Ensure fast-path is used + if !invoked { + t.Fatalf("fast-path not used") + } +} + +func 
TestNetworkTransport_AppendEntries(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := AppendEntriesRequest{ + Term: 10, + Leader: []byte("cartman"), + PrevLogEntry: 100, + PrevLogTerm: 4, + Entries: []*Log{ + &Log{ + Index: 101, + Term: 4, + Type: LogNoop, + }, + }, + LeaderCommitIndex: 90, + } + resp := AppendEntriesResponse{ + Term: 4, + LastLog: 90, + Success: true, + } + + // Listen for a request + go func() { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*AppendEntriesRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + }() + + // Transport 2 makes outbound request + trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans2.Close() + + var out AppendEntriesResponse + if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } +} + +func TestNetworkTransport_AppendEntriesPipeline(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := AppendEntriesRequest{ + Term: 10, + Leader: []byte("cartman"), + PrevLogEntry: 100, + PrevLogTerm: 4, + Entries: []*Log{ + &Log{ + Index: 101, + Term: 4, + Type: LogNoop, + }, + }, + LeaderCommitIndex: 90, + } + resp := AppendEntriesResponse{ + Term: 4, + LastLog: 90, + Success: true, + } + + // Listen for a request + go func() { + for i := 0; i < 10; i++ { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*AppendEntriesRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + } + }() + + // Transport 2 makes outbound request + trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans2.Close() + + pipeline, err := trans2.AppendEntriesPipeline(trans1.LocalAddr()) + if err != nil { + t.Fatalf("err: %v", err) + } + defer pipeline.Close() + for i := 0; i < 10; i++ { + out := new(AppendEntriesResponse) + if _, err := pipeline.AppendEntries(&args, out); err != nil { + t.Fatalf("err: %v", err) + } + } + + respCh := pipeline.Consumer() + for i := 0; i < 10; i++ { + select { + case ready := <-respCh: + // Verify the response + if !reflect.DeepEqual(&resp, ready.Response()) { + t.Fatalf("command mismatch: %#v %#v", &resp, ready.Response()) + } + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + } +} + +func TestNetworkTransport_RequestVote(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + rpcCh := trans1.Consumer() + 
+ // Make the RPC request + args := RequestVoteRequest{ + Term: 20, + Candidate: []byte("butters"), + LastLogIndex: 100, + LastLogTerm: 19, + } + resp := RequestVoteResponse{ + Term: 100, + Peers: []byte("blah"), + Granted: false, + } + + // Listen for a request + go func() { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*RequestVoteRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + }() + + // Transport 2 makes outbound request + trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans2.Close() + + var out RequestVoteResponse + if err := trans2.RequestVote(trans1.LocalAddr(), &args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } +} + +func TestNetworkTransport_InstallSnapshot(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := InstallSnapshotRequest{ + Term: 10, + Leader: []byte("kyle"), + LastLogIndex: 100, + LastLogTerm: 9, + Peers: []byte("blah blah"), + Size: 10, + } + resp := InstallSnapshotResponse{ + Term: 10, + Success: true, + } + + // Listen for a request + go func() { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*InstallSnapshotRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + + // Try to read the bytes + buf := make([]byte, 10) + rpc.Reader.Read(buf) + + // Compare + if bytes.Compare(buf, []byte("0123456789")) != 0 { + t.Fatalf("bad buf %v", buf) + } + + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + }() + + // Transport 2 makes outbound request + trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans2.Close() + + // Create a buffer + buf := bytes.NewBuffer([]byte("0123456789")) + + var out InstallSnapshotResponse + if err := trans2.InstallSnapshot(trans1.LocalAddr(), &args, &out, buf); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } +} + +func TestNetworkTransport_EncodeDecode(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + + local := trans1.LocalAddr() + enc := trans1.EncodePeer(local) + dec := trans1.DecodePeer(enc) + + if dec != local { + t.Fatalf("enc/dec fail: %v %v", dec, local) + } +} + +func TestNetworkTransport_PooledConn(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := AppendEntriesRequest{ + Term: 10, + Leader: []byte("cartman"), + PrevLogEntry: 100, + PrevLogTerm: 4, + Entries: []*Log{ + 
&Log{ + Index: 101, + Term: 4, + Type: LogNoop, + }, + }, + LeaderCommitIndex: 90, + } + resp := AppendEntriesResponse{ + Term: 4, + LastLog: 90, + Success: true, + } + + // Listen for a request + go func() { + for { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*AppendEntriesRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + return + } + } + }() + + // Transport 2 makes outbound request, 3 conn pool + trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 3, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans2.Close() + + // Create wait group + wg := &sync.WaitGroup{} + wg.Add(5) + + appendFunc := func() { + defer wg.Done() + var out AppendEntriesResponse + if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } + } + + // Try to do parallel appends, should stress the conn pool + for i := 0; i < 5; i++ { + go appendFunc() + } + + // Wait for the routines to finish + wg.Wait() + + // Check the conn pool size + addr := trans1.LocalAddr() + if len(trans2.connPool[addr]) != 3 { + t.Fatalf("Expected 2 pooled conns!") + } +} diff --git a/vendor/github.com/hashicorp/raft/observer.go b/vendor/github.com/hashicorp/raft/observer.go new file mode 100644 index 00000000..dbd0cc64 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/observer.go @@ -0,0 +1,122 @@ +package raft + +import ( + "sync/atomic" +) + +// Observation is sent along the given channel to observers when an event occurs. +type Observation struct { + // Raft holds the Raft instance generating the observation. + Raft *Raft + // Data holds observation-specific data. Possible types are + // *RequestVoteRequest, RaftState and LeaderObservation. + Data interface{} +} + +// LeaderObservation is used in Observation.Data when leadership changes. +type LeaderObservation struct { + Leader string +} + +// nextObserverId is used to provide a unique ID for each observer to aid in +// deregistration. +var nextObserverID uint64 + +// FilterFn is a function that can be registered in order to filter observations. +// The function reports whether the observation should be included - if +// it returns false, the observation will be filtered out. +type FilterFn func(o *Observation) bool + +// Observer describes what to do with a given observation. +type Observer struct { + // numObserved and numDropped are performance counters for this observer. + // 64 bit types must be 64 bit aligned to use with atomic operations on + // 32 bit platforms, so keep them at the top of the struct. + numObserved uint64 + numDropped uint64 + + // channel receives observations. + channel chan Observation + + // blocking, if true, will cause Raft to block when sending an observation + // to this observer. This should generally be set to false. + blocking bool + + // filter will be called to determine if an observation should be sent to + // the channel. + filter FilterFn + + // id is the ID of this observer in the Raft map. + id uint64 +} + +// NewObserver creates a new observer that can be registered +// to make observations on a Raft instance. Observations +// will be sent on the given channel if they satisfy the +// given filter. 
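+//
+// For example (illustrative), a filter that only passes leadership changes:
+//
+//	filter := func(o *Observation) bool {
+//		_, ok := o.Data.(LeaderObservation)
+//		return ok
+//	}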
+// +// If blocking is true, the observer will block when it can't +// send on the channel, otherwise it may discard events. +func NewObserver(channel chan Observation, blocking bool, filter FilterFn) *Observer { + return &Observer{ + channel: channel, + blocking: blocking, + filter: filter, + id: atomic.AddUint64(&nextObserverID, 1), + } +} + +// GetNumObserved returns the number of observations. +func (or *Observer) GetNumObserved() uint64 { + return atomic.LoadUint64(&or.numObserved) +} + +// GetNumDropped returns the number of dropped observations due to blocking. +func (or *Observer) GetNumDropped() uint64 { + return atomic.LoadUint64(&or.numDropped) +} + +// RegisterObserver registers a new observer. +func (r *Raft) RegisterObserver(or *Observer) { + r.observersLock.Lock() + defer r.observersLock.Unlock() + r.observers[or.id] = or +} + +// DeregisterObserver deregisters an observer. +func (r *Raft) DeregisterObserver(or *Observer) { + r.observersLock.Lock() + defer r.observersLock.Unlock() + delete(r.observers, or.id) +} + +// observe sends an observation to every observer. +func (r *Raft) observe(o interface{}) { + // In general observers should not block. But in any case this isn't + // disastrous as we only hold a read lock, which merely prevents + // registration / deregistration of observers. + r.observersLock.RLock() + defer r.observersLock.RUnlock() + for _, or := range r.observers { + // It's wasteful to do this in the loop, but for the common case + // where there are no observers we won't create any objects. + ob := Observation{Raft: r, Data: o} + if or.filter != nil && !or.filter(&ob) { + continue + } + if or.channel == nil { + continue + } + if or.blocking { + or.channel <- ob + atomic.AddUint64(&or.numObserved, 1) + } else { + select { + case or.channel <- ob: + atomic.AddUint64(&or.numObserved, 1) + default: + atomic.AddUint64(&or.numDropped, 1) + } + } + } +} diff --git a/vendor/github.com/hashicorp/raft/peer.go b/vendor/github.com/hashicorp/raft/peer.go new file mode 100644 index 00000000..6f3bcf85 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/peer.go @@ -0,0 +1,122 @@ +package raft + +import ( + "bytes" + "encoding/json" + "io/ioutil" + "os" + "path/filepath" + "sync" +) + +const ( + jsonPeerPath = "peers.json" +) + +// PeerStore provides an interface for persistent storage and +// retrieval of peers. We use a separate interface than StableStore +// since the peers may need to be edited by a human operator. For example, +// in a two node cluster, the failure of either node requires human intervention +// since consensus is impossible. +type PeerStore interface { + // Peers returns the list of known peers. + Peers() ([]string, error) + + // SetPeers sets the list of known peers. This is invoked when a peer is + // added or removed. + SetPeers([]string) error +} + +// StaticPeers is used to provide a static list of peers. +type StaticPeers struct { + StaticPeers []string + l sync.Mutex +} + +// Peers implements the PeerStore interface. +func (s *StaticPeers) Peers() ([]string, error) { + s.l.Lock() + peers := s.StaticPeers + s.l.Unlock() + return peers, nil +} + +// SetPeers implements the PeerStore interface. +func (s *StaticPeers) SetPeers(p []string) error { + s.l.Lock() + s.StaticPeers = p + s.l.Unlock() + return nil +} + +// JSONPeers is used to provide peer persistence on disk in the form +// of a JSON file. This allows human operators to manipulate the file. 
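+//
+// A sketch of typical use (the directory and addresses are examples only; the
+// file is written to <base>/peers.json as a JSON array of encoded addresses):
+//
+//	_, trans := NewInmemTransport("")
+//	store := NewJSONPeers("/tmp/raft", trans)
+//	if err := store.SetPeers([]string{"10.0.0.1:8300", "10.0.0.2:8300"}); err != nil {
+//		panic(err)
+//	}
+//	peers, _ := store.Peers()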
+type JSONPeers struct { + l sync.Mutex + path string + trans Transport +} + +// NewJSONPeers creates a new JSONPeers store. Requires a transport +// to handle the serialization of network addresses. +func NewJSONPeers(base string, trans Transport) *JSONPeers { + path := filepath.Join(base, jsonPeerPath) + store := &JSONPeers{ + path: path, + trans: trans, + } + return store +} + +// Peers implements the PeerStore interface. +func (j *JSONPeers) Peers() ([]string, error) { + j.l.Lock() + defer j.l.Unlock() + + // Read the file + buf, err := ioutil.ReadFile(j.path) + if err != nil && !os.IsNotExist(err) { + return nil, err + } + + // Check for no peers + if len(buf) == 0 { + return nil, nil + } + + // Decode the peers + var peerSet []string + dec := json.NewDecoder(bytes.NewReader(buf)) + if err := dec.Decode(&peerSet); err != nil { + return nil, err + } + + // Deserialize each peer + var peers []string + for _, p := range peerSet { + peers = append(peers, j.trans.DecodePeer([]byte(p))) + } + return peers, nil +} + +// SetPeers implements the PeerStore interface. +func (j *JSONPeers) SetPeers(peers []string) error { + j.l.Lock() + defer j.l.Unlock() + + // Encode each peer + var peerSet []string + for _, p := range peers { + peerSet = append(peerSet, string(j.trans.EncodePeer(p))) + } + + // Convert to JSON + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + if err := enc.Encode(peerSet); err != nil { + return err + } + + // Write out as JSON + return ioutil.WriteFile(j.path, buf.Bytes(), 0755) +} diff --git a/vendor/github.com/hashicorp/raft/peer_test.go b/vendor/github.com/hashicorp/raft/peer_test.go new file mode 100644 index 00000000..ff835e02 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/peer_test.go @@ -0,0 +1,44 @@ +package raft + +import ( + "io/ioutil" + "os" + "testing" +) + +func TestJSONPeers(t *testing.T) { + // Create a test dir + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(dir) + + // Create the store + _, trans := NewInmemTransport("") + store := NewJSONPeers(dir, trans) + + // Try a read, should get nothing + peers, err := store.Peers() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(peers) != 0 { + t.Fatalf("peers: %v", peers) + } + + // Initialize some peers + newPeers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()} + if err := store.SetPeers(newPeers); err != nil { + t.Fatalf("err: %v", err) + } + + // Try a read, should peers + peers, err = store.Peers() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(peers) != 3 { + t.Fatalf("peers: %v", peers) + } +} diff --git a/vendor/github.com/hashicorp/raft/raft.go b/vendor/github.com/hashicorp/raft/raft.go new file mode 100644 index 00000000..c5dac733 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/raft.go @@ -0,0 +1,1925 @@ +package raft + +import ( + "bytes" + "errors" + "fmt" + "io" + "io/ioutil" + "log" + "os" + "strconv" + "sync" + "time" + + "github.com/armon/go-metrics" +) + +const ( + minCheckInterval = 10 * time.Millisecond +) + +var ( + keyCurrentTerm = []byte("CurrentTerm") + keyLastVoteTerm = []byte("LastVoteTerm") + keyLastVoteCand = []byte("LastVoteCand") + + // ErrLeader is returned when an operation can't be completed on a + // leader node. + ErrLeader = errors.New("node is the leader") + + // ErrNotLeader is returned when an operation can't be completed on a + // follower or candidate node. 
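+	// A common pattern on receiving this error is to look up the current
+	// leader and retry or forward the request there.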
+ ErrNotLeader = errors.New("node is not the leader") + + // ErrLeadershipLost is returned when a leader fails to commit a log entry + // because it's been deposed in the process. + ErrLeadershipLost = errors.New("leadership lost while committing log") + + // ErrRaftShutdown is returned when operations are requested against an + // inactive Raft. + ErrRaftShutdown = errors.New("raft is already shutdown") + + // ErrEnqueueTimeout is returned when a command fails due to a timeout. + ErrEnqueueTimeout = errors.New("timed out enqueuing operation") + + // ErrKnownPeer is returned when trying to add a peer to the configuration + // that already exists. + ErrKnownPeer = errors.New("peer already known") + + // ErrUnknownPeer is returned when trying to remove a peer from the + // configuration that doesn't exist. + ErrUnknownPeer = errors.New("peer is unknown") + + // ErrNothingNewToSnapshot is returned when trying to create a snapshot + // but there's nothing new commited to the FSM since we started. + ErrNothingNewToSnapshot = errors.New("Nothing new to snapshot") +) + +// commitTuple is used to send an index that was committed, +// with an optional associated future that should be invoked. +type commitTuple struct { + log *Log + future *logFuture +} + +// leaderState is state that is used while we are a leader. +type leaderState struct { + commitCh chan struct{} + inflight *inflight + replState map[string]*followerReplication + notify map[*verifyFuture]struct{} + stepDown chan struct{} +} + +// Raft implements a Raft node. +type Raft struct { + raftState + + // applyCh is used to async send logs to the main thread to + // be committed and applied to the FSM. + applyCh chan *logFuture + + // Configuration provided at Raft initialization + conf *Config + + // FSM is the client state machine to apply commands to + fsm FSM + + // fsmCommitCh is used to trigger async application of logs to the fsm + fsmCommitCh chan commitTuple + + // fsmRestoreCh is used to trigger a restore from snapshot + fsmRestoreCh chan *restoreFuture + + // fsmSnapshotCh is used to trigger a new snapshot being taken + fsmSnapshotCh chan *reqSnapshotFuture + + // lastContact is the last time we had contact from the + // leader node. This can be used to gauge staleness. 
+ lastContact time.Time + lastContactLock sync.RWMutex + + // Leader is the current cluster leader + leader string + leaderLock sync.RWMutex + + // leaderCh is used to notify of leadership changes + leaderCh chan bool + + // leaderState used only while state is leader + leaderState leaderState + + // Stores our local addr + localAddr string + + // Used for our logging + logger *log.Logger + + // LogStore provides durable storage for logs + logs LogStore + + // Track our known peers + peerCh chan *peerFuture + peers []string + peerStore PeerStore + + // RPC chan comes from the transport layer + rpcCh <-chan RPC + + // Shutdown channel to exit, protected to prevent concurrent exits + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex + + // snapshots is used to store and retrieve snapshots + snapshots SnapshotStore + + // snapshotCh is used for user triggered snapshots + snapshotCh chan *snapshotFuture + + // stable is a StableStore implementation for durable state + // It provides stable storage for many fields in raftState + stable StableStore + + // The transport layer we use + trans Transport + + // verifyCh is used to async send verify futures to the main thread + // to verify we are still the leader + verifyCh chan *verifyFuture + + // List of observers and the mutex that protects them. The observers list + // is indexed by an artificial ID which is used for deregistration. + observersLock sync.RWMutex + observers map[uint64]*Observer +} + +// NewRaft is used to construct a new Raft node. It takes a configuration, as well +// as implementations of various interfaces that are required. If we have any old state, +// such as snapshots, logs, peers, etc, all those will be restored when creating the +// Raft node. +func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps SnapshotStore, + peerStore PeerStore, trans Transport) (*Raft, error) { + // Validate the configuration + if err := ValidateConfig(conf); err != nil { + return nil, err + } + + // Ensure we have a LogOutput + var logger *log.Logger + if conf.Logger != nil { + logger = conf.Logger + } else { + if conf.LogOutput == nil { + conf.LogOutput = os.Stderr + } + logger = log.New(conf.LogOutput, "", log.LstdFlags) + } + + // Try to restore the current term + currentTerm, err := stable.GetUint64(keyCurrentTerm) + if err != nil && err.Error() != "not found" { + return nil, fmt.Errorf("failed to load current term: %v", err) + } + + // Read the last log value + lastIdx, err := logs.LastIndex() + if err != nil { + return nil, fmt.Errorf("failed to find last log: %v", err) + } + + // Get the log + var lastLog Log + if lastIdx > 0 { + if err = logs.GetLog(lastIdx, &lastLog); err != nil { + return nil, fmt.Errorf("failed to get last log: %v", err) + } + } + + // Construct the list of peers that excludes us + localAddr := trans.LocalAddr() + peers, err := peerStore.Peers() + if err != nil { + return nil, fmt.Errorf("failed to get list of peers: %v", err) + } + peers = ExcludePeer(peers, localAddr) + + // Create Raft struct + r := &Raft{ + applyCh: make(chan *logFuture), + conf: conf, + fsm: fsm, + fsmCommitCh: make(chan commitTuple, 128), + fsmRestoreCh: make(chan *restoreFuture), + fsmSnapshotCh: make(chan *reqSnapshotFuture), + leaderCh: make(chan bool), + localAddr: localAddr, + logger: logger, + logs: logs, + peerCh: make(chan *peerFuture), + peers: peers, + peerStore: peerStore, + rpcCh: trans.Consumer(), + snapshots: snaps, + snapshotCh: make(chan *snapshotFuture), + shutdownCh: make(chan struct{}), + 
stable: stable, + trans: trans, + verifyCh: make(chan *verifyFuture, 64), + observers: make(map[uint64]*Observer), + } + + // Initialize as a follower + r.setState(Follower) + + // Start as leader if specified. This should only be used + // for testing purposes. + if conf.StartAsLeader { + r.setState(Leader) + r.setLeader(r.localAddr) + } + + // Restore the current term and the last log + r.setCurrentTerm(currentTerm) + r.setLastLog(lastLog.Index, lastLog.Term) + + // Attempt to restore a snapshot if there are any + if err := r.restoreSnapshot(); err != nil { + return nil, err + } + + // Setup a heartbeat fast-path to avoid head-of-line + // blocking where possible. It MUST be safe for this + // to be called concurrently with a blocking RPC. + trans.SetHeartbeatHandler(r.processHeartbeat) + + // Start the background work + r.goFunc(r.run) + r.goFunc(r.runFSM) + r.goFunc(r.runSnapshots) + return r, nil +} + +// Leader is used to return the current leader of the cluster. +// It may return empty string if there is no current leader +// or the leader is unknown. +func (r *Raft) Leader() string { + r.leaderLock.RLock() + leader := r.leader + r.leaderLock.RUnlock() + return leader +} + +// setLeader is used to modify the current leader of the cluster +func (r *Raft) setLeader(leader string) { + r.leaderLock.Lock() + oldLeader := r.leader + r.leader = leader + r.leaderLock.Unlock() + if oldLeader != leader { + r.observe(LeaderObservation{Leader: leader}) + } +} + +// Apply is used to apply a command to the FSM in a highly consistent +// manner. This returns a future that can be used to wait on the application. +// An optional timeout can be provided to limit the amount of time we wait +// for the command to be started. This must be run on the leader or it +// will fail. +func (r *Raft) Apply(cmd []byte, timeout time.Duration) ApplyFuture { + metrics.IncrCounter([]string{"raft", "apply"}, 1) + var timer <-chan time.Time + if timeout > 0 { + timer = time.After(timeout) + } + + // Create a log future, no index or term yet + logFuture := &logFuture{ + log: Log{ + Type: LogCommand, + Data: cmd, + }, + } + logFuture.init() + + select { + case <-timer: + return errorFuture{ErrEnqueueTimeout} + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.applyCh <- logFuture: + return logFuture + } +} + +// Barrier is used to issue a command that blocks until all preceeding +// operations have been applied to the FSM. It can be used to ensure the +// FSM reflects all queued writes. An optional timeout can be provided to +// limit the amount of time we wait for the command to be started. This +// must be run on the leader or it will fail. +func (r *Raft) Barrier(timeout time.Duration) Future { + metrics.IncrCounter([]string{"raft", "barrier"}, 1) + var timer <-chan time.Time + if timeout > 0 { + timer = time.After(timeout) + } + + // Create a log future, no index or term yet + logFuture := &logFuture{ + log: Log{ + Type: LogBarrier, + }, + } + logFuture.init() + + select { + case <-timer: + return errorFuture{ErrEnqueueTimeout} + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.applyCh <- logFuture: + return logFuture + } +} + +// VerifyLeader is used to ensure the current node is still +// the leader. This can be done to prevent stale reads when a +// new leader has potentially been elected. 
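// Illustrative sketch (not from the vendored library): how a caller might pair
// Apply, defined above, with VerifyLeader, defined below, to guard a read
// against a deposed leader. The readFromFSM parameter and the 5*time.Second
// timeout are hypothetical; only Apply, VerifyLeader, and the Future.Error
// contract come from this package.
func writeThenVerifiedRead(r *Raft, cmd []byte, readFromFSM func() ([]byte, error)) ([]byte, error) {
	// Apply responds with ErrNotLeader when invoked on a follower or candidate.
	if err := r.Apply(cmd, 5*time.Second).Error(); err != nil {
		return nil, err
	}
	// VerifyLeader round-trips with a quorum, so a stale leader fails here
	// rather than serving an out-of-date read.
	if err := r.VerifyLeader().Error(); err != nil {
		return nil, err
	}
	return readFromFSM()
}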
+func (r *Raft) VerifyLeader() Future { + metrics.IncrCounter([]string{"raft", "verify_leader"}, 1) + verifyFuture := &verifyFuture{} + verifyFuture.init() + select { + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.verifyCh <- verifyFuture: + return verifyFuture + } +} + +// AddPeer is used to add a new peer into the cluster. This must be +// run on the leader or it will fail. +func (r *Raft) AddPeer(peer string) Future { + logFuture := &logFuture{ + log: Log{ + Type: LogAddPeer, + peer: peer, + }, + } + logFuture.init() + select { + case r.applyCh <- logFuture: + return logFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } +} + +// RemovePeer is used to remove a peer from the cluster. If the +// current leader is being removed, it will cause a new election +// to occur. This must be run on the leader or it will fail. +func (r *Raft) RemovePeer(peer string) Future { + logFuture := &logFuture{ + log: Log{ + Type: LogRemovePeer, + peer: peer, + }, + } + logFuture.init() + select { + case r.applyCh <- logFuture: + return logFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } +} + +// SetPeers is used to forcibly replace the set of internal peers and +// the peerstore with the ones specified. This can be considered unsafe. +func (r *Raft) SetPeers(p []string) Future { + peerFuture := &peerFuture{ + peers: p, + } + peerFuture.init() + + select { + case r.peerCh <- peerFuture: + return peerFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } +} + +// Shutdown is used to stop the Raft background routines. +// This is not a graceful operation. Provides a future that +// can be used to block until all background routines have exited. +func (r *Raft) Shutdown() Future { + r.shutdownLock.Lock() + defer r.shutdownLock.Unlock() + + if !r.shutdown { + close(r.shutdownCh) + r.shutdown = true + r.setState(Shutdown) + return &shutdownFuture{r} + } + + // avoid closing transport twice + return &shutdownFuture{nil} +} + +// Snapshot is used to manually force Raft to take a snapshot. +// Returns a future that can be used to block until complete. +func (r *Raft) Snapshot() Future { + snapFuture := &snapshotFuture{} + snapFuture.init() + select { + case r.snapshotCh <- snapFuture: + return snapFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } + +} + +// State is used to return the current raft state. +func (r *Raft) State() RaftState { + return r.getState() +} + +// LeaderCh is used to get a channel which delivers signals on +// acquiring or losing leadership. It sends true if we become +// the leader, and false if we lose it. The channel is not buffered, +// and does not block on writes. +func (r *Raft) LeaderCh() <-chan bool { + return r.leaderCh +} + +func (r *Raft) String() string { + return fmt.Sprintf("Node at %s [%v]", r.localAddr, r.getState()) +} + +// LastContact returns the time of last contact by a leader. +// This only makes sense if we are currently a follower. +func (r *Raft) LastContact() time.Time { + r.lastContactLock.RLock() + last := r.lastContact + r.lastContactLock.RUnlock() + return last +} + +// Stats is used to return a map of various internal stats. This +// should only be used for informative purposes or debugging. +// +// Keys are: "state", "term", "last_log_index", "last_log_term", +// "commit_index", "applied_index", "fsm_pending", +// "last_snapshot_index", "last_snapshot_term", "num_peers" and +// "last_contact". 
+// +// The value of "state" is a numerical value representing a +// RaftState const. +// +// The value of "last_contact" is either "never" if there +// has been no contact with a leader, "0" if the node is in the +// leader state, or the time since last contact with a leader +// formatted as a string. +// +// All other values are uint64s, formatted as strings. +func (r *Raft) Stats() map[string]string { + toString := func(v uint64) string { + return strconv.FormatUint(v, 10) + } + lastLogIndex, lastLogTerm := r.getLastLog() + lastSnapIndex, lastSnapTerm := r.getLastSnapshot() + s := map[string]string{ + "state": r.getState().String(), + "term": toString(r.getCurrentTerm()), + "last_log_index": toString(lastLogIndex), + "last_log_term": toString(lastLogTerm), + "commit_index": toString(r.getCommitIndex()), + "applied_index": toString(r.getLastApplied()), + "fsm_pending": toString(uint64(len(r.fsmCommitCh))), + "last_snapshot_index": toString(lastSnapIndex), + "last_snapshot_term": toString(lastSnapTerm), + "num_peers": toString(uint64(len(r.peers))), + } + last := r.LastContact() + if last.IsZero() { + s["last_contact"] = "never" + } else if r.getState() == Leader { + s["last_contact"] = "0" + } else { + s["last_contact"] = fmt.Sprintf("%v", time.Now().Sub(last)) + } + return s +} + +// LastIndex returns the last index in stable storage, +// either from the last log or from the last snapshot. +func (r *Raft) LastIndex() uint64 { + return r.getLastIndex() +} + +// AppliedIndex returns the last index applied to the FSM. This is generally +// lagging behind the last index, especially for indexes that are persisted but +// have not yet been considered committed by the leader. NOTE - this reflects +// the last index that was sent to the application's FSM over the apply channel +// but DOES NOT mean that the application's FSM has yet consumed it and applied +// it to its internal state. Thus, the application's state may lag behind this +// index. +func (r *Raft) AppliedIndex() uint64 { + return r.getLastApplied() +} + +// runFSM is a long running goroutine responsible for applying logs +// to the FSM. This is done async of other logs since we don't want +// the FSM to block our internal operations. +func (r *Raft) runFSM() { + var lastIndex, lastTerm uint64 + for { + select { + case req := <-r.fsmRestoreCh: + // Open the snapshot + meta, source, err := r.snapshots.Open(req.ID) + if err != nil { + req.respond(fmt.Errorf("failed to open snapshot %v: %v", req.ID, err)) + continue + } + + // Attempt to restore + start := time.Now() + if err := r.fsm.Restore(source); err != nil { + req.respond(fmt.Errorf("failed to restore snapshot %v: %v", req.ID, err)) + source.Close() + continue + } + source.Close() + metrics.MeasureSince([]string{"raft", "fsm", "restore"}, start) + + // Update the last index and term + lastIndex = meta.Index + lastTerm = meta.Term + req.respond(nil) + + case req := <-r.fsmSnapshotCh: + // Is there something to snapshot? 
+ if lastIndex == 0 { + req.respond(ErrNothingNewToSnapshot) + continue + } + + // Get our peers + peers, err := r.peerStore.Peers() + if err != nil { + req.respond(err) + continue + } + + // Start a snapshot + start := time.Now() + snap, err := r.fsm.Snapshot() + metrics.MeasureSince([]string{"raft", "fsm", "snapshot"}, start) + + // Respond to the request + req.index = lastIndex + req.term = lastTerm + req.peers = peers + req.snapshot = snap + req.respond(err) + + case commitEntry := <-r.fsmCommitCh: + // Apply the log if a command + var resp interface{} + if commitEntry.log.Type == LogCommand { + start := time.Now() + resp = r.fsm.Apply(commitEntry.log) + metrics.MeasureSince([]string{"raft", "fsm", "apply"}, start) + } + + // Update the indexes + lastIndex = commitEntry.log.Index + lastTerm = commitEntry.log.Term + + // Invoke the future if given + if commitEntry.future != nil { + commitEntry.future.response = resp + commitEntry.future.respond(nil) + } + case <-r.shutdownCh: + return + } + } +} + +// run is a long running goroutine that runs the Raft FSM. +func (r *Raft) run() { + for { + // Check if we are doing a shutdown + select { + case <-r.shutdownCh: + // Clear the leader to prevent forwarding + r.setLeader("") + return + default: + } + + // Enter into a sub-FSM + switch r.getState() { + case Follower: + r.runFollower() + case Candidate: + r.runCandidate() + case Leader: + r.runLeader() + } + } +} + +// runFollower runs the FSM for a follower. +func (r *Raft) runFollower() { + didWarn := false + r.logger.Printf("[INFO] raft: %v entering Follower state (Leader: %q)", r, r.Leader()) + metrics.IncrCounter([]string{"raft", "state", "follower"}, 1) + heartbeatTimer := randomTimeout(r.conf.HeartbeatTimeout) + for { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case a := <-r.applyCh: + // Reject any operations since we are not the leader + a.respond(ErrNotLeader) + + case v := <-r.verifyCh: + // Reject any operations since we are not the leader + v.respond(ErrNotLeader) + + case p := <-r.peerCh: + // Set the peers + r.peers = ExcludePeer(p.peers, r.localAddr) + p.respond(r.peerStore.SetPeers(p.peers)) + + case <-heartbeatTimer: + // Restart the heartbeat timer + heartbeatTimer = randomTimeout(r.conf.HeartbeatTimeout) + + // Check if we have had a successful contact + lastContact := r.LastContact() + if time.Now().Sub(lastContact) < r.conf.HeartbeatTimeout { + continue + } + + // Heartbeat failed! Transition to the candidate state + lastLeader := r.Leader() + r.setLeader("") + if len(r.peers) == 0 && !r.conf.EnableSingleNode { + if !didWarn { + r.logger.Printf("[WARN] raft: EnableSingleNode disabled, and no known peers. Aborting election.") + didWarn = true + } + } else { + r.logger.Printf(`[WARN] raft: Heartbeat timeout from %q reached, starting election`, lastLeader) + + metrics.IncrCounter([]string{"raft", "transition", "heartbeat_timeout"}, 1) + r.setState(Candidate) + return + } + + case <-r.shutdownCh: + return + } + } +} + +// runCandidate runs the FSM for a candidate. 
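// Illustrative sketch (not from the vendored library): randomTimeout, used by
// runFollower above and runCandidate below, is defined elsewhere in this
// package and is not part of this diff. A typical shape, assuming math/rand is
// available, is a timer that fires somewhere in [d, 2d) so that followers do
// not all start elections at the same instant:
//
//	func randomTimeoutSketch(d time.Duration) <-chan time.Time {
//		if d == 0 {
//			return nil
//		}
//		extra := time.Duration(rand.Int63()) % d
//		return time.After(d + extra)
//	}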
+func (r *Raft) runCandidate() { + r.logger.Printf("[INFO] raft: %v entering Candidate state", r) + metrics.IncrCounter([]string{"raft", "state", "candidate"}, 1) + + // Start vote for us, and set a timeout + voteCh := r.electSelf() + electionTimer := randomTimeout(r.conf.ElectionTimeout) + + // Tally the votes, need a simple majority + grantedVotes := 0 + votesNeeded := r.quorumSize() + r.logger.Printf("[DEBUG] raft: Votes needed: %d", votesNeeded) + + for r.getState() == Candidate { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case vote := <-voteCh: + // Check if the term is greater than ours, bail + if vote.Term > r.getCurrentTerm() { + r.logger.Printf("[DEBUG] raft: Newer term discovered, fallback to follower") + r.setState(Follower) + r.setCurrentTerm(vote.Term) + return + } + + // Check if the vote is granted + if vote.Granted { + grantedVotes++ + r.logger.Printf("[DEBUG] raft: Vote granted from %s. Tally: %d", vote.voter, grantedVotes) + } + + // Check if we've become the leader + if grantedVotes >= votesNeeded { + r.logger.Printf("[INFO] raft: Election won. Tally: %d", grantedVotes) + r.setState(Leader) + r.setLeader(r.localAddr) + return + } + + case a := <-r.applyCh: + // Reject any operations since we are not the leader + a.respond(ErrNotLeader) + + case v := <-r.verifyCh: + // Reject any operations since we are not the leader + v.respond(ErrNotLeader) + + case p := <-r.peerCh: + // Set the peers + r.peers = ExcludePeer(p.peers, r.localAddr) + p.respond(r.peerStore.SetPeers(p.peers)) + // Become a follower again + r.setState(Follower) + return + + case <-electionTimer: + // Election failed! Restart the election. We simply return, + // which will kick us back into runCandidate + r.logger.Printf("[WARN] raft: Election timeout reached, restarting election") + return + + case <-r.shutdownCh: + return + } + } +} + +// runLeader runs the FSM for a leader. Do the setup here and drop into +// the leaderLoop for the hot loop. +func (r *Raft) runLeader() { + r.logger.Printf("[INFO] raft: %v entering Leader state", r) + metrics.IncrCounter([]string{"raft", "state", "leader"}, 1) + + // Notify that we are the leader + asyncNotifyBool(r.leaderCh, true) + + // Push to the notify channel if given + if notify := r.conf.NotifyCh; notify != nil { + select { + case notify <- true: + case <-r.shutdownCh: + } + } + + // Setup leader state + r.leaderState.commitCh = make(chan struct{}, 1) + r.leaderState.inflight = newInflight(r.leaderState.commitCh) + r.leaderState.replState = make(map[string]*followerReplication) + r.leaderState.notify = make(map[*verifyFuture]struct{}) + r.leaderState.stepDown = make(chan struct{}, 1) + + // Cleanup state on step down + defer func() { + // Since we were the leader previously, we update our + // last contact time when we step down, so that we are not + // reporting a last contact time from before we were the + // leader. Otherwise, to a client it would seem our data + // is extremely stale. 
+ r.setLastContact() + + // Stop replication + for _, p := range r.leaderState.replState { + close(p.stopCh) + } + + // Cancel inflight requests + r.leaderState.inflight.Cancel(ErrLeadershipLost) + + // Respond to any pending verify requests + for future := range r.leaderState.notify { + future.respond(ErrLeadershipLost) + } + + // Clear all the state + r.leaderState.commitCh = nil + r.leaderState.inflight = nil + r.leaderState.replState = nil + r.leaderState.notify = nil + r.leaderState.stepDown = nil + + // If we are stepping down for some reason, no known leader. + // We may have stepped down due to an RPC call, which would + // provide the leader, so we cannot always blank this out. + r.leaderLock.Lock() + if r.leader == r.localAddr { + r.leader = "" + } + r.leaderLock.Unlock() + + // Notify that we are not the leader + asyncNotifyBool(r.leaderCh, false) + + // Push to the notify channel if given + if notify := r.conf.NotifyCh; notify != nil { + select { + case notify <- false: + case <-r.shutdownCh: + // On shutdown, make a best effort but do not block + select { + case notify <- false: + default: + } + } + } + }() + + // Start a replication routine for each peer + for _, peer := range r.peers { + r.startReplication(peer) + } + + // Dispatch a no-op log first. Instead of LogNoop, + // we use a LogAddPeer with our peerset. This acts like + // a no-op as well, but when doing an initial bootstrap, ensures + // that all nodes share a common peerset. + peerSet := append([]string{r.localAddr}, r.peers...) + noop := &logFuture{ + log: Log{ + Type: LogAddPeer, + Data: encodePeers(peerSet, r.trans), + }, + } + r.dispatchLogs([]*logFuture{noop}) + + // Disable EnableSingleNode after we've been elected leader. + // This is to prevent a split brain in the future, if we are removed + // from the cluster and then elect ourself as leader. + if r.conf.DisableBootstrapAfterElect && r.conf.EnableSingleNode { + r.logger.Printf("[INFO] raft: Disabling EnableSingleNode (bootstrap)") + r.conf.EnableSingleNode = false + } + + // Sit in the leader loop until we step down + r.leaderLoop() +} + +// startReplication is a helper to setup state and start async replication to a peer. +func (r *Raft) startReplication(peer string) { + lastIdx := r.getLastIndex() + s := &followerReplication{ + peer: peer, + inflight: r.leaderState.inflight, + stopCh: make(chan uint64, 1), + triggerCh: make(chan struct{}, 1), + currentTerm: r.getCurrentTerm(), + matchIndex: 0, + nextIndex: lastIdx + 1, + lastContact: time.Now(), + notifyCh: make(chan struct{}, 1), + stepDown: r.leaderState.stepDown, + } + r.leaderState.replState[peer] = s + r.goFunc(func() { r.replicate(s) }) + asyncNotifyCh(s.triggerCh) +} + +// leaderLoop is the hot loop for a leader. It is invoked +// after all the various leader setup is done. +func (r *Raft) leaderLoop() { + // stepDown is used to track if there is an inflight log that + // would cause us to lose leadership (specifically a RemovePeer of + // ourselves). If this is the case, we must not allow any logs to + // be processed in parallel, otherwise we are basing commit on + // only a single peer (ourself) and replicating to an undefined set + // of peers. 
+ stepDown := false + + lease := time.After(r.conf.LeaderLeaseTimeout) + for r.getState() == Leader { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case <-r.leaderState.stepDown: + r.setState(Follower) + + case <-r.leaderState.commitCh: + // Get the committed messages + committed := r.leaderState.inflight.Committed() + for e := committed.Front(); e != nil; e = e.Next() { + // Measure the commit time + commitLog := e.Value.(*logFuture) + metrics.MeasureSince([]string{"raft", "commitTime"}, commitLog.dispatch) + + // Increment the commit index + idx := commitLog.log.Index + r.setCommitIndex(idx) + r.processLogs(idx, commitLog) + } + + case v := <-r.verifyCh: + if v.quorumSize == 0 { + // Just dispatched, start the verification + r.verifyLeader(v) + + } else if v.votes < v.quorumSize { + // Early return, means there must be a new leader + r.logger.Printf("[WARN] raft: New leader elected, stepping down") + r.setState(Follower) + delete(r.leaderState.notify, v) + v.respond(ErrNotLeader) + + } else { + // Quorum of members agree, we are still leader + delete(r.leaderState.notify, v) + v.respond(nil) + } + + case p := <-r.peerCh: + p.respond(ErrLeader) + + case newLog := <-r.applyCh: + // Group commit, gather all the ready commits + ready := []*logFuture{newLog} + for i := 0; i < r.conf.MaxAppendEntries; i++ { + select { + case newLog := <-r.applyCh: + ready = append(ready, newLog) + default: + break + } + } + + // Handle any peer set changes + n := len(ready) + for i := 0; i < n; i++ { + // Fail all future transactions once stepDown is on + if stepDown { + ready[i].respond(ErrNotLeader) + ready[i], ready[n-1] = ready[n-1], nil + n-- + i-- + continue + } + + // Special case AddPeer and RemovePeer + log := ready[i] + if log.log.Type != LogAddPeer && log.log.Type != LogRemovePeer { + continue + } + + // Check if this log should be ignored. The logs can be + // reordered here since we have not yet assigned an index + // and are not violating any promises. + if !r.preparePeerChange(log) { + ready[i], ready[n-1] = ready[n-1], nil + n-- + i-- + continue + } + + // Apply peer set changes early and check if we will step + // down after the commit of this log. If so, we must not + // allow any future entries to make progress to avoid undefined + // behavior. + if ok := r.processLog(&log.log, nil, true); ok { + stepDown = true + } + } + + // Nothing to do if all logs are invalid + if n == 0 { + continue + } + + // Dispatch the logs + ready = ready[:n] + r.dispatchLogs(ready) + + case <-lease: + // Check if we've exceeded the lease, potentially stepping down + maxDiff := r.checkLeaderLease() + + // Next check interval should adjust for the last node we've + // contacted, without going negative + checkInterval := r.conf.LeaderLeaseTimeout - maxDiff + if checkInterval < minCheckInterval { + checkInterval = minCheckInterval + } + + // Renew the lease timer + lease = time.After(checkInterval) + + case <-r.shutdownCh: + return + } + } +} + +// verifyLeader must be called from the main thread for safety. +// Causes the followers to attempt an immediate heartbeat. 
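// Illustrative sketch (not from the vendored library): consuming the
// leadership notifications produced by runLeader above. LeaderCh is defined
// earlier in this file; the startWork/stopWork hooks and the done channel are
// hypothetical. Sends on the channel are non-blocking, so a slow consumer can
// miss a transition (see the LeaderCh comment).
func watchLeadership(r *Raft, done <-chan struct{}, startWork, stopWork func()) {
	for {
		select {
		case isLeader := <-r.LeaderCh():
			// runLeader signals true on election and false on step-down.
			if isLeader {
				startWork()
			} else {
				stopWork()
			}
		case <-done:
			return
		}
	}
}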
+func (r *Raft) verifyLeader(v *verifyFuture) { + // Current leader always votes for self + v.votes = 1 + + // Set the quorum size, hot-path for single node + v.quorumSize = r.quorumSize() + if v.quorumSize == 1 { + v.respond(nil) + return + } + + // Track this request + v.notifyCh = r.verifyCh + r.leaderState.notify[v] = struct{}{} + + // Trigger immediate heartbeats + for _, repl := range r.leaderState.replState { + repl.notifyLock.Lock() + repl.notify = append(repl.notify, v) + repl.notifyLock.Unlock() + asyncNotifyCh(repl.notifyCh) + } +} + +// checkLeaderLease is used to check if we can contact a quorum of nodes +// within the last leader lease interval. If not, we need to step down, +// as we may have lost connectivity. Returns the maximum duration without +// contact. +func (r *Raft) checkLeaderLease() time.Duration { + // Track contacted nodes, we can always contact ourself + contacted := 1 + + // Check each follower + var maxDiff time.Duration + now := time.Now() + for peer, f := range r.leaderState.replState { + diff := now.Sub(f.LastContact()) + if diff <= r.conf.LeaderLeaseTimeout { + contacted++ + if diff > maxDiff { + maxDiff = diff + } + } else { + // Log at least once at high value, then debug. Otherwise it gets very verbose. + if diff <= 3*r.conf.LeaderLeaseTimeout { + r.logger.Printf("[WARN] raft: Failed to contact %v in %v", peer, diff) + } else { + r.logger.Printf("[DEBUG] raft: Failed to contact %v in %v", peer, diff) + } + } + metrics.AddSample([]string{"raft", "leader", "lastContact"}, float32(diff/time.Millisecond)) + } + + // Verify we can contact a quorum + quorum := r.quorumSize() + if contacted < quorum { + r.logger.Printf("[WARN] raft: Failed to contact quorum of nodes, stepping down") + r.setState(Follower) + metrics.IncrCounter([]string{"raft", "transition", "leader_lease_timeout"}, 1) + } + return maxDiff +} + +// quorumSize is used to return the quorum size +func (r *Raft) quorumSize() int { + return ((len(r.peers) + 1) / 2) + 1 +} + +// preparePeerChange checks if a LogAddPeer or LogRemovePeer should be performed, +// and properly formats the data field on the log before dispatching it. +func (r *Raft) preparePeerChange(l *logFuture) bool { + // Check if this is a known peer + p := l.log.peer + knownPeer := PeerContained(r.peers, p) || r.localAddr == p + + // Ignore known peers on add + if l.log.Type == LogAddPeer && knownPeer { + l.respond(ErrKnownPeer) + return false + } + + // Ignore unknown peers on remove + if l.log.Type == LogRemovePeer && !knownPeer { + l.respond(ErrUnknownPeer) + return false + } + + // Construct the peer set + var peerSet []string + if l.log.Type == LogAddPeer { + peerSet = append([]string{p, r.localAddr}, r.peers...) + } else { + peerSet = ExcludePeer(append([]string{r.localAddr}, r.peers...), p) + } + + // Setup the log + l.log.Data = encodePeers(peerSet, r.trans) + return true +} + +// dispatchLog is called to push a log to disk, mark it +// as inflight and begin replication of it. 
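// Illustrative sketch (not from the vendored library): quorumSize above in
// concrete numbers. r.peers excludes the local node (see NewRaft), so a
// cluster of N nodes has len(r.peers) == N-1 and needs a strict majority of N.
func quorumExamples() {
	for _, peers := range []int{0, 2, 4} { // 1-, 3- and 5-node clusters
		need := ((peers + 1) / 2) + 1 // same formula as quorumSize
		fmt.Printf("%d nodes -> quorum of %d\n", peers+1, need)
	}
}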
+func (r *Raft) dispatchLogs(applyLogs []*logFuture) { + now := time.Now() + defer metrics.MeasureSince([]string{"raft", "leader", "dispatchLog"}, now) + + term := r.getCurrentTerm() + lastIndex := r.getLastIndex() + logs := make([]*Log, len(applyLogs)) + + for idx, applyLog := range applyLogs { + applyLog.dispatch = now + applyLog.log.Index = lastIndex + uint64(idx) + 1 + applyLog.log.Term = term + applyLog.policy = newMajorityQuorum(len(r.peers) + 1) + logs[idx] = &applyLog.log + } + + // Write the log entry locally + if err := r.logs.StoreLogs(logs); err != nil { + r.logger.Printf("[ERR] raft: Failed to commit logs: %v", err) + for _, applyLog := range applyLogs { + applyLog.respond(err) + } + r.setState(Follower) + return + } + + // Add this to the inflight logs, commit + r.leaderState.inflight.StartAll(applyLogs) + + // Update the last log since it's on disk now + r.setLastLog(lastIndex+uint64(len(applyLogs)), term) + + // Notify the replicators of the new log + for _, f := range r.leaderState.replState { + asyncNotifyCh(f.triggerCh) + } +} + +// processLogs is used to process all the logs from the lastApplied +// up to the given index. +func (r *Raft) processLogs(index uint64, future *logFuture) { + // Reject logs we've applied already + lastApplied := r.getLastApplied() + if index <= lastApplied { + r.logger.Printf("[WARN] raft: Skipping application of old log: %d", index) + return + } + + // Apply all the preceding logs + for idx := r.getLastApplied() + 1; idx <= index; idx++ { + // Get the log, either from the future or from our log store + if future != nil && future.log.Index == idx { + r.processLog(&future.log, future, false) + + } else { + l := new(Log) + if err := r.logs.GetLog(idx, l); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at %d: %v", idx, err) + panic(err) + } + r.processLog(l, nil, false) + } + + // Update the lastApplied index and term + r.setLastApplied(idx) + } +} + +// processLog is invoked to process the application of a single committed log. +// Returns if this log entry would cause us to stepDown after it commits. +func (r *Raft) processLog(l *Log, future *logFuture, precommit bool) (stepDown bool) { + switch l.Type { + case LogBarrier: + // Barrier is handled by the FSM + fallthrough + + case LogCommand: + // Forward to the fsm handler + select { + case r.fsmCommitCh <- commitTuple{l, future}: + case <-r.shutdownCh: + if future != nil { + future.respond(ErrRaftShutdown) + } + } + + // Return so that the future is only responded to + // by the FSM handler when the application is done + return + + case LogAddPeer: + fallthrough + case LogRemovePeer: + peers := decodePeers(l.Data, r.trans) + r.logger.Printf("[DEBUG] raft: Node %v updated peer set (%v): %v", r.localAddr, l.Type, peers) + + // If the peer set does not include us, remove all other peers + removeSelf := !PeerContained(peers, r.localAddr) && l.Type == LogRemovePeer + if removeSelf { + // Mark that this operation will cause us to step down as + // leader. This prevents the future logs from being Applied + // from this leader. + stepDown = true + + // We only modify the peers after the commit, otherwise we + // would be using a quorum size of 1 for the RemovePeer operation. + // This is used with the stepDown guard to prevent any other logs. 
+ if !precommit { + r.peers = nil + r.peerStore.SetPeers([]string{r.localAddr}) + } + } else { + r.peers = ExcludePeer(peers, r.localAddr) + r.peerStore.SetPeers(peers) + } + + // Handle replication if we are the leader + if r.getState() == Leader { + for _, p := range r.peers { + if _, ok := r.leaderState.replState[p]; !ok { + r.logger.Printf("[INFO] raft: Added peer %v, starting replication", p) + r.startReplication(p) + } + } + } + + // Stop replication for old nodes + if r.getState() == Leader && !precommit { + var toDelete []string + for _, repl := range r.leaderState.replState { + if !PeerContained(r.peers, repl.peer) { + r.logger.Printf("[INFO] raft: Removed peer %v, stopping replication (Index: %d)", repl.peer, l.Index) + + // Replicate up to this index and stop + repl.stopCh <- l.Index + close(repl.stopCh) + toDelete = append(toDelete, repl.peer) + } + } + for _, name := range toDelete { + delete(r.leaderState.replState, name) + } + } + + // Handle removing ourself + if removeSelf && !precommit { + if r.conf.ShutdownOnRemove { + r.logger.Printf("[INFO] raft: Removed ourself, shutting down") + r.Shutdown() + } else { + r.logger.Printf("[INFO] raft: Removed ourself, transitioning to follower") + r.setState(Follower) + } + } + + case LogNoop: + // Ignore the no-op + default: + r.logger.Printf("[ERR] raft: Got unrecognized log type: %#v", l) + } + + // Invoke the future if given + if future != nil && !precommit { + future.respond(nil) + } + return +} + +// processRPC is called to handle an incoming RPC request. +func (r *Raft) processRPC(rpc RPC) { + switch cmd := rpc.Command.(type) { + case *AppendEntriesRequest: + r.appendEntries(rpc, cmd) + case *RequestVoteRequest: + r.requestVote(rpc, cmd) + case *InstallSnapshotRequest: + r.installSnapshot(rpc, cmd) + default: + r.logger.Printf("[ERR] raft: Got unexpected command: %#v", rpc.Command) + rpc.Respond(nil, fmt.Errorf("unexpected command")) + } +} + +// processHeartbeat is a special handler used just for heartbeat requests +// so that they can be fast-pathed if a transport supports it. +func (r *Raft) processHeartbeat(rpc RPC) { + defer metrics.MeasureSince([]string{"raft", "rpc", "processHeartbeat"}, time.Now()) + + // Check if we are shutdown, just ignore the RPC + select { + case <-r.shutdownCh: + return + default: + } + + // Ensure we are only handling a heartbeat + switch cmd := rpc.Command.(type) { + case *AppendEntriesRequest: + r.appendEntries(rpc, cmd) + default: + r.logger.Printf("[ERR] raft: Expected heartbeat, got command: %#v", rpc.Command) + rpc.Respond(nil, fmt.Errorf("unexpected command")) + } +} + +// appendEntries is invoked when we get an append entries RPC call. 
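// Illustrative sketch (not from the vendored library): a heartbeat is
// essentially an AppendEntriesRequest carrying no Entries. With PrevLogEntry
// left at zero, the handler below skips the previous-log consistency check, so
// an empty request still records the leader's identity, advances the commit
// index if needed, and refreshes the follower's last-contact time.
func heartbeatRequestSketch(r *Raft) *AppendEntriesRequest {
	return &AppendEntriesRequest{
		Term:   r.getCurrentTerm(),
		Leader: r.trans.EncodePeer(r.localAddr),
		// Entries, PrevLogEntry, PrevLogTerm and LeaderCommitIndex stay zero.
	}
}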
+func (r *Raft) appendEntries(rpc RPC, a *AppendEntriesRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "appendEntries"}, time.Now()) + // Setup a response + resp := &AppendEntriesResponse{ + Term: r.getCurrentTerm(), + LastLog: r.getLastIndex(), + Success: false, + NoRetryBackoff: false, + } + var rpcErr error + defer func() { + rpc.Respond(resp, rpcErr) + }() + + // Ignore an older term + if a.Term < r.getCurrentTerm() { + return + } + + // Increase the term if we see a newer one, also transition to follower + // if we ever get an appendEntries call + if a.Term > r.getCurrentTerm() || r.getState() != Follower { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(a.Term) + resp.Term = a.Term + } + + // Save the current leader + r.setLeader(r.trans.DecodePeer(a.Leader)) + + // Verify the last log entry + if a.PrevLogEntry > 0 { + lastIdx, lastTerm := r.getLastEntry() + + var prevLogTerm uint64 + if a.PrevLogEntry == lastIdx { + prevLogTerm = lastTerm + + } else { + var prevLog Log + if err := r.logs.GetLog(a.PrevLogEntry, &prevLog); err != nil { + r.logger.Printf("[WARN] raft: Failed to get previous log: %d %v (last: %d)", + a.PrevLogEntry, err, lastIdx) + resp.NoRetryBackoff = true + return + } + prevLogTerm = prevLog.Term + } + + if a.PrevLogTerm != prevLogTerm { + r.logger.Printf("[WARN] raft: Previous log term mis-match: ours: %d remote: %d", + prevLogTerm, a.PrevLogTerm) + resp.NoRetryBackoff = true + return + } + } + + // Process any new entries + if n := len(a.Entries); n > 0 { + start := time.Now() + first := a.Entries[0] + last := a.Entries[n-1] + + // Delete any conflicting entries + lastLogIdx, _ := r.getLastLog() + if first.Index <= lastLogIdx { + r.logger.Printf("[WARN] raft: Clearing log suffix from %d to %d", first.Index, lastLogIdx) + if err := r.logs.DeleteRange(first.Index, lastLogIdx); err != nil { + r.logger.Printf("[ERR] raft: Failed to clear log suffix: %v", err) + return + } + } + + // Append the entry + if err := r.logs.StoreLogs(a.Entries); err != nil { + r.logger.Printf("[ERR] raft: Failed to append to logs: %v", err) + return + } + + // Update the lastLog + r.setLastLog(last.Index, last.Term) + metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "storeLogs"}, start) + } + + // Update the commit index + if a.LeaderCommitIndex > 0 && a.LeaderCommitIndex > r.getCommitIndex() { + start := time.Now() + idx := min(a.LeaderCommitIndex, r.getLastIndex()) + r.setCommitIndex(idx) + r.processLogs(idx, nil) + metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "processLogs"}, start) + } + + // Everything went well, set success + resp.Success = true + r.setLastContact() + return +} + +// requestVote is invoked when we get an request vote RPC call. 
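// Illustrative sketch (not from the vendored library): the log up-to-date rule
// enforced by requestVote below, pulled out as a pure function. A vote is
// withheld when the local log has a higher last term, or the same last term
// with a higher last index.
func candidateLogIsCurrent(localIdx, localTerm, candIdx, candTerm uint64) bool {
	if localTerm > candTerm {
		return false
	}
	if localTerm == candTerm && localIdx > candIdx {
		return false
	}
	return true
}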
+func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "requestVote"}, time.Now()) + r.observe(*req) + + // Setup a response + resp := &RequestVoteResponse{ + Term: r.getCurrentTerm(), + Peers: encodePeers(r.peers, r.trans), + Granted: false, + } + var rpcErr error + defer func() { + rpc.Respond(resp, rpcErr) + }() + + // Check if we have an existing leader [who's not the candidate] + candidate := r.trans.DecodePeer(req.Candidate) + if leader := r.Leader(); leader != "" && leader != candidate { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since we have a leader: %v", + candidate, leader) + return + } + + // Ignore an older term + if req.Term < r.getCurrentTerm() { + return + } + + // Increase the term if we see a newer one + if req.Term > r.getCurrentTerm() { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(req.Term) + resp.Term = req.Term + } + + // Check if we have voted yet + lastVoteTerm, err := r.stable.GetUint64(keyLastVoteTerm) + if err != nil && err.Error() != "not found" { + r.logger.Printf("[ERR] raft: Failed to get last vote term: %v", err) + return + } + lastVoteCandBytes, err := r.stable.Get(keyLastVoteCand) + if err != nil && err.Error() != "not found" { + r.logger.Printf("[ERR] raft: Failed to get last vote candidate: %v", err) + return + } + + // Check if we've voted in this election before + if lastVoteTerm == req.Term && lastVoteCandBytes != nil { + r.logger.Printf("[INFO] raft: Duplicate RequestVote for same term: %d", req.Term) + if bytes.Compare(lastVoteCandBytes, req.Candidate) == 0 { + r.logger.Printf("[WARN] raft: Duplicate RequestVote from candidate: %s", req.Candidate) + resp.Granted = true + } + return + } + + // Reject if their term is older + lastIdx, lastTerm := r.getLastEntry() + if lastTerm > req.LastLogTerm { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since our last term is greater (%d, %d)", + candidate, lastTerm, req.LastLogTerm) + return + } + + if lastTerm == req.LastLogTerm && lastIdx > req.LastLogIndex { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since our last index is greater (%d, %d)", + candidate, lastIdx, req.LastLogIndex) + return + } + + // Persist a vote for safety + if err := r.persistVote(req.Term, req.Candidate); err != nil { + r.logger.Printf("[ERR] raft: Failed to persist vote: %v", err) + return + } + + resp.Granted = true + r.setLastContact() + return +} + +// installSnapshot is invoked when we get a InstallSnapshot RPC call. +// We must be in the follower state for this, since it means we are +// too far behind a leader for log replay. 
+func (r *Raft) installSnapshot(rpc RPC, req *InstallSnapshotRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "installSnapshot"}, time.Now()) + // Setup a response + resp := &InstallSnapshotResponse{ + Term: r.getCurrentTerm(), + Success: false, + } + var rpcErr error + defer func() { + io.Copy(ioutil.Discard, rpc.Reader) // ensure we always consume all the snapshot data from the stream [see issue #212] + rpc.Respond(resp, rpcErr) + }() + + // Ignore an older term + if req.Term < r.getCurrentTerm() { + r.logger.Printf("[INFO] raft: Ignoring installSnapshot request with older term of %d vs currentTerm %d", req.Term, r.getCurrentTerm()) + return + } + + // Increase the term if we see a newer one + if req.Term > r.getCurrentTerm() { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(req.Term) + resp.Term = req.Term + } + + // Save the current leader + r.setLeader(r.trans.DecodePeer(req.Leader)) + + // Create a new snapshot + sink, err := r.snapshots.Create(req.LastLogIndex, req.LastLogTerm, req.Peers) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to create snapshot to install: %v", err) + rpcErr = fmt.Errorf("failed to create snapshot: %v", err) + return + } + + // Spill the remote snapshot to disk + n, err := io.Copy(sink, rpc.Reader) + if err != nil { + sink.Cancel() + r.logger.Printf("[ERR] raft: Failed to copy snapshot: %v", err) + rpcErr = err + return + } + + // Check that we received it all + if n != req.Size { + sink.Cancel() + r.logger.Printf("[ERR] raft: Failed to receive whole snapshot: %d / %d", n, req.Size) + rpcErr = fmt.Errorf("short read") + return + } + + // Finalize the snapshot + if err := sink.Close(); err != nil { + r.logger.Printf("[ERR] raft: Failed to finalize snapshot: %v", err) + rpcErr = err + return + } + r.logger.Printf("[INFO] raft: Copied %d bytes to local snapshot", n) + + // Restore snapshot + future := &restoreFuture{ID: sink.ID()} + future.init() + select { + case r.fsmRestoreCh <- future: + case <-r.shutdownCh: + future.respond(ErrRaftShutdown) + return + } + + // Wait for the restore to happen + if err := future.Error(); err != nil { + r.logger.Printf("[ERR] raft: Failed to restore snapshot: %v", err) + rpcErr = err + return + } + + // Update the lastApplied so we don't replay old logs + r.setLastApplied(req.LastLogIndex) + + // Update the last stable snapshot info + r.setLastSnapshot(req.LastLogIndex, req.LastLogTerm) + + // Restore the peer set + peers := decodePeers(req.Peers, r.trans) + r.peers = ExcludePeer(peers, r.localAddr) + r.peerStore.SetPeers(peers) + + // Compact logs, continue even if this fails + if err := r.compactLogs(req.LastLogIndex); err != nil { + r.logger.Printf("[ERR] raft: Failed to compact logs: %v", err) + } + + r.logger.Printf("[INFO] raft: Installed remote snapshot") + resp.Success = true + r.setLastContact() + return +} + +// setLastContact is used to set the last contact time to now +func (r *Raft) setLastContact() { + r.lastContactLock.Lock() + r.lastContact = time.Now() + r.lastContactLock.Unlock() +} + +type voteResult struct { + RequestVoteResponse + voter string +} + +// electSelf is used to send a RequestVote RPC to all peers, +// and vote for ourself. This has the side affecting of incrementing +// the current term. The response channel returned is used to wait +// for all the responses (including a vote for ourself). 
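// Illustrative sketch (not from the vendored library): the sink protocol used
// by installSnapshot above and takeSnapshot below — Create, stream the data,
// then exactly one of Close (keep) or Cancel (discard). The function name and
// parameters here are hypothetical; the SnapshotStore/sink calls mirror the
// usage shown in this file.
func writeSnapshotSketch(store SnapshotStore, index, term uint64, peers []byte, data io.Reader) error {
	sink, err := store.Create(index, term, peers)
	if err != nil {
		return err
	}
	if _, err := io.Copy(sink, data); err != nil {
		sink.Cancel()
		return err
	}
	return sink.Close()
}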
+func (r *Raft) electSelf() <-chan *voteResult { + // Create a response channel + respCh := make(chan *voteResult, len(r.peers)+1) + + // Increment the term + r.setCurrentTerm(r.getCurrentTerm() + 1) + + // Construct the request + lastIdx, lastTerm := r.getLastEntry() + req := &RequestVoteRequest{ + Term: r.getCurrentTerm(), + Candidate: r.trans.EncodePeer(r.localAddr), + LastLogIndex: lastIdx, + LastLogTerm: lastTerm, + } + + // Construct a function to ask for a vote + askPeer := func(peer string) { + r.goFunc(func() { + defer metrics.MeasureSince([]string{"raft", "candidate", "electSelf"}, time.Now()) + resp := &voteResult{voter: peer} + err := r.trans.RequestVote(peer, req, &resp.RequestVoteResponse) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to make RequestVote RPC to %v: %v", peer, err) + resp.Term = req.Term + resp.Granted = false + } + + // If we are not a peer, we could have been removed but failed + // to receive the log message. OR it could mean an improperly configured + // cluster. Either way, we should warn + if err == nil { + peerSet := decodePeers(resp.Peers, r.trans) + if !PeerContained(peerSet, r.localAddr) { + r.logger.Printf("[WARN] raft: Remote peer %v does not have local node %v as a peer", + peer, r.localAddr) + } + } + + respCh <- resp + }) + } + + // For each peer, request a vote + for _, peer := range r.peers { + askPeer(peer) + } + + // Persist a vote for ourselves + if err := r.persistVote(req.Term, req.Candidate); err != nil { + r.logger.Printf("[ERR] raft: Failed to persist vote : %v", err) + return nil + } + + // Include our own vote + respCh <- &voteResult{ + RequestVoteResponse: RequestVoteResponse{ + Term: req.Term, + Granted: true, + }, + voter: r.localAddr, + } + return respCh +} + +// persistVote is used to persist our vote for safety. +func (r *Raft) persistVote(term uint64, candidate []byte) error { + if err := r.stable.SetUint64(keyLastVoteTerm, term); err != nil { + return err + } + if err := r.stable.Set(keyLastVoteCand, candidate); err != nil { + return err + } + return nil +} + +// setCurrentTerm is used to set the current term in a durable manner. +func (r *Raft) setCurrentTerm(t uint64) { + // Persist to disk first + if err := r.stable.SetUint64(keyCurrentTerm, t); err != nil { + panic(fmt.Errorf("failed to save current term: %v", err)) + } + r.raftState.setCurrentTerm(t) +} + +// setState is used to update the current state. Any state +// transition causes the known leader to be cleared. This means +// that leader should be set only after updating the state. +func (r *Raft) setState(state RaftState) { + r.setLeader("") + oldState := r.raftState.getState() + r.raftState.setState(state) + if oldState != state { + r.observe(state) + } +} + +// runSnapshots is a long running goroutine used to manage taking +// new snapshots of the FSM. It runs in parallel to the FSM and +// main goroutines, so that snapshots do not block normal operation. 
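// Illustrative sketch (not from the vendored library): how the snapshot
// trigger and log compaction implemented below interact, with assumed numbers.
// Suppose SnapshotThreshold = 8192, TrailingLogs = 10240, the last snapshot is
// at index 20000, and a new snapshot is taken at the last log index 30000:
//
//	delta  = 30000 - 20000 = 10000      // >= 8192, so shouldSnapshot fires
//	maxLog = min(30000, 30000 - 10240)  // = 19760 in compactLogs
//
// Logs up to index 19760 are deleted, leaving 10240 trailing entries even
// though the snapshot already covers everything up to 30000.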
+func (r *Raft) runSnapshots() { + for { + select { + case <-randomTimeout(r.conf.SnapshotInterval): + // Check if we should snapshot + if !r.shouldSnapshot() { + continue + } + + // Trigger a snapshot + if err := r.takeSnapshot(); err != nil { + r.logger.Printf("[ERR] raft: Failed to take snapshot: %v", err) + } + + case future := <-r.snapshotCh: + // User-triggered, run immediately + err := r.takeSnapshot() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to take snapshot: %v", err) + } + future.respond(err) + + case <-r.shutdownCh: + return + } + } +} + +// shouldSnapshot checks if we meet the conditions to take +// a new snapshot. +func (r *Raft) shouldSnapshot() bool { + // Check the last snapshot index + lastSnap, _ := r.getLastSnapshot() + + // Check the last log index + lastIdx, err := r.logs.LastIndex() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to get last log index: %v", err) + return false + } + + // Compare the delta to the threshold + delta := lastIdx - lastSnap + return delta >= r.conf.SnapshotThreshold +} + +// takeSnapshot is used to take a new snapshot. +func (r *Raft) takeSnapshot() error { + defer metrics.MeasureSince([]string{"raft", "snapshot", "takeSnapshot"}, time.Now()) + // Create a snapshot request + req := &reqSnapshotFuture{} + req.init() + + // Wait for dispatch or shutdown + select { + case r.fsmSnapshotCh <- req: + case <-r.shutdownCh: + return ErrRaftShutdown + } + + // Wait until we get a response + if err := req.Error(); err != nil { + if err != ErrNothingNewToSnapshot { + err = fmt.Errorf("failed to start snapshot: %v", err) + } + return err + } + defer req.snapshot.Release() + + // Log that we are starting the snapshot + r.logger.Printf("[INFO] raft: Starting snapshot up to %d", req.index) + + // Encode the peerset + peerSet := encodePeers(req.peers, r.trans) + + // Create a new snapshot + start := time.Now() + sink, err := r.snapshots.Create(req.index, req.term, peerSet) + if err != nil { + return fmt.Errorf("failed to create snapshot: %v", err) + } + metrics.MeasureSince([]string{"raft", "snapshot", "create"}, start) + + // Try to persist the snapshot + start = time.Now() + if err := req.snapshot.Persist(sink); err != nil { + sink.Cancel() + return fmt.Errorf("failed to persist snapshot: %v", err) + } + metrics.MeasureSince([]string{"raft", "snapshot", "persist"}, start) + + // Close and check for error + if err := sink.Close(); err != nil { + return fmt.Errorf("failed to close snapshot: %v", err) + } + + // Update the last stable snapshot info + r.setLastSnapshot(req.index, req.term) + + // Compact the logs + if err := r.compactLogs(req.index); err != nil { + return err + } + + // Log completion + r.logger.Printf("[INFO] raft: Snapshot to %d complete", req.index) + return nil +} + +// compactLogs takes the last inclusive index of a snapshot +// and trims the logs that are no longer needed. +func (r *Raft) compactLogs(snapIdx uint64) error { + defer metrics.MeasureSince([]string{"raft", "compactLogs"}, time.Now()) + // Determine log ranges to compact + minLog, err := r.logs.FirstIndex() + if err != nil { + return fmt.Errorf("failed to get first log index: %v", err) + } + + // Check if we have enough logs to truncate + lastLogIdx, _ := r.getLastLog() + if lastLogIdx <= r.conf.TrailingLogs { + return nil + } + + // Truncate up to the end of the snapshot, or `TrailingLogs` + // back from the head, which ever is further back. 
This ensures + // at least `TrailingLogs` entries, but does not allow logs + // after the snapshot to be removed. + maxLog := min(snapIdx, lastLogIdx-r.conf.TrailingLogs) + + // Log this + r.logger.Printf("[INFO] raft: Compacting logs from %d to %d", minLog, maxLog) + + // Compact the logs + if err := r.logs.DeleteRange(minLog, maxLog); err != nil { + return fmt.Errorf("log compaction failed: %v", err) + } + return nil +} + +// restoreSnapshot attempts to restore the latest snapshots, and fails +// if none of them can be restored. This is called at initialization time, +// and is completely unsafe to call at any other time. +func (r *Raft) restoreSnapshot() error { + snapshots, err := r.snapshots.List() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err) + return err + } + + // Try to load in order of newest to oldest + for _, snapshot := range snapshots { + _, source, err := r.snapshots.Open(snapshot.ID) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapshot.ID, err) + continue + } + defer source.Close() + + if err := r.fsm.Restore(source); err != nil { + r.logger.Printf("[ERR] raft: Failed to restore snapshot %v: %v", snapshot.ID, err) + continue + } + + // Log success + r.logger.Printf("[INFO] raft: Restored from snapshot %v", snapshot.ID) + + // Update the lastApplied so we don't replay old logs + r.setLastApplied(snapshot.Index) + + // Update the last stable snapshot info + r.setLastSnapshot(snapshot.Index, snapshot.Term) + + // Success! + return nil + } + + // If we had snapshots and failed to load them, its an error + if len(snapshots) > 0 { + return fmt.Errorf("failed to load any existing snapshots") + } + return nil +} diff --git a/vendor/github.com/hashicorp/raft/raft_test.go b/vendor/github.com/hashicorp/raft/raft_test.go new file mode 100644 index 00000000..5eb660ae --- /dev/null +++ b/vendor/github.com/hashicorp/raft/raft_test.go @@ -0,0 +1,1845 @@ +package raft + +import ( + "bytes" + "fmt" + "io" + "io/ioutil" + "log" + "os" + "reflect" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/hashicorp/go-msgpack/codec" +) + +// MockFSM is an implementation of the FSM interface, and just stores +// the logs sequentially. 
+type MockFSM struct { + sync.Mutex + logs [][]byte +} + +type MockSnapshot struct { + logs [][]byte + maxIndex int +} + +func (m *MockFSM) Apply(log *Log) interface{} { + m.Lock() + defer m.Unlock() + m.logs = append(m.logs, log.Data) + return len(m.logs) +} + +func (m *MockFSM) Snapshot() (FSMSnapshot, error) { + m.Lock() + defer m.Unlock() + return &MockSnapshot{m.logs, len(m.logs)}, nil +} + +func (m *MockFSM) Restore(inp io.ReadCloser) error { + m.Lock() + defer m.Unlock() + defer inp.Close() + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(inp, &hd) + + m.logs = nil + return dec.Decode(&m.logs) +} + +func (m *MockSnapshot) Persist(sink SnapshotSink) error { + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(sink, &hd) + if err := enc.Encode(m.logs[:m.maxIndex]); err != nil { + sink.Cancel() + return err + } + sink.Close() + return nil +} + +func (m *MockSnapshot) Release() { +} + +// Return configurations optimized for in-memory +func inmemConfig(t *testing.T) *Config { + conf := DefaultConfig() + conf.HeartbeatTimeout = 50 * time.Millisecond + conf.ElectionTimeout = 50 * time.Millisecond + conf.LeaderLeaseTimeout = 50 * time.Millisecond + conf.CommitTimeout = 5 * time.Millisecond + conf.Logger = newTestLogger(t) + return conf +} + +// This can be used as the destination for a logger and it'll +// map them into calls to testing.T.Log, so that you only see +// the logging for failed tests. +type testLoggerAdapter struct { + t *testing.T + prefix string +} + +func (a *testLoggerAdapter) Write(d []byte) (int, error) { + if d[len(d)-1] == '\n' { + d = d[:len(d)-1] + } + if a.prefix != "" { + l := a.prefix + ": " + string(d) + a.t.Log(l) + return len(l), nil + } + + a.t.Log(string(d)) + return len(d), nil +} + +func newTestLogger(t *testing.T) *log.Logger { + return log.New(&testLoggerAdapter{t: t}, "", log.Lmicroseconds) +} + +func newTestLoggerWithPrefix(t *testing.T, prefix string) *log.Logger { + return log.New(&testLoggerAdapter{t: t, prefix: prefix}, "", log.Lmicroseconds) +} + +type cluster struct { + dirs []string + stores []*InmemStore + fsms []*MockFSM + snaps []*FileSnapshotStore + trans []LoopbackTransport + rafts []*Raft + t *testing.T + observationCh chan Observation + conf *Config + propagateTimeout time.Duration + longstopTimeout time.Duration + logger *log.Logger + startTime time.Time + + failedLock sync.Mutex + failedCh chan struct{} + failed bool +} + +func (c *cluster) Merge(other *cluster) { + c.dirs = append(c.dirs, other.dirs...) + c.stores = append(c.stores, other.stores...) + c.fsms = append(c.fsms, other.fsms...) + c.snaps = append(c.snaps, other.snaps...) + c.trans = append(c.trans, other.trans...) + c.rafts = append(c.rafts, other.rafts...) +} + +// notifyFailed will close the failed channel which can signal the goroutine +// running the test that another goroutine has detected a failure in order to +// terminate the test. +func (c *cluster) notifyFailed() { + c.failedLock.Lock() + defer c.failedLock.Unlock() + if !c.failed { + c.failed = true + close(c.failedCh) + } +} + +// Failf provides a logging function that fails the tests, prints the output +// with microseconds, and does not mysteriously eat the string. This can be +// safely called from goroutines but won't immediately halt the test. The +// failedCh will be closed to allow blocking functions in the main thread to +// detect the failure and react. 
Note that you should arrange for the main +// thread to block until all goroutines have completed in order to reliably +// fail tests using this function. +func (c *cluster) Failf(format string, args ...interface{}) { + c.logger.Printf(format, args...) + c.t.Fail() + c.notifyFailed() +} + +// FailNowf provides a logging function that fails the tests, prints the output +// with microseconds, and does not mysteriously eat the string. FailNowf must be +// called from the goroutine running the test or benchmark function, not from +// other goroutines created during the test. Calling FailNowf does not stop +// those other goroutines. +func (c *cluster) FailNowf(format string, args ...interface{}) { + c.logger.Printf(format, args...) + c.t.FailNow() +} + +// Close shuts down the cluster and cleans up. +func (c *cluster) Close() { + var futures []Future + for _, r := range c.rafts { + futures = append(futures, r.Shutdown()) + } + + // Wait for shutdown + limit := time.AfterFunc(c.longstopTimeout, func() { + // We can't FailNowf here, and c.Failf won't do anything if we + // hang, so panic. + panic("timed out waiting for shutdown") + }) + defer limit.Stop() + + for _, f := range futures { + if err := f.Error(); err != nil { + c.FailNowf("[ERR] shutdown future err: %v", err) + } + } + + for _, d := range c.dirs { + os.RemoveAll(d) + } +} + +// WaitEventChan returns a channel which will signal if an observation is made +// or a timeout occurs. It is possible to set a filter to look for specific +// observations. Setting timeout to 0 means that it will wait forever until a +// non-filtered observation is made. +func (c *cluster) WaitEventChan(filter FilterFn, timeout time.Duration) <-chan struct{} { + ch := make(chan struct{}) + go func() { + defer close(ch) + var timeoutCh <-chan time.Time + if timeout > 0 { + timeoutCh = time.After(timeout) + } + for { + select { + case <-timeoutCh: + return + + case o, ok := <-c.observationCh: + if !ok || filter == nil || filter(&o) { + return + } + } + } + }() + return ch +} + +// WaitEvent waits until an observation is made, a timeout occurs, or a test +// failure is signaled. It is possible to set a filter to look for specific +// observations. Setting timeout to 0 means that it will wait forever until a +// non-filtered observation is made or a test failure is signaled. +func (c *cluster) WaitEvent(filter FilterFn, timeout time.Duration) { + select { + case <-c.failedCh: + c.t.FailNow() + + case <-c.WaitEventChan(filter, timeout): + } +} + +// WaitForReplication blocks until every FSM in the cluster has the given +// length, or the long sanity check timeout expires. +func (c *cluster) WaitForReplication(fsmLength int) { + limitCh := time.After(c.longstopTimeout) + +CHECK: + for { + ch := c.WaitEventChan(nil, c.conf.CommitTimeout) + select { + case <-c.failedCh: + c.t.FailNow() + + case <-limitCh: + c.FailNowf("[ERR] Timeout waiting for replication") + + case <-ch: + for _, fsm := range c.fsms { + fsm.Lock() + num := len(fsm.logs) + fsm.Unlock() + if num != fsmLength { + continue CHECK + } + } + return + } + } +} + +// pollState takes a snapshot of the state of the cluster. This might not be +// stable, so use GetInState() to apply some additional checks when waiting +// for the cluster to achieve a particular state. 
+func (c *cluster) pollState(s RaftState) ([]*Raft, uint64) { + var highestTerm uint64 + in := make([]*Raft, 0, 1) + for _, r := range c.rafts { + if r.State() == s { + in = append(in, r) + } + term := r.getCurrentTerm() + if term > highestTerm { + highestTerm = term + } + } + return in, highestTerm +} + +// GetInState polls the state of the cluster and attempts to identify when it has +// settled into the given state. +func (c *cluster) GetInState(s RaftState) []*Raft { + c.logger.Printf("[INFO] Starting stability test for raft state: %+v", s) + limitCh := time.After(c.longstopTimeout) + + // An election should complete after 2 * max(HeartbeatTimeout, ElectionTimeout) + // because of the randomised timer expiring in 1 x interval ... 2 x interval. + // We add a bit for propagation delay. If the election fails (e.g. because + // two elections start at once), we will have got something through our + // observer channel indicating a different state (i.e. one of the nodes + // will have moved to candidate state) which will reset the timer. + // + // Because of an implementation peculiarity, it can actually be 3 x timeout. + timeout := c.conf.HeartbeatTimeout + if timeout < c.conf.ElectionTimeout { + timeout = c.conf.ElectionTimeout + } + timeout = 2*timeout + c.conf.CommitTimeout + timer := time.NewTimer(timeout) + defer timer.Stop() + + // Wait until we have a stable instate slice. Each time we see an + // observation a state has changed, recheck it and if it has changed, + // restart the timer. + var pollStartTime = time.Now() + for { + inState, highestTerm := c.pollState(s) + inStateTime := time.Now() + + // Sometimes this routine is called very early on before the + // rafts have started up. We then timeout even though no one has + // even started an election. So if the highest term in use is + // zero, we know there are no raft processes that have yet issued + // a RequestVote, and we set a long time out. This is fixed when + // we hear the first RequestVote, at which point we reset the + // timer. + if highestTerm == 0 { + timer.Reset(c.longstopTimeout) + } else { + timer.Reset(timeout) + } + + // Filter will wake up whenever we observe a RequestVote. + filter := func(ob *Observation) bool { + switch ob.Data.(type) { + case RaftState: + return true + case RequestVoteRequest: + return true + default: + return false + } + } + + select { + case <-c.failedCh: + c.t.FailNow() + + case <-limitCh: + c.FailNowf("[ERR] Timeout waiting for stable %s state", s) + + case <-c.WaitEventChan(filter, 0): + c.logger.Printf("[DEBUG] Resetting stability timeout") + + case t, ok := <-timer.C: + if !ok { + c.FailNowf("[ERR] Timer channel errored") + } + c.logger.Printf("[INFO] Stable state for %s reached at %s (%d nodes), %s from start of poll, %s from cluster start. Timeout at %s, %s after stability", + s, inStateTime, len(inState), inStateTime.Sub(pollStartTime), inStateTime.Sub(c.startTime), t, t.Sub(inStateTime)) + return inState + } + } +} + +// Leader waits for the cluster to elect a leader and stay in a stable state. +func (c *cluster) Leader() *Raft { + leaders := c.GetInState(Leader) + if len(leaders) != 1 { + c.FailNowf("[ERR] expected one leader: %v", leaders) + } + return leaders[0] +} + +// Followers waits for the cluster to have N-1 followers and stay in a stable +// state. 
+func (c *cluster) Followers() []*Raft { + expFollowers := len(c.rafts) - 1 + followers := c.GetInState(Follower) + if len(followers) != expFollowers { + c.FailNowf("[ERR] timeout waiting for %d followers (followers are %v)", expFollowers, followers) + } + return followers +} + +// FullyConnect connects all the transports together. +func (c *cluster) FullyConnect() { + c.logger.Printf("[DEBUG] Fully Connecting") + for i, t1 := range c.trans { + for j, t2 := range c.trans { + if i != j { + t1.Connect(t2.LocalAddr(), t2) + t2.Connect(t1.LocalAddr(), t1) + } + } + } +} + +// Disconnect disconnects all transports from the given address. +func (c *cluster) Disconnect(a string) { + c.logger.Printf("[DEBUG] Disconnecting %v", a) + for _, t := range c.trans { + if t.LocalAddr() == a { + t.DisconnectAll() + } else { + t.Disconnect(a) + } + } +} + +// IndexOf returns the index of the given raft instance. +func (c *cluster) IndexOf(r *Raft) int { + for i, n := range c.rafts { + if n == r { + return i + } + } + return -1 +} + +// EnsureLeader checks that ALL the nodes think the leader is the given expected +// leader. +func (c *cluster) EnsureLeader(t *testing.T, expect string) { + // We assume c.Leader() has been called already; now check all the rafts + // think the leader is correct + fail := false + for _, r := range c.rafts { + leader := r.Leader() + if leader != expect { + if leader == "" { + leader = "[none]" + } + if expect == "" { + c.logger.Printf("[ERR] Peer %s sees leader %v expected [none]", r, leader) + } else { + c.logger.Printf("[ERR] Peer %s sees leader %v expected %v", r, leader, expect) + } + fail = true + } + } + if fail { + c.FailNowf("[ERR] At least one peer has the wrong notion of leader") + } +} + +// EnsureSame makes sure all the FSMs have the same contents. +func (c *cluster) EnsureSame(t *testing.T) { + limit := time.Now().Add(c.longstopTimeout) + first := c.fsms[0] + +CHECK: + first.Lock() + for i, fsm := range c.fsms { + if i == 0 { + continue + } + fsm.Lock() + + if len(first.logs) != len(fsm.logs) { + fsm.Unlock() + if time.Now().After(limit) { + c.FailNowf("[ERR] FSM log length mismatch: %d %d", + len(first.logs), len(fsm.logs)) + } else { + goto WAIT + } + } + + for idx := 0; idx < len(first.logs); idx++ { + if bytes.Compare(first.logs[idx], fsm.logs[idx]) != 0 { + fsm.Unlock() + if time.Now().After(limit) { + c.FailNowf("[ERR] FSM log mismatch at index %d", idx) + } else { + goto WAIT + } + } + } + fsm.Unlock() + } + + first.Unlock() + return + +WAIT: + first.Unlock() + c.WaitEvent(nil, c.conf.CommitTimeout) + goto CHECK +} + +// raftToPeerSet returns the set of peers as a map. +func raftToPeerSet(r *Raft) map[string]struct{} { + peers := make(map[string]struct{}) + peers[r.localAddr] = struct{}{} + + raftPeers, _ := r.peerStore.Peers() + for _, p := range raftPeers { + peers[p] = struct{}{} + } + return peers +} + +// EnsureSamePeers makes sure all the rafts have the same set of peers. +func (c *cluster) EnsureSamePeers(t *testing.T) { + limit := time.Now().Add(c.longstopTimeout) + peerSet := raftToPeerSet(c.rafts[0]) + +CHECK: + for i, raft := range c.rafts { + if i == 0 { + continue + } + + otherSet := raftToPeerSet(raft) + if !reflect.DeepEqual(peerSet, otherSet) { + if time.Now().After(limit) { + c.FailNowf("[ERR] peer mismatch: %v %v", peerSet, otherSet) + } else { + goto WAIT + } + } + } + return + +WAIT: + c.WaitEvent(nil, c.conf.CommitTimeout) + goto CHECK +} + +// makeCluster will return a cluster with the given config and number of peers. 
+// If addPeers is true, they will be added into the peer store before starting, +// otherwise their transports will be wired up but they won't yet have configured +// each other. +func makeCluster(n int, addPeers bool, t *testing.T, conf *Config) *cluster { + if conf == nil { + conf = inmemConfig(t) + } + + c := &cluster{ + observationCh: make(chan Observation, 1024), + conf: conf, + // Propagation takes a maximum of 2 heartbeat timeouts (time to + // get a new heartbeat that would cause a commit) plus a bit. + propagateTimeout: conf.HeartbeatTimeout*2 + conf.CommitTimeout, + longstopTimeout: 5 * time.Second, + logger: newTestLoggerWithPrefix(t, "cluster"), + failedCh: make(chan struct{}), + } + c.t = t + peers := make([]string, 0, n) + + // Setup the stores and transports + for i := 0; i < n; i++ { + dir, err := ioutil.TempDir("", "raft") + if err != nil { + c.FailNowf("[ERR] err: %v ", err) + } + + store := NewInmemStore() + c.dirs = append(c.dirs, dir) + c.stores = append(c.stores, store) + c.fsms = append(c.fsms, &MockFSM{}) + + dir2, snap := FileSnapTest(t) + c.dirs = append(c.dirs, dir2) + c.snaps = append(c.snaps, snap) + + addr, trans := NewInmemTransport("") + c.trans = append(c.trans, trans) + peers = append(peers, addr) + } + + // Wire the transports together + c.FullyConnect() + + // Create all the rafts + c.startTime = time.Now() + for i := 0; i < n; i++ { + if n == 1 { + conf.EnableSingleNode = true + } + + logs := c.stores[i] + store := c.stores[i] + snap := c.snaps[i] + trans := c.trans[i] + + peerStore := &StaticPeers{} + if addPeers { + peerStore.StaticPeers = peers + } + peerConf := conf + peerConf.Logger = newTestLoggerWithPrefix(t, peers[i]) + + raft, err := NewRaft(peerConf, c.fsms[i], logs, store, snap, peerStore, trans) + if err != nil { + c.FailNowf("[ERR] NewRaft failed: %v", err) + } + + raft.RegisterObserver(NewObserver(c.observationCh, false, nil)) + if err != nil { + c.FailNowf("[ERR] RegisterObserver failed: %v", err) + } + c.rafts = append(c.rafts, raft) + } + + return c +} + +// See makeCluster. This adds the peers initially to the peer store. +func MakeCluster(n int, t *testing.T, conf *Config) *cluster { + return makeCluster(n, true, t, conf) +} + +// See makeCluster. This doesn't add the peers initially to the peer store. 
+func MakeClusterNoPeers(n int, t *testing.T, conf *Config) *cluster { + return makeCluster(n, false, t, conf) +} + +func TestRaft_StartStop(t *testing.T) { + c := MakeCluster(1, t, nil) + c.Close() +} + +func TestRaft_AfterShutdown(t *testing.T) { + c := MakeCluster(1, t, nil) + c.Close() + raft := c.rafts[0] + + // Everything should fail now + if f := raft.Apply(nil, 0); f.Error() != ErrRaftShutdown { + c.FailNowf("[ERR] should be shutdown: %v", f.Error()) + } + if f := raft.AddPeer(NewInmemAddr()); f.Error() != ErrRaftShutdown { + c.FailNowf("[ERR] should be shutdown: %v", f.Error()) + } + if f := raft.RemovePeer(NewInmemAddr()); f.Error() != ErrRaftShutdown { + c.FailNowf("[ERR] should be shutdown: %v", f.Error()) + } + if f := raft.Snapshot(); f.Error() != ErrRaftShutdown { + c.FailNowf("[ERR] should be shutdown: %v", f.Error()) + } + + // Should be idempotent + if f := raft.Shutdown(); f.Error() != nil { + c.FailNowf("[ERR] shutdown should be idempotent") + } + +} + +func TestRaft_SingleNode(t *testing.T) { + conf := inmemConfig(t) + c := MakeCluster(1, t, conf) + defer c.Close() + raft := c.rafts[0] + + // Watch leaderCh for change + select { + case v := <-raft.LeaderCh(): + if !v { + c.FailNowf("[ERR] should become leader") + } + case <-time.After(conf.HeartbeatTimeout * 3): + c.FailNowf("[ERR] timeout becoming leader") + } + + // Should be leader + if s := raft.State(); s != Leader { + c.FailNowf("[ERR] expected leader: %v", s) + } + + // Should be able to apply + future := raft.Apply([]byte("test"), c.conf.HeartbeatTimeout) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Check the response + if future.Response().(int) != 1 { + c.FailNowf("[ERR] bad response: %v", future.Response()) + } + + // Check the index + if idx := future.Index(); idx == 0 { + c.FailNowf("[ERR] bad index: %d", idx) + } + + // Check that it is applied to the FSM + if len(c.fsms[0].logs) != 1 { + c.FailNowf("[ERR] did not apply to FSM!") + } +} + +func TestRaft_TripleNode(t *testing.T) { + // Make the cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Should be one leader + c.Followers() + leader := c.Leader() + c.EnsureLeader(t, leader.localAddr) + + // Should be able to apply + future := leader.Apply([]byte("test"), c.conf.CommitTimeout) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + c.WaitForReplication(1) +} + +func TestRaft_LeaderFail(t *testing.T) { + // Make the cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Should be one leader + c.Followers() + leader := c.Leader() + + // Should be able to apply + future := leader.Apply([]byte("test"), c.conf.CommitTimeout) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + c.WaitForReplication(1) + + // Disconnect the leader now + t.Logf("[INFO] Disconnecting %v", leader) + leaderTerm := leader.getCurrentTerm() + c.Disconnect(leader.localAddr) + + // Wait for new leader + limit := time.Now().Add(c.longstopTimeout) + var newLead *Raft + for time.Now().Before(limit) && newLead == nil { + c.WaitEvent(nil, c.conf.CommitTimeout) + leaders := c.GetInState(Leader) + if len(leaders) == 1 && leaders[0] != leader { + newLead = leaders[0] + } + } + if newLead == nil { + c.FailNowf("[ERR] expected new leader") + } + + // Ensure the term is greater + if newLead.getCurrentTerm() <= leaderTerm { + c.FailNowf("[ERR] expected newer term! 
%d %d (%v, %v)", newLead.getCurrentTerm(), leaderTerm, newLead, leader) + } + + // Apply should work not work on old leader + future1 := leader.Apply([]byte("fail"), c.conf.CommitTimeout) + + // Apply should work on newer leader + future2 := newLead.Apply([]byte("apply"), c.conf.CommitTimeout) + + // Future2 should work + if err := future2.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Reconnect the networks + t.Logf("[INFO] Reconnecting %v", leader) + c.FullyConnect() + + // Future1 should fail + if err := future1.Error(); err != ErrLeadershipLost && err != ErrNotLeader { + c.FailNowf("[ERR] err: %v", err) + } + + // Wait for log replication + c.EnsureSame(t) + + // Check two entries are applied to the FSM + for _, fsm := range c.fsms { + fsm.Lock() + if len(fsm.logs) != 2 { + c.FailNowf("[ERR] did not apply both to FSM! %v", fsm.logs) + } + if bytes.Compare(fsm.logs[0], []byte("test")) != 0 { + c.FailNowf("[ERR] first entry should be 'test'") + } + if bytes.Compare(fsm.logs[1], []byte("apply")) != 0 { + c.FailNowf("[ERR] second entry should be 'apply'") + } + fsm.Unlock() + } +} + +func TestRaft_BehindFollower(t *testing.T) { + // Make the cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Disconnect one follower + leader := c.Leader() + followers := c.Followers() + behind := followers[0] + c.Disconnect(behind.localAddr) + + // Commit a lot of things + var future Future + for i := 0; i < 100; i++ { + future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0) + } + + // Wait for the last future to apply + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } else { + t.Logf("[INFO] Finished apply without behind follower") + } + + // Check that we have a non zero last contact + if behind.LastContact().IsZero() { + c.FailNowf("[ERR] expected previous contact") + } + + // Reconnect the behind node + c.FullyConnect() + + // Ensure all the logs are the same + c.EnsureSame(t) + + // Ensure one leader + leader = c.Leader() + c.EnsureLeader(t, leader.localAddr) +} + +func TestRaft_ApplyNonLeader(t *testing.T) { + // Make the cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Wait for a leader + c.Leader() + + // Try to apply to them + followers := c.GetInState(Follower) + if len(followers) != 2 { + c.FailNowf("[ERR] Expected 2 followers") + } + follower := followers[0] + + // Try to apply + future := follower.Apply([]byte("test"), c.conf.CommitTimeout) + if future.Error() != ErrNotLeader { + c.FailNowf("[ERR] should not apply on follower") + } + + // Should be cached + if future.Error() != ErrNotLeader { + c.FailNowf("[ERR] should not apply on follower") + } +} + +func TestRaft_ApplyConcurrent(t *testing.T) { + // Make the cluster + conf := inmemConfig(t) + conf.HeartbeatTimeout = 2 * conf.HeartbeatTimeout + conf.ElectionTimeout = 2 * conf.ElectionTimeout + c := MakeCluster(3, t, conf) + defer c.Close() + + // Wait for a leader + leader := c.Leader() + + // Create a wait group + const sz = 100 + var group sync.WaitGroup + group.Add(sz) + + applyF := func(i int) { + defer group.Done() + future := leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0) + if err := future.Error(); err != nil { + c.Failf("[ERR] err: %v", err) + } + } + + // Concurrently apply + for i := 0; i < sz; i++ { + go applyF(i) + } + + // Wait to finish + doneCh := make(chan struct{}) + go func() { + group.Wait() + close(doneCh) + }() + select { + case <-doneCh: + case <-time.After(c.longstopTimeout): + c.FailNowf("[ERR] timeout") + } + + // If anything failed up 
to this point then bail now, rather than do a + // confusing compare. + if t.Failed() { + c.FailNowf("[ERR] One or more of the apply operations failed") + } + + // Check the FSMs + c.EnsureSame(t) +} + +func TestRaft_ApplyConcurrent_Timeout(t *testing.T) { + // Make the cluster + conf := inmemConfig(t) + conf.CommitTimeout = 1 * time.Millisecond + conf.HeartbeatTimeout = 2 * conf.HeartbeatTimeout + conf.ElectionTimeout = 2 * conf.ElectionTimeout + c := MakeCluster(1, t, conf) + defer c.Close() + + // Wait for a leader + leader := c.Leader() + + // Enough enqueues should cause at least one timeout... + var didTimeout int32 + for i := 0; (i < 5000) && (atomic.LoadInt32(&didTimeout) == 0); i++ { + go func(i int) { + future := leader.Apply([]byte(fmt.Sprintf("test%d", i)), time.Microsecond) + if future.Error() == ErrEnqueueTimeout { + atomic.StoreInt32(&didTimeout, 1) + } + }(i) + + // Give the leader loop some other things to do in order to + // increase the odds of a timeout. + if i%5 == 0 { + leader.VerifyLeader() + } + } + + // Loop until we see a timeout, or give up. + limit := time.Now().Add(c.longstopTimeout) + for time.Now().Before(limit) { + if atomic.LoadInt32(&didTimeout) != 0 { + return + } + c.WaitEvent(nil, c.propagateTimeout) + } + c.FailNowf("[ERR] Timeout waiting to detect apply timeouts") +} + +func TestRaft_JoinNode(t *testing.T) { + // Make a cluster + c := MakeCluster(2, t, nil) + defer c.Close() + + // Apply a log to this cluster to ensure it is 'newer' + var future Future + leader := c.Leader() + future = leader.Apply([]byte("first"), 0) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } else { + t.Logf("[INFO] Applied log") + } + + // Make a new cluster of 1 + c1 := MakeCluster(1, t, nil) + + // Merge clusters + c.Merge(c1) + c.FullyConnect() + + // Wait until we have 2 leaders + limit := time.Now().Add(c.longstopTimeout) + var leaders []*Raft + for time.Now().Before(limit) && len(leaders) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + leaders = c.GetInState(Leader) + } + if len(leaders) != 2 { + c.FailNowf("[ERR] expected two leader: %v", leaders) + } + + // Join the new node in + future = leader.AddPeer(c1.rafts[0].localAddr) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Wait until we have 2 followers + limit = time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 2 { + c.FailNowf("[ERR] expected two followers: %v", followers) + } + + // Check the FSMs + c.EnsureSame(t) + + // Check the peers + c.EnsureSamePeers(t) + + // Ensure one leader + leader = c.Leader() + c.EnsureLeader(t, leader.localAddr) +} + +func TestRaft_RemoveFollower(t *testing.T) { + // Make a cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have 2 followers + limit := time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 2 { + c.FailNowf("[ERR] expected two followers: %v", followers) + } + + // Remove a follower + follower := followers[0] + future := leader.RemovePeer(follower.localAddr) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Other 
nodes should have fewer peers + if peers, _ := leader.peerStore.Peers(); len(peers) != 2 { + c.FailNowf("[ERR] too many peers") + } + if peers, _ := followers[1].peerStore.Peers(); len(peers) != 2 { + c.FailNowf("[ERR] too many peers") + } +} + +func TestRaft_RemoveLeader(t *testing.T) { + // Make a cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have 2 followers + limit := time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 2 { + c.FailNowf("[ERR] expected two followers: %v", followers) + } + + // Remove the leader + leader.RemovePeer(leader.localAddr) + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Should have a new leader + newLeader := c.Leader() + + // Wait a bit for log application + time.Sleep(c.propagateTimeout) + + // Other nodes should have fewer peers + if peers, _ := newLeader.peerStore.Peers(); len(peers) != 2 { + c.FailNowf("[ERR] too many peers") + } + + // Old leader should be shutdown + if leader.State() != Shutdown { + c.FailNowf("[ERR] leader should be shutdown") + } + + // Old leader should have no peers + if peers, _ := leader.peerStore.Peers(); len(peers) != 1 { + c.FailNowf("[ERR] leader should have no peers") + } +} + +func TestRaft_RemoveLeader_NoShutdown(t *testing.T) { + // Make a cluster + conf := inmemConfig(t) + conf.ShutdownOnRemove = false + c := MakeCluster(3, t, conf) + defer c.Close() + + // Get the leader + c.Followers() + leader := c.Leader() + + // Remove the leader + var removeFuture Future + for i := byte(0); i < 100; i++ { + future := leader.Apply([]byte{i}, 0) + if i == 80 { + removeFuture = leader.RemovePeer(leader.localAddr) + } + if i > 80 { + if err := future.Error(); err == nil || err != ErrNotLeader { + c.FailNowf("[ERR] err: %v, future entries should fail", err) + } + } + } + + if err := removeFuture.Error(); err != nil { + c.FailNowf("[ERR] RemovePeer failed with error %v", err) + } + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Should have a new leader + newLeader := c.Leader() + + // Wait a bit for log application + time.Sleep(c.propagateTimeout) + + // Other nodes should have fewer peers + if peers, _ := newLeader.peerStore.Peers(); len(peers) != 2 { + c.FailNowf("[ERR] too many peers") + } + + // Old leader should be a follower + if leader.State() != Follower { + c.FailNowf("[ERR] leader should be shutdown") + } + + // Old leader should have no peers + if peers, _ := leader.peerStore.Peers(); len(peers) != 1 { + c.FailNowf("[ERR] leader should have no peers") + } + + // Other nodes should have the same state + c.EnsureSame(t) +} + +func TestRaft_RemoveLeader_SplitCluster(t *testing.T) { + // Enable operation after a remove + conf := inmemConfig(t) + conf.EnableSingleNode = true + conf.ShutdownOnRemove = false + conf.DisableBootstrapAfterElect = false + + // Make a cluster + c := MakeCluster(3, t, conf) + defer c.Close() + + // Get the leader + c.Followers() + leader := c.Leader() + + // Remove the leader + leader.RemovePeer(leader.localAddr) + + // Wait until we have 2 leaders + limit := time.Now().Add(c.longstopTimeout) + var leaders []*Raft + for time.Now().Before(limit) && len(leaders) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + leaders = c.GetInState(Leader) + } + if len(leaders) != 2 { + c.FailNowf("[ERR] expected two leader: %v", leaders) + } + + // Old leader 
should have no peers + if len(leader.peers) != 0 { + c.FailNowf("[ERR] leader should have no peers") + } +} + +func TestRaft_AddKnownPeer(t *testing.T) { + // Make a cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + followers := c.GetInState(Follower) + + // Add a follower + future := leader.AddPeer(followers[0].localAddr) + + // Should be already added + if err := future.Error(); err != ErrKnownPeer { + c.FailNowf("[ERR] err: %v", err) + } +} + +func TestRaft_RemoveUnknownPeer(t *testing.T) { + // Make a cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Remove unknown + future := leader.RemovePeer(NewInmemAddr()) + + // Should be already added + if err := future.Error(); err != ErrUnknownPeer { + c.FailNowf("[ERR] err: %v", err) + } +} + +func TestRaft_SnapshotRestore(t *testing.T) { + // Make the cluster + conf := inmemConfig(t) + conf.TrailingLogs = 10 + c := MakeCluster(1, t, conf) + defer c.Close() + + // Commit a lot of things + leader := c.Leader() + var future Future + for i := 0; i < 100; i++ { + future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0) + } + + // Wait for the last future to apply + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Take a snapshot + snapFuture := leader.Snapshot() + if err := snapFuture.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Check for snapshot + if snaps, _ := leader.snapshots.List(); len(snaps) != 1 { + c.FailNowf("[ERR] should have a snapshot") + } + + // Logs should be trimmed + if idx, _ := leader.logs.FirstIndex(); idx != 92 { + c.FailNowf("[ERR] should trim logs to 92: %d", idx) + } + + // Shutdown + shutdown := leader.Shutdown() + if err := shutdown.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Restart the Raft + r := leader + // Can't just reuse the old transport as it will be closed + _, trans2 := NewInmemTransport(r.trans.LocalAddr()) + r, err := NewRaft(r.conf, r.fsm, r.logs, r.stable, + r.snapshots, r.peerStore, trans2) + if err != nil { + c.FailNowf("[ERR] err: %v", err) + } + c.rafts[0] = r + + // We should have restored from the snapshot! 
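+	// The expected index is 101: the 100 commands applied above plus, presumably,
+	// the one internal log entry committed when this node first became leader
+	// (this also matches the earlier trim check: 101 - TrailingLogs + 1 = 92).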
+ if last := r.getLastApplied(); last != 101 { + c.FailNowf("[ERR] bad last: %v", last) + } +} + +func TestRaft_SnapshotRestore_PeerChange(t *testing.T) { + // Make the cluster + conf := inmemConfig(t) + conf.TrailingLogs = 10 + c := MakeCluster(3, t, conf) + defer c.Close() + + // Commit a lot of things + leader := c.Leader() + var future Future + for i := 0; i < 100; i++ { + future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0) + } + + // Wait for the last future to apply + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Take a snapshot + snapFuture := leader.Snapshot() + if err := snapFuture.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Shutdown + shutdown := leader.Shutdown() + if err := shutdown.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Make a separate cluster + c2 := MakeClusterNoPeers(2, t, conf) + defer c2.Close() + + // Kill the old cluster + for _, sec := range c.rafts { + if sec != leader { + sec.Shutdown() + } + } + + // Change the peer addresses + peers := []string{leader.trans.LocalAddr()} + for _, sec := range c2.rafts { + peers = append(peers, sec.trans.LocalAddr()) + } + + // Restart the Raft with new peers + r := leader + peerStore := &StaticPeers{StaticPeers: peers} + // Can't just reuse the old transport as it will be closed + _, trans2 := NewInmemTransport(r.trans.LocalAddr()) + r, err := NewRaft(r.conf, r.fsm, r.logs, r.stable, + r.snapshots, peerStore, trans2) + if err != nil { + c.FailNowf("[ERR] err: %v", err) + } + c.rafts[0] = r + c2.rafts = append(c2.rafts, r) + c2.trans = append(c2.trans, r.trans.(*InmemTransport)) + c2.fsms = append(c2.fsms, r.fsm.(*MockFSM)) + c2.FullyConnect() + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Ensure we elect a leader, and that we replicate + // to our new followers + c2.EnsureSame(t) + + // We should have restored from the snapshot! 
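+	// The expected index is 102: the 101 entries restored from the snapshot plus,
+	// presumably, one further internal entry committed once the re-formed cluster
+	// elects a leader.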
+	if last := r.getLastApplied(); last != 102 {
+		c.FailNowf("[ERR] bad last: %v", last)
+	}
+}
+
+func TestRaft_AutoSnapshot(t *testing.T) {
+	// Make the cluster
+	conf := inmemConfig(t)
+	conf.SnapshotInterval = conf.CommitTimeout * 2
+	conf.SnapshotThreshold = 50
+	conf.TrailingLogs = 10
+	c := MakeCluster(1, t, conf)
+	defer c.Close()
+
+	// Commit a lot of things
+	leader := c.Leader()
+	var future Future
+	for i := 0; i < 100; i++ {
+		future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0)
+	}
+
+	// Wait for the last future to apply
+	if err := future.Error(); err != nil {
+		c.FailNowf("[ERR] err: %v", err)
+	}
+
+	// Wait for a snapshot to happen
+	time.Sleep(c.propagateTimeout)
+
+	// Check for snapshot
+	if snaps, _ := leader.snapshots.List(); len(snaps) == 0 {
+		c.FailNowf("[ERR] should have a snapshot")
+	}
+}
+
+func TestRaft_ManualSnapshot(t *testing.T) {
+	// Make the cluster
+	conf := inmemConfig(t)
+	conf.SnapshotThreshold = 50
+	conf.TrailingLogs = 10
+	c := MakeCluster(1, t, conf)
+	defer c.Close()
+
+	leader := c.Leader()
+	// With nothing committed, asking for a snapshot should return an error
+	ssErr := leader.Snapshot().Error()
+	if ssErr != ErrNothingNewToSnapshot {
+		t.Errorf("Attempt to manually create snapshot should have errored because there's nothing to do: %v", ssErr)
+	}
+	// Commit some things
+	var future Future
+	for i := 0; i < 10; i++ {
+		future = leader.Apply([]byte(fmt.Sprintf("test %d", i)), 0)
+	}
+	if err := future.Error(); err != nil {
+		c.FailNowf("[ERR] Error applying new log entries: %v", err)
+	}
+	// Now we should be able to ask for a snapshot without getting an error
+	ssErr = leader.Snapshot().Error()
+	if ssErr != nil {
+		t.Errorf("Request for Snapshot failed: %v", ssErr)
+	}
+}
+
+func TestRaft_SendSnapshotFollower(t *testing.T) {
+	// Make the cluster
+	conf := inmemConfig(t)
+	conf.TrailingLogs = 10
+	c := MakeCluster(3, t, conf)
+	defer c.Close()
+
+	// Disconnect one follower
+	followers := c.Followers()
+	leader := c.Leader()
+	behind := followers[0]
+	c.Disconnect(behind.localAddr)
+
+	// Commit a lot of things
+	var future Future
+	for i := 0; i < 100; i++ {
+		future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0)
+	}
+
+	// Wait for the last future to apply
+	if err := future.Error(); err != nil {
+		c.FailNowf("[ERR] err: %v", err)
+	} else {
+		t.Logf("[INFO] Finished apply without behind follower")
+	}
+
+	// Snapshot, this will truncate logs!
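+	// Taking snapshots truncates the logs (TrailingLogs = 10), so once the behind
+	// follower reconnects the leader can no longer catch it up via AppendEntries
+	// alone and has to fall back to sending a snapshot.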
+ for _, r := range c.rafts { + future = r.Snapshot() + // the disconnected node will have nothing to snapshot, so that's expected + if err := future.Error(); err != nil && err != ErrNothingNewToSnapshot { + c.FailNowf("[ERR] err: %v", err) + } + } + + // Reconnect the behind node + c.FullyConnect() + + // Ensure all the logs are the same + c.EnsureSame(t) +} + +func TestRaft_ReJoinFollower(t *testing.T) { + // Enable operation after a remove + conf := inmemConfig(t) + conf.ShutdownOnRemove = false + + // Make a cluster + c := MakeCluster(3, t, conf) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have 2 followers + limit := time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 2 { + c.FailNowf("[ERR] expected two followers: %v", followers) + } + + // Remove a follower + follower := followers[0] + future := leader.RemovePeer(follower.localAddr) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Other nodes should have fewer peers + if peers, _ := leader.peerStore.Peers(); len(peers) != 2 { + c.FailNowf("[ERR] too many peers: %v", peers) + } + if peers, _ := followers[1].peerStore.Peers(); len(peers) != 2 { + c.FailNowf("[ERR] too many peers: %v", peers) + } + + // Get the leader + time.Sleep(c.propagateTimeout) + leader = c.Leader() + + // Rejoin. The follower will have a higher term than the leader, + // this will cause the leader to step down, and a new round of elections + // to take place. We should eventually re-stabilize. + future = leader.AddPeer(follower.localAddr) + if err := future.Error(); err != nil && err != ErrLeadershipLost { + c.FailNowf("[ERR] err: %v", err) + } + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Other nodes should have fewer peers + if peers, _ := leader.peerStore.Peers(); len(peers) != 3 { + c.FailNowf("[ERR] missing peers: %v", peers) + } + if peers, _ := followers[1].peerStore.Peers(); len(peers) != 3 { + c.FailNowf("[ERR] missing peers: %v", peers) + } + + // Should be a follower now + if follower.State() != Follower { + c.FailNowf("[ERR] bad state: %v", follower.State()) + } +} + +func TestRaft_LeaderLeaseExpire(t *testing.T) { + // Make a cluster + conf := inmemConfig(t) + c := MakeCluster(2, t, conf) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have a followers + limit := time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 1 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 1 { + c.FailNowf("[ERR] expected a followers: %v", followers) + } + + // Disconnect the follower now + follower := followers[0] + t.Logf("[INFO] Disconnecting %v", follower) + c.Disconnect(follower.localAddr) + + // Watch the leaderCh + select { + case v := <-leader.LeaderCh(): + if v { + c.FailNowf("[ERR] should step down as leader") + } + case <-time.After(conf.LeaderLeaseTimeout * 2): + c.FailNowf("[ERR] timeout stepping down as leader") + } + + // Ensure the last contact of the leader is non-zero + if leader.LastContact().IsZero() { + c.FailNowf("[ERR] expected non-zero contact time") + } + + // Should be no leaders + if len(c.GetInState(Leader)) != 0 { + c.FailNowf("[ERR] expected step down") + } + + // Verify no further contact + 
last := follower.LastContact() + time.Sleep(c.propagateTimeout) + + // Check that last contact has not changed + if last != follower.LastContact() { + c.FailNowf("[ERR] unexpected further contact") + } + + // Ensure both have cleared their leader + if l := leader.Leader(); l != "" { + c.FailNowf("[ERR] bad: %v", l) + } + if l := follower.Leader(); l != "" { + c.FailNowf("[ERR] bad: %v", l) + } +} + +func TestRaft_Barrier(t *testing.T) { + // Make the cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Commit a lot of things + for i := 0; i < 100; i++ { + leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0) + } + + // Wait for a barrier complete + barrier := leader.Barrier(0) + + // Wait for the barrier future to apply + if err := barrier.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Ensure all the logs are the same + c.EnsureSame(t) + if len(c.fsms[0].logs) != 100 { + c.FailNowf("[ERR] Bad log length") + } +} + +func TestRaft_VerifyLeader(t *testing.T) { + // Make the cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Verify we are leader + verify := leader.VerifyLeader() + + // Wait for the verify to apply + if err := verify.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } +} + +func TestRaft_VerifyLeader_Single(t *testing.T) { + // Make the cluster + c := MakeCluster(1, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Verify we are leader + verify := leader.VerifyLeader() + + // Wait for the verify to apply + if err := verify.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } +} + +func TestRaft_VerifyLeader_Fail(t *testing.T) { + // Make a cluster + conf := inmemConfig(t) + c := MakeCluster(2, t, conf) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have a followers + followers := c.Followers() + + // Force follower to different term + follower := followers[0] + follower.setCurrentTerm(follower.getCurrentTerm() + 1) + + // Verify we are leader + verify := leader.VerifyLeader() + + // Wait for the leader to step down + if err := verify.Error(); err != ErrNotLeader && err != ErrLeadershipLost { + c.FailNowf("[ERR] err: %v", err) + } + + // Ensure the known leader is cleared + if l := leader.Leader(); l != "" { + c.FailNowf("[ERR] bad: %v", l) + } +} + +func TestRaft_VerifyLeader_ParitalConnect(t *testing.T) { + // Make a cluster + conf := inmemConfig(t) + c := MakeCluster(3, t, conf) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have a followers + limit := time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 2 { + c.FailNowf("[ERR] expected two followers but got: %v", followers) + } + + // Force partial disconnect + follower := followers[0] + t.Logf("[INFO] Disconnecting %v", follower) + c.Disconnect(follower.localAddr) + + // Verify we are leader + verify := leader.VerifyLeader() + + // Wait for the leader to step down + if err := verify.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } +} + +func TestRaft_SettingPeers(t *testing.T) { + // Make the cluster + c := MakeClusterNoPeers(3, t, nil) + defer c.Close() + + peers := make([]string, 0, len(c.rafts)) + for _, v := range c.rafts { + peers = append(peers, v.localAddr) + } + + for _, v := range c.rafts { + 
future := v.SetPeers(peers) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] error setting peers: %v", err) + } + } + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Should have a new leader + if leader := c.Leader(); leader == nil { + c.FailNowf("[ERR] no leader?") + } +} + +func TestRaft_StartAsLeader(t *testing.T) { + conf := inmemConfig(t) + conf.StartAsLeader = true + c := MakeCluster(1, t, conf) + defer c.Close() + raft := c.rafts[0] + + // Watch leaderCh for change + select { + case v := <-raft.LeaderCh(): + if !v { + c.FailNowf("[ERR] should become leader") + } + case <-time.After(c.conf.HeartbeatTimeout * 4): + // Longer than you think as possibility of multiple elections + c.FailNowf("[ERR] timeout becoming leader") + } + + // Should be leader + if s := raft.State(); s != Leader { + c.FailNowf("[ERR] expected leader: %v", s) + } + + // Should be able to apply + future := raft.Apply([]byte("test"), c.conf.CommitTimeout) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Check the response + if future.Response().(int) != 1 { + c.FailNowf("[ERR] bad response: %v", future.Response()) + } + + // Check the index + if idx := future.Index(); idx == 0 { + c.FailNowf("[ERR] bad index: %d", idx) + } + + // Check that it is applied to the FSM + if len(c.fsms[0].logs) != 1 { + c.FailNowf("[ERR] did not apply to FSM!") + } +} + +func TestRaft_NotifyCh(t *testing.T) { + ch := make(chan bool, 1) + conf := inmemConfig(t) + conf.NotifyCh = ch + c := MakeCluster(1, t, conf) + defer c.Close() + + // Watch leaderCh for change + select { + case v := <-ch: + if !v { + c.FailNowf("[ERR] should become leader") + } + case <-time.After(conf.HeartbeatTimeout * 8): + c.FailNowf("[ERR] timeout becoming leader") + } + + // Close the cluster + c.Close() + + // Watch leaderCh for change + select { + case v := <-ch: + if v { + c.FailNowf("[ERR] should step down as leader") + } + case <-time.After(conf.HeartbeatTimeout * 6): + c.FailNowf("[ERR] timeout on step down as leader") + } +} + +func TestRaft_Voting(t *testing.T) { + c := MakeCluster(3, t, nil) + defer c.Close() + followers := c.Followers() + ldr := c.Leader() + ldrT := c.trans[c.IndexOf(ldr)] + + reqVote := RequestVoteRequest{ + Term: 42, + Candidate: ldrT.EncodePeer(ldr.localAddr), + LastLogIndex: ldr.LastIndex(), + LastLogTerm: 1, + } + // a follower that thinks there's a leader should vote for that leader. 
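+	// (Term 42 is above the cluster's current term, so the requests below are not
+	// rejected as stale; the grant/deny decisions reflect only whether the
+	// follower already knows a leader.)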
+ var resp RequestVoteResponse + if err := ldrT.RequestVote(followers[0].localAddr, &reqVote, &resp); err != nil { + c.FailNowf("[ERR] RequestVote RPC failed %v", err) + } + if !resp.Granted { + c.FailNowf("[ERR] expected vote to be granted, but wasn't %+v", resp) + } + // a follow that thinks there's a leader shouldn't vote for a different candidate + reqVote.Candidate = ldrT.EncodePeer(followers[0].localAddr) + if err := ldrT.RequestVote(followers[1].localAddr, &reqVote, &resp); err != nil { + c.FailNowf("[ERR] RequestVote RPC failed %v", err) + } + if resp.Granted { + c.FailNowf("[ERR] expected vote not to be granted, but was %+v", resp) + } +} diff --git a/vendor/github.com/hashicorp/raft/replication.go b/vendor/github.com/hashicorp/raft/replication.go new file mode 100644 index 00000000..1f8b923c --- /dev/null +++ b/vendor/github.com/hashicorp/raft/replication.go @@ -0,0 +1,522 @@ +package raft + +import ( + "errors" + "fmt" + "sync" + "time" + + "github.com/armon/go-metrics" +) + +const ( + maxFailureScale = 12 + failureWait = 10 * time.Millisecond +) + +var ( + // ErrLogNotFound indicates a given log entry is not available. + ErrLogNotFound = errors.New("log not found") + + // ErrPipelineReplicationNotSupported can be returned by the transport to + // signal that pipeline replication is not supported in general, and that + // no error message should be produced. + ErrPipelineReplicationNotSupported = errors.New("pipeline replication not supported") +) + +type followerReplication struct { + peer string + inflight *inflight + + stopCh chan uint64 + triggerCh chan struct{} + + currentTerm uint64 + matchIndex uint64 + nextIndex uint64 + + lastContact time.Time + lastContactLock sync.RWMutex + + failures uint64 + + notifyCh chan struct{} + notify []*verifyFuture + notifyLock sync.Mutex + + // stepDown is used to indicate to the leader that we + // should step down based on information from a follower. + stepDown chan struct{} + + // allowPipeline is used to control it seems like + // pipeline replication should be enabled. + allowPipeline bool +} + +// notifyAll is used to notify all the waiting verify futures +// if the follower believes we are still the leader. +func (s *followerReplication) notifyAll(leader bool) { + // Clear the waiting notifies minimizing lock time + s.notifyLock.Lock() + n := s.notify + s.notify = nil + s.notifyLock.Unlock() + + // Submit our votes + for _, v := range n { + v.vote(leader) + } +} + +// LastContact returns the time of last contact. +func (s *followerReplication) LastContact() time.Time { + s.lastContactLock.RLock() + last := s.lastContact + s.lastContactLock.RUnlock() + return last +} + +// setLastContact sets the last contact to the current time. +func (s *followerReplication) setLastContact() { + s.lastContactLock.Lock() + s.lastContact = time.Now() + s.lastContactLock.Unlock() +} + +// replicate is a long running routine that is used to manage +// the process of replicating logs to our followers. 
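+// It begins in a plain RPC mode, switches to pipeline mode once an AppendEntries
+// round succeeds (allowPipeline), and drops back to RPC mode if pipelining fails
+// or the transport does not support it.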
+func (r *Raft) replicate(s *followerReplication) { + // Start an async heartbeating routing + stopHeartbeat := make(chan struct{}) + defer close(stopHeartbeat) + r.goFunc(func() { r.heartbeat(s, stopHeartbeat) }) + +RPC: + shouldStop := false + for !shouldStop { + select { + case maxIndex := <-s.stopCh: + // Make a best effort to replicate up to this index + if maxIndex > 0 { + r.replicateTo(s, maxIndex) + } + return + case <-s.triggerCh: + lastLogIdx, _ := r.getLastLog() + shouldStop = r.replicateTo(s, lastLogIdx) + case <-randomTimeout(r.conf.CommitTimeout): + lastLogIdx, _ := r.getLastLog() + shouldStop = r.replicateTo(s, lastLogIdx) + } + + // If things looks healthy, switch to pipeline mode + if !shouldStop && s.allowPipeline { + goto PIPELINE + } + } + return + +PIPELINE: + // Disable until re-enabled + s.allowPipeline = false + + // Replicates using a pipeline for high performance. This method + // is not able to gracefully recover from errors, and so we fall back + // to standard mode on failure. + if err := r.pipelineReplicate(s); err != nil { + if err != ErrPipelineReplicationNotSupported { + r.logger.Printf("[ERR] raft: Failed to start pipeline replication to %s: %s", s.peer, err) + } + } + goto RPC +} + +// replicateTo is used to replicate the logs up to a given last index. +// If the follower log is behind, we take care to bring them up to date. +func (r *Raft) replicateTo(s *followerReplication, lastIndex uint64) (shouldStop bool) { + // Create the base request + var req AppendEntriesRequest + var resp AppendEntriesResponse + var start time.Time +START: + // Prevent an excessive retry rate on errors + if s.failures > 0 { + select { + case <-time.After(backoff(failureWait, s.failures, maxFailureScale)): + case <-r.shutdownCh: + } + } + + // Setup the request + if err := r.setupAppendEntries(s, &req, s.nextIndex, lastIndex); err == ErrLogNotFound { + goto SEND_SNAP + } else if err != nil { + return + } + + // Make the RPC call + start = time.Now() + if err := r.trans.AppendEntries(s.peer, &req, &resp); err != nil { + r.logger.Printf("[ERR] raft: Failed to AppendEntries to %v: %v", s.peer, err) + s.failures++ + return + } + appendStats(s.peer, start, float32(len(req.Entries))) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return true + } + + // Update the last contact + s.setLastContact() + + // Update s based on success + if resp.Success { + // Update our replication state + updateLastAppended(s, &req) + + // Clear any failures, allow pipelining + s.failures = 0 + s.allowPipeline = true + } else { + s.nextIndex = max(min(s.nextIndex-1, resp.LastLog+1), 1) + s.matchIndex = s.nextIndex - 1 + if resp.NoRetryBackoff { + s.failures = 0 + } else { + s.failures++ + } + r.logger.Printf("[WARN] raft: AppendEntries to %v rejected, sending older logs (next: %d)", s.peer, s.nextIndex) + } + +CHECK_MORE: + // Check if there are more logs to replicate + if s.nextIndex <= lastIndex { + goto START + } + return + + // SEND_SNAP is used when we fail to get a log, usually because the follower + // is too far behind, and we must ship a snapshot down instead +SEND_SNAP: + if stop, err := r.sendLatestSnapshot(s); stop { + return true + } else if err != nil { + r.logger.Printf("[ERR] raft: Failed to send snapshot to %v: %v", s.peer, err) + return + } + + // Check if there is more to replicate + goto CHECK_MORE +} + +// sendLatestSnapshot is used to send the latest snapshot we have +// down to our follower. 
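+// It opens the most recent snapshot in the store, streams it with an
+// InstallSnapshot RPC, and on success advances matchIndex/nextIndex to the
+// snapshot's index.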
+func (r *Raft) sendLatestSnapshot(s *followerReplication) (bool, error) { + // Get the snapshots + snapshots, err := r.snapshots.List() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err) + return false, err + } + + // Check we have at least a single snapshot + if len(snapshots) == 0 { + return false, fmt.Errorf("no snapshots found") + } + + // Open the most recent snapshot + snapID := snapshots[0].ID + meta, snapshot, err := r.snapshots.Open(snapID) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapID, err) + return false, err + } + defer snapshot.Close() + + // Setup the request + req := InstallSnapshotRequest{ + Term: s.currentTerm, + Leader: r.trans.EncodePeer(r.localAddr), + LastLogIndex: meta.Index, + LastLogTerm: meta.Term, + Peers: meta.Peers, + Size: meta.Size, + } + + // Make the call + start := time.Now() + var resp InstallSnapshotResponse + if err := r.trans.InstallSnapshot(s.peer, &req, &resp, snapshot); err != nil { + r.logger.Printf("[ERR] raft: Failed to install snapshot %v: %v", snapID, err) + s.failures++ + return false, err + } + metrics.MeasureSince([]string{"raft", "replication", "installSnapshot", s.peer}, start) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return true, nil + } + + // Update the last contact + s.setLastContact() + + // Check for success + if resp.Success { + // Mark any inflight logs as committed + s.inflight.CommitRange(s.matchIndex+1, meta.Index) + + // Update the indexes + s.matchIndex = meta.Index + s.nextIndex = s.matchIndex + 1 + + // Clear any failures + s.failures = 0 + + // Notify we are still leader + s.notifyAll(true) + } else { + s.failures++ + r.logger.Printf("[WARN] raft: InstallSnapshot to %v rejected", s.peer) + } + return false, nil +} + +// heartbeat is used to periodically invoke AppendEntries on a peer +// to ensure they don't time out. This is done async of replicate(), +// since that routine could potentially be blocked on disk IO. +func (r *Raft) heartbeat(s *followerReplication, stopCh chan struct{}) { + var failures uint64 + req := AppendEntriesRequest{ + Term: s.currentTerm, + Leader: r.trans.EncodePeer(r.localAddr), + } + var resp AppendEntriesResponse + for { + // Wait for the next heartbeat interval or forced notify + select { + case <-s.notifyCh: + case <-randomTimeout(r.conf.HeartbeatTimeout / 10): + case <-stopCh: + return + } + + start := time.Now() + if err := r.trans.AppendEntries(s.peer, &req, &resp); err != nil { + r.logger.Printf("[ERR] raft: Failed to heartbeat to %v: %v", s.peer, err) + failures++ + select { + case <-time.After(backoff(failureWait, failures, maxFailureScale)): + case <-stopCh: + } + } else { + s.setLastContact() + failures = 0 + metrics.MeasureSince([]string{"raft", "replication", "heartbeat", s.peer}, start) + s.notifyAll(resp.Success) + } + } +} + +// pipelineReplicate is used when we have synchronized our state with the follower, +// and want to switch to a higher performance pipeline mode of replication. +// We only pipeline AppendEntries commands, and if we ever hit an error, we fall +// back to the standard replication which can handle more complex situations. 
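+// Sending and response handling are decoupled: this routine keeps issuing
+// AppendEntries over the pipeline while pipelineDecode consumes the responses in
+// a separate goroutine, so several requests can be in flight at once.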
+func (r *Raft) pipelineReplicate(s *followerReplication) error { + // Create a new pipeline + pipeline, err := r.trans.AppendEntriesPipeline(s.peer) + if err != nil { + return err + } + defer pipeline.Close() + + // Log start and stop of pipeline + r.logger.Printf("[INFO] raft: pipelining replication to peer %v", s.peer) + defer r.logger.Printf("[INFO] raft: aborting pipeline replication to peer %v", s.peer) + + // Create a shutdown and finish channel + stopCh := make(chan struct{}) + finishCh := make(chan struct{}) + + // Start a dedicated decoder + r.goFunc(func() { r.pipelineDecode(s, pipeline, stopCh, finishCh) }) + + // Start pipeline sends at the last good nextIndex + nextIndex := s.nextIndex + + shouldStop := false +SEND: + for !shouldStop { + select { + case <-finishCh: + break SEND + case maxIndex := <-s.stopCh: + if maxIndex > 0 { + r.pipelineSend(s, pipeline, &nextIndex, maxIndex) + } + break SEND + case <-s.triggerCh: + lastLogIdx, _ := r.getLastLog() + shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx) + case <-randomTimeout(r.conf.CommitTimeout): + lastLogIdx, _ := r.getLastLog() + shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx) + } + } + + // Stop our decoder, and wait for it to finish + close(stopCh) + select { + case <-finishCh: + case <-r.shutdownCh: + } + return nil +} + +// pipelineSend is used to send data over a pipeline. +func (r *Raft) pipelineSend(s *followerReplication, p AppendPipeline, nextIdx *uint64, lastIndex uint64) (shouldStop bool) { + // Create a new append request + req := new(AppendEntriesRequest) + if err := r.setupAppendEntries(s, req, *nextIdx, lastIndex); err != nil { + return true + } + + // Pipeline the append entries + if _, err := p.AppendEntries(req, new(AppendEntriesResponse)); err != nil { + r.logger.Printf("[ERR] raft: Failed to pipeline AppendEntries to %v: %v", s.peer, err) + return true + } + + // Increase the next send log to avoid re-sending old logs + if n := len(req.Entries); n > 0 { + last := req.Entries[n-1] + *nextIdx = last.Index + 1 + } + return false +} + +// pipelineDecode is used to decode the responses of pipelined requests. +func (r *Raft) pipelineDecode(s *followerReplication, p AppendPipeline, stopCh, finishCh chan struct{}) { + defer close(finishCh) + respCh := p.Consumer() + for { + select { + case ready := <-respCh: + req, resp := ready.Request(), ready.Response() + appendStats(s.peer, ready.Start(), float32(len(req.Entries))) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return + } + + // Update the last contact + s.setLastContact() + + // Abort pipeline if not successful + if !resp.Success { + return + } + + // Update our replication state + updateLastAppended(s, req) + case <-stopCh: + return + } + } +} + +// setupAppendEntries is used to setup an append entries request. +func (r *Raft) setupAppendEntries(s *followerReplication, req *AppendEntriesRequest, nextIndex, lastIndex uint64) error { + req.Term = s.currentTerm + req.Leader = r.trans.EncodePeer(r.localAddr) + req.LeaderCommitIndex = r.getCommitIndex() + if err := r.setPreviousLog(req, nextIndex); err != nil { + return err + } + if err := r.setNewLogs(req, nextIndex, lastIndex); err != nil { + return err + } + return nil +} + +// setPreviousLog is used to setup the PrevLogEntry and PrevLogTerm for an +// AppendEntriesRequest given the next index to replicate. 
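+// If the preceding entry has been compacted into the latest snapshot, the
+// snapshot's index and term are used instead; if it cannot be found at all
+// (ErrLogNotFound), the caller falls back to sending a snapshot.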
+func (r *Raft) setPreviousLog(req *AppendEntriesRequest, nextIndex uint64) error { + // Guard for the first index, since there is no 0 log entry + // Guard against the previous index being a snapshot as well + lastSnapIdx, lastSnapTerm := r.getLastSnapshot() + if nextIndex == 1 { + req.PrevLogEntry = 0 + req.PrevLogTerm = 0 + + } else if (nextIndex - 1) == lastSnapIdx { + req.PrevLogEntry = lastSnapIdx + req.PrevLogTerm = lastSnapTerm + + } else { + var l Log + if err := r.logs.GetLog(nextIndex-1, &l); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", + nextIndex-1, err) + return err + } + + // Set the previous index and term (0 if nextIndex is 1) + req.PrevLogEntry = l.Index + req.PrevLogTerm = l.Term + } + return nil +} + +// setNewLogs is used to setup the logs which should be appended for a request. +func (r *Raft) setNewLogs(req *AppendEntriesRequest, nextIndex, lastIndex uint64) error { + // Append up to MaxAppendEntries or up to the lastIndex + req.Entries = make([]*Log, 0, r.conf.MaxAppendEntries) + maxIndex := min(nextIndex+uint64(r.conf.MaxAppendEntries)-1, lastIndex) + for i := nextIndex; i <= maxIndex; i++ { + oldLog := new(Log) + if err := r.logs.GetLog(i, oldLog); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", i, err) + return err + } + req.Entries = append(req.Entries, oldLog) + } + return nil +} + +// appendStats is used to emit stats about an AppendEntries invocation. +func appendStats(peer string, start time.Time, logs float32) { + metrics.MeasureSince([]string{"raft", "replication", "appendEntries", "rpc", peer}, start) + metrics.IncrCounter([]string{"raft", "replication", "appendEntries", "logs", peer}, logs) +} + +// handleStaleTerm is used when a follower indicates that we have a stale term. +func (r *Raft) handleStaleTerm(s *followerReplication) { + r.logger.Printf("[ERR] raft: peer %v has newer term, stopping replication", s.peer) + s.notifyAll(false) // No longer leader + asyncNotifyCh(s.stepDown) +} + +// updateLastAppended is used to update follower replication state after a successful +// AppendEntries RPC. +func updateLastAppended(s *followerReplication, req *AppendEntriesRequest) { + // Mark any inflight logs as committed + if logs := req.Entries; len(logs) > 0 { + first := logs[0] + last := logs[len(logs)-1] + s.inflight.CommitRange(first.Index, last.Index) + + // Update the indexes + s.matchIndex = last.Index + s.nextIndex = last.Index + 1 + } + + // Notify still leader + s.notifyAll(true) +} diff --git a/vendor/github.com/hashicorp/raft/snapshot.go b/vendor/github.com/hashicorp/raft/snapshot.go new file mode 100644 index 00000000..a4a17f1c --- /dev/null +++ b/vendor/github.com/hashicorp/raft/snapshot.go @@ -0,0 +1,40 @@ +package raft + +import ( + "io" +) + +// SnapshotMeta is for metadata of a snapshot. +type SnapshotMeta struct { + ID string // ID is opaque to the store, and is used for opening + Index uint64 + Term uint64 + Peers []byte + Size int64 +} + +// SnapshotStore interface is used to allow for flexible implementations +// of snapshot storage and retrieval. For example, a client could implement +// a shared state store such as S3, allowing new nodes to restore snapshots +// without streaming from the leader. +type SnapshotStore interface { + // Create is used to begin a snapshot at a given index and term, + // with the current peer set already encoded. + Create(index, term uint64, peers []byte) (SnapshotSink, error) + + // List is used to list the available snapshots in the store. 
+ // It should return then in descending order, with the highest index first. + List() ([]*SnapshotMeta, error) + + // Open takes a snapshot ID and provides a ReadCloser. Once close is + // called it is assumed the snapshot is no longer needed. + Open(id string) (*SnapshotMeta, io.ReadCloser, error) +} + +// SnapshotSink is returned by StartSnapshot. The FSM will Write state +// to the sink and call Close on completion. On error, Cancel will be invoked. +type SnapshotSink interface { + io.WriteCloser + ID() string + Cancel() error +} diff --git a/vendor/github.com/hashicorp/raft/stable.go b/vendor/github.com/hashicorp/raft/stable.go new file mode 100644 index 00000000..ff59a8c5 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/stable.go @@ -0,0 +1,15 @@ +package raft + +// StableStore is used to provide stable storage +// of key configurations to ensure safety. +type StableStore interface { + Set(key []byte, val []byte) error + + // Get returns the value for key, or an empty byte slice if key was not found. + Get(key []byte) ([]byte, error) + + SetUint64(key []byte, val uint64) error + + // GetUint64 returns the uint64 value for key, or 0 if key was not found. + GetUint64(key []byte) (uint64, error) +} diff --git a/vendor/github.com/hashicorp/raft/state.go b/vendor/github.com/hashicorp/raft/state.go new file mode 100644 index 00000000..a58cd0d1 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/state.go @@ -0,0 +1,171 @@ +package raft + +import ( + "sync" + "sync/atomic" +) + +// RaftState captures the state of a Raft node: Follower, Candidate, Leader, +// or Shutdown. +type RaftState uint32 + +const ( + // Follower is the initial state of a Raft node. + Follower RaftState = iota + + // Candidate is one of the valid states of a Raft node. + Candidate + + // Leader is one of the valid states of a Raft node. + Leader + + // Shutdown is the terminal state of a Raft node. + Shutdown +) + +func (s RaftState) String() string { + switch s { + case Follower: + return "Follower" + case Candidate: + return "Candidate" + case Leader: + return "Leader" + case Shutdown: + return "Shutdown" + default: + return "Unknown" + } +} + +// raftState is used to maintain various state variables +// and provides an interface to set/get the variables in a +// thread safe manner. +type raftState struct { + // currentTerm commitIndex, lastApplied, must be kept at the top of + // the struct so they're 64 bit aligned which is a requirement for + // atomic ops on 32 bit platforms. 
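+	// (sync/atomic requires 64-bit alignment for 64-bit values on 32-bit
+	// platforms such as 386 and ARM, and only the first word of an allocated
+	// struct is guaranteed that alignment, hence these fields are declared first.)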
+ + // The current term, cache of StableStore + currentTerm uint64 + + // Highest committed log entry + commitIndex uint64 + + // Last applied log to the FSM + lastApplied uint64 + + // protects 4 next fields + lastLock sync.Mutex + + // Cache the latest snapshot index/term + lastSnapshotIndex uint64 + lastSnapshotTerm uint64 + + // Cache the latest log from LogStore + lastLogIndex uint64 + lastLogTerm uint64 + + // Tracks running goroutines + routinesGroup sync.WaitGroup + + // The current state + state RaftState +} + +func (r *raftState) getState() RaftState { + stateAddr := (*uint32)(&r.state) + return RaftState(atomic.LoadUint32(stateAddr)) +} + +func (r *raftState) setState(s RaftState) { + stateAddr := (*uint32)(&r.state) + atomic.StoreUint32(stateAddr, uint32(s)) +} + +func (r *raftState) getCurrentTerm() uint64 { + return atomic.LoadUint64(&r.currentTerm) +} + +func (r *raftState) setCurrentTerm(term uint64) { + atomic.StoreUint64(&r.currentTerm, term) +} + +func (r *raftState) getLastLog() (index, term uint64) { + r.lastLock.Lock() + index = r.lastLogIndex + term = r.lastLogTerm + r.lastLock.Unlock() + return +} + +func (r *raftState) setLastLog(index, term uint64) { + r.lastLock.Lock() + r.lastLogIndex = index + r.lastLogTerm = term + r.lastLock.Unlock() +} + +func (r *raftState) getLastSnapshot() (index, term uint64) { + r.lastLock.Lock() + index = r.lastSnapshotIndex + term = r.lastSnapshotTerm + r.lastLock.Unlock() + return +} + +func (r *raftState) setLastSnapshot(index, term uint64) { + r.lastLock.Lock() + r.lastSnapshotIndex = index + r.lastSnapshotTerm = term + r.lastLock.Unlock() +} + +func (r *raftState) getCommitIndex() uint64 { + return atomic.LoadUint64(&r.commitIndex) +} + +func (r *raftState) setCommitIndex(index uint64) { + atomic.StoreUint64(&r.commitIndex, index) +} + +func (r *raftState) getLastApplied() uint64 { + return atomic.LoadUint64(&r.lastApplied) +} + +func (r *raftState) setLastApplied(index uint64) { + atomic.StoreUint64(&r.lastApplied, index) +} + +// Start a goroutine and properly handle the race between a routine +// starting and incrementing, and exiting and decrementing. +func (r *raftState) goFunc(f func()) { + r.routinesGroup.Add(1) + go func() { + defer r.routinesGroup.Done() + f() + }() +} + +func (r *raftState) waitShutdown() { + r.routinesGroup.Wait() +} + +// getLastIndex returns the last index in stable storage. +// Either from the last log or from the last snapshot. +func (r *raftState) getLastIndex() uint64 { + r.lastLock.Lock() + defer r.lastLock.Unlock() + return max(r.lastLogIndex, r.lastSnapshotIndex) +} + +// getLastEntry returns the last index and term in stable storage. +// Either from the last log or from the last snapshot. +func (r *raftState) getLastEntry() (uint64, uint64) { + r.lastLock.Lock() + defer r.lastLock.Unlock() + if r.lastLogIndex >= r.lastSnapshotIndex { + return r.lastLogIndex, r.lastLogTerm + } + return r.lastSnapshotIndex, r.lastSnapshotTerm +} diff --git a/vendor/github.com/hashicorp/raft/tag.sh b/vendor/github.com/hashicorp/raft/tag.sh new file mode 100755 index 00000000..cd16623a --- /dev/null +++ b/vendor/github.com/hashicorp/raft/tag.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -e + +# The version must be supplied from the environment. Do not include the +# leading "v". +if [ -z $VERSION ]; then + echo "Please specify a version." + exit 1 +fi + +# Generate the tag. +echo "==> Tagging version $VERSION..." 
+git commit --allow-empty -a --gpg-sign=348FFC4C -m "Release v$VERSION" +git tag -a -m "Version $VERSION" -s -u 348FFC4C "v${VERSION}" master + +exit 0 diff --git a/vendor/github.com/hashicorp/raft/tcp_transport.go b/vendor/github.com/hashicorp/raft/tcp_transport.go new file mode 100644 index 00000000..50c6d15d --- /dev/null +++ b/vendor/github.com/hashicorp/raft/tcp_transport.go @@ -0,0 +1,105 @@ +package raft + +import ( + "errors" + "io" + "log" + "net" + "time" +) + +var ( + errNotAdvertisable = errors.New("local bind address is not advertisable") + errNotTCP = errors.New("local address is not a TCP address") +) + +// TCPStreamLayer implements StreamLayer interface for plain TCP. +type TCPStreamLayer struct { + advertise net.Addr + listener *net.TCPListener +} + +// NewTCPTransport returns a NetworkTransport that is built on top of +// a TCP streaming transport layer. +func NewTCPTransport( + bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + logOutput io.Writer, +) (*NetworkTransport, error) { + return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport { + return NewNetworkTransport(stream, maxPool, timeout, logOutput) + }) +} + +// NewTCPTransportWithLogger returns a NetworkTransport that is built on top of +// a TCP streaming transport layer, with log output going to the supplied Logger +func NewTCPTransportWithLogger( + bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + logger *log.Logger, +) (*NetworkTransport, error) { + return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport { + return NewNetworkTransportWithLogger(stream, maxPool, timeout, logger) + }) +} + +func newTCPTransport(bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + transportCreator func(stream StreamLayer) *NetworkTransport) (*NetworkTransport, error) { + // Try to bind + list, err := net.Listen("tcp", bindAddr) + if err != nil { + return nil, err + } + + // Create stream + stream := &TCPStreamLayer{ + advertise: advertise, + listener: list.(*net.TCPListener), + } + + // Verify that we have a usable advertise address + addr, ok := stream.Addr().(*net.TCPAddr) + if !ok { + list.Close() + return nil, errNotTCP + } + if addr.IP.IsUnspecified() { + list.Close() + return nil, errNotAdvertisable + } + + // Create the network transport + trans := transportCreator(stream) + return trans, nil +} + +// Dial implements the StreamLayer interface. +func (t *TCPStreamLayer) Dial(address string, timeout time.Duration) (net.Conn, error) { + return net.DialTimeout("tcp", address, timeout) +} + +// Accept implements the net.Listener interface. +func (t *TCPStreamLayer) Accept() (c net.Conn, err error) { + return t.listener.Accept() +} + +// Close implements the net.Listener interface. +func (t *TCPStreamLayer) Close() (err error) { + return t.listener.Close() +} + +// Addr implements the net.Listener interface. 
+func (t *TCPStreamLayer) Addr() net.Addr { + // Use an advertise addr if provided + if t.advertise != nil { + return t.advertise + } + return t.listener.Addr() +} diff --git a/vendor/github.com/hashicorp/raft/tcp_transport_test.go b/vendor/github.com/hashicorp/raft/tcp_transport_test.go new file mode 100644 index 00000000..6020a546 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/tcp_transport_test.go @@ -0,0 +1,24 @@ +package raft + +import ( + "net" + "testing" +) + +func TestTCPTransport_BadAddr(t *testing.T) { + _, err := NewTCPTransportWithLogger("0.0.0.0:0", nil, 1, 0, newTestLogger(t)) + if err != errNotAdvertisable { + t.Fatalf("err: %v", err) + } +} + +func TestTCPTransport_WithAdvertise(t *testing.T) { + addr := &net.TCPAddr{IP: []byte{127, 0, 0, 1}, Port: 12345} + trans, err := NewTCPTransportWithLogger("0.0.0.0:0", addr, 1, 0, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + if trans.LocalAddr() != "127.0.0.1:12345" { + t.Fatalf("bad: %v", trans.LocalAddr()) + } +} diff --git a/vendor/github.com/hashicorp/raft/transport.go b/vendor/github.com/hashicorp/raft/transport.go new file mode 100644 index 00000000..2b8b422f --- /dev/null +++ b/vendor/github.com/hashicorp/raft/transport.go @@ -0,0 +1,124 @@ +package raft + +import ( + "io" + "time" +) + +// RPCResponse captures both a response and a potential error. +type RPCResponse struct { + Response interface{} + Error error +} + +// RPC has a command, and provides a response mechanism. +type RPC struct { + Command interface{} + Reader io.Reader // Set only for InstallSnapshot + RespChan chan<- RPCResponse +} + +// Respond is used to respond with a response, error or both +func (r *RPC) Respond(resp interface{}, err error) { + r.RespChan <- RPCResponse{resp, err} +} + +// Transport provides an interface for network transports +// to allow Raft to communicate with other nodes. +type Transport interface { + // Consumer returns a channel that can be used to + // consume and respond to RPC requests. + Consumer() <-chan RPC + + // LocalAddr is used to return our local address to distinguish from our peers. + LocalAddr() string + + // AppendEntriesPipeline returns an interface that can be used to pipeline + // AppendEntries requests. + AppendEntriesPipeline(target string) (AppendPipeline, error) + + // AppendEntries sends the appropriate RPC to the target node. + AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error + + // RequestVote sends the appropriate RPC to the target node. + RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error + + // InstallSnapshot is used to push a snapshot down to a follower. The data is read from + // the ReadCloser and streamed to the client. + InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error + + // EncodePeer is used to serialize a peer name. + EncodePeer(string) []byte + + // DecodePeer is used to deserialize a peer name. + DecodePeer([]byte) string + + // SetHeartbeatHandler is used to setup a heartbeat handler + // as a fast-pass. This is to avoid head-of-line blocking from + // disk IO. If a Transport does not support this, it can simply + // ignore the call, and push the heartbeat onto the Consumer channel. + SetHeartbeatHandler(cb func(rpc RPC)) +} + +// WithClose is an interface that a transport may provide which +// allows a transport to be shut down cleanly when a Raft instance +// shuts down. 
+// +// It is defined separately from Transport as unfortunately it wasn't in the +// original interface specification. +type WithClose interface { + // Close permanently closes a transport, stopping + // any associated goroutines and freeing other resources. + Close() error +} + +// LoopbackTransport is an interface that provides a loopback transport suitable for testing +// e.g. InmemTransport. It's there so we don't have to rewrite tests. +type LoopbackTransport interface { + Transport // Embedded transport reference + WithPeers // Embedded peer management + WithClose // with a close routine +} + +// WithPeers is an interface that a transport may provide which allows for connection and +// disconnection. Unless the transport is a loopback transport, the transport specified to +// "Connect" is likely to be nil. +type WithPeers interface { + Connect(peer string, t Transport) // Connect a peer + Disconnect(peer string) // Disconnect a given peer + DisconnectAll() // Disconnect all peers, possibly to reconnect them later +} + +// AppendPipeline is used for pipelining AppendEntries requests. It is used +// to increase the replication throughput by masking latency and better +// utilizing bandwidth. +type AppendPipeline interface { + // AppendEntries is used to add another request to the pipeline. + // The send may block which is an effective form of back-pressure. + AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) + + // Consumer returns a channel that can be used to consume + // response futures when they are ready. + Consumer() <-chan AppendFuture + + // Close closes the pipeline and cancels all inflight RPCs + Close() error +} + +// AppendFuture is used to return information about a pipelined AppendEntries request. +type AppendFuture interface { + Future + + // Start returns the time that the append request was started. + // It is always OK to call this method. + Start() time.Time + + // Request holds the parameters of the AppendEntries call. + // It is always OK to call this method. + Request() *AppendEntriesRequest + + // Response holds the results of the AppendEntries call. + // This method must only be called after the Error + // method returns, and will only be valid on success. 
+ Response() *AppendEntriesResponse +} diff --git a/vendor/github.com/hashicorp/raft/transport_test.go b/vendor/github.com/hashicorp/raft/transport_test.go new file mode 100644 index 00000000..e3cbd525 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/transport_test.go @@ -0,0 +1,313 @@ +package raft + +import ( + "bytes" + "reflect" + "testing" + "time" +) + +const ( + TT_Inmem = iota + + // NOTE: must be last + numTestTransports +) + +func NewTestTransport(ttype int, addr string) (string, LoopbackTransport) { + switch ttype { + case TT_Inmem: + addr, lt := NewInmemTransport(addr) + return addr, lt + default: + panic("Unknown transport type") + } +} + +func TestTransport_StartStop(t *testing.T) { + for ttype := 0; ttype < numTestTransports; ttype++ { + _, trans := NewTestTransport(ttype, "") + if err := trans.Close(); err != nil { + t.Fatalf("err: %v", err) + } + } +} + +func TestTransport_AppendEntries(t *testing.T) { + for ttype := 0; ttype < numTestTransports; ttype++ { + addr1, trans1 := NewTestTransport(ttype, "") + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := AppendEntriesRequest{ + Term: 10, + Leader: []byte("cartman"), + PrevLogEntry: 100, + PrevLogTerm: 4, + Entries: []*Log{ + &Log{ + Index: 101, + Term: 4, + Type: LogNoop, + }, + }, + LeaderCommitIndex: 90, + } + resp := AppendEntriesResponse{ + Term: 4, + LastLog: 90, + Success: true, + } + + // Listen for a request + go func() { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*AppendEntriesRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + }() + + // Transport 2 makes outbound request + addr2, trans2 := NewTestTransport(ttype, "") + defer trans2.Close() + + trans1.Connect(addr2, trans2) + trans2.Connect(addr1, trans1) + + var out AppendEntriesResponse + if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } + } +} + +func TestTransport_AppendEntriesPipeline(t *testing.T) { + for ttype := 0; ttype < numTestTransports; ttype++ { + addr1, trans1 := NewTestTransport(ttype, "") + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := AppendEntriesRequest{ + Term: 10, + Leader: []byte("cartman"), + PrevLogEntry: 100, + PrevLogTerm: 4, + Entries: []*Log{ + &Log{ + Index: 101, + Term: 4, + Type: LogNoop, + }, + }, + LeaderCommitIndex: 90, + } + resp := AppendEntriesResponse{ + Term: 4, + LastLog: 90, + Success: true, + } + + // Listen for a request + go func() { + for i := 0; i < 10; i++ { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*AppendEntriesRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + } + }() + + // Transport 2 makes outbound request + addr2, trans2 := NewTestTransport(ttype, "") + defer trans2.Close() + + trans1.Connect(addr2, trans2) + trans2.Connect(addr1, trans1) + + pipeline, err := trans2.AppendEntriesPipeline(trans1.LocalAddr()) + if err != nil { + t.Fatalf("err: %v", err) + } + defer pipeline.Close() + for i := 0; i < 10; i++ { + out := new(AppendEntriesResponse) + if _, err := 
pipeline.AppendEntries(&args, out); err != nil { + t.Fatalf("err: %v", err) + } + } + + respCh := pipeline.Consumer() + for i := 0; i < 10; i++ { + select { + case ready := <-respCh: + // Verify the response + if !reflect.DeepEqual(&resp, ready.Response()) { + t.Fatalf("command mismatch: %#v %#v", &resp, ready.Response()) + } + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + } + } +} + +func TestTransport_RequestVote(t *testing.T) { + for ttype := 0; ttype < numTestTransports; ttype++ { + addr1, trans1 := NewTestTransport(ttype, "") + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := RequestVoteRequest{ + Term: 20, + Candidate: []byte("butters"), + LastLogIndex: 100, + LastLogTerm: 19, + } + resp := RequestVoteResponse{ + Term: 100, + Peers: []byte("blah"), + Granted: false, + } + + // Listen for a request + go func() { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*RequestVoteRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + }() + + // Transport 2 makes outbound request + addr2, trans2 := NewTestTransport(ttype, "") + defer trans2.Close() + + trans1.Connect(addr2, trans2) + trans2.Connect(addr1, trans1) + + var out RequestVoteResponse + if err := trans2.RequestVote(trans1.LocalAddr(), &args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } + } +} + +func TestTransport_InstallSnapshot(t *testing.T) { + for ttype := 0; ttype < numTestTransports; ttype++ { + addr1, trans1 := NewTestTransport(ttype, "") + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := InstallSnapshotRequest{ + Term: 10, + Leader: []byte("kyle"), + LastLogIndex: 100, + LastLogTerm: 9, + Peers: []byte("blah blah"), + Size: 10, + } + resp := InstallSnapshotResponse{ + Term: 10, + Success: true, + } + + // Listen for a request + go func() { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*InstallSnapshotRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + + // Try to read the bytes + buf := make([]byte, 10) + rpc.Reader.Read(buf) + + // Compare + if bytes.Compare(buf, []byte("0123456789")) != 0 { + t.Fatalf("bad buf %v", buf) + } + + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + }() + + // Transport 2 makes outbound request + addr2, trans2 := NewTestTransport(ttype, "") + defer trans2.Close() + + trans1.Connect(addr2, trans2) + trans2.Connect(addr1, trans1) + + // Create a buffer + buf := bytes.NewBuffer([]byte("0123456789")) + + var out InstallSnapshotResponse + if err := trans2.InstallSnapshot(trans1.LocalAddr(), &args, &out, buf); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } + } +} + +func TestTransport_EncodeDecode(t *testing.T) { + for ttype := 0; ttype < numTestTransports; ttype++ { + _, trans1 := NewTestTransport(ttype, "") + defer trans1.Close() + + local := trans1.LocalAddr() + enc := trans1.EncodePeer(local) + dec := trans1.DecodePeer(enc) + + if dec != local { + t.Fatalf("enc/dec fail: %v %v", dec, local) + } + } +} diff --git 
a/vendor/github.com/hashicorp/raft/util.go b/vendor/github.com/hashicorp/raft/util.go new file mode 100644 index 00000000..944968a2 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/util.go @@ -0,0 +1,179 @@ +package raft + +import ( + "bytes" + crand "crypto/rand" + "fmt" + "math" + "math/big" + "math/rand" + "time" + + "github.com/hashicorp/go-msgpack/codec" +) + +func init() { + // Ensure we use a high-entropy seed for the psuedo-random generator + rand.Seed(newSeed()) +} + +// returns an int64 from a crypto random source +// can be used to seed a source for a math/rand. +func newSeed() int64 { + r, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) + if err != nil { + panic(fmt.Errorf("failed to read random bytes: %v", err)) + } + return r.Int64() +} + +// randomTimeout returns a value that is between the minVal and 2x minVal. +func randomTimeout(minVal time.Duration) <-chan time.Time { + if minVal == 0 { + return nil + } + extra := (time.Duration(rand.Int63()) % minVal) + return time.After(minVal + extra) +} + +// min returns the minimum. +func min(a, b uint64) uint64 { + if a <= b { + return a + } + return b +} + +// max returns the maximum. +func max(a, b uint64) uint64 { + if a >= b { + return a + } + return b +} + +// generateUUID is used to generate a random UUID. +func generateUUID() string { + buf := make([]byte, 16) + if _, err := crand.Read(buf); err != nil { + panic(fmt.Errorf("failed to read random bytes: %v", err)) + } + + return fmt.Sprintf("%08x-%04x-%04x-%04x-%12x", + buf[0:4], + buf[4:6], + buf[6:8], + buf[8:10], + buf[10:16]) +} + +// asyncNotifyCh is used to do an async channel send +// to a single channel without blocking. +func asyncNotifyCh(ch chan struct{}) { + select { + case ch <- struct{}{}: + default: + } +} + +// asyncNotifyBool is used to do an async notification +// on a bool channel. +func asyncNotifyBool(ch chan bool, v bool) { + select { + case ch <- v: + default: + } +} + +// ExcludePeer is used to exclude a single peer from a list of peers. +func ExcludePeer(peers []string, peer string) []string { + otherPeers := make([]string, 0, len(peers)) + for _, p := range peers { + if p != peer { + otherPeers = append(otherPeers, p) + } + } + return otherPeers +} + +// PeerContained checks if a given peer is contained in a list. +func PeerContained(peers []string, peer string) bool { + for _, p := range peers { + if p == peer { + return true + } + } + return false +} + +// AddUniquePeer is used to add a peer to a list of existing +// peers only if it is not already contained. +func AddUniquePeer(peers []string, peer string) []string { + if PeerContained(peers, peer) { + return peers + } + return append(peers, peer) +} + +// encodePeers is used to serialize a list of peers. +func encodePeers(peers []string, trans Transport) []byte { + // Encode each peer + var encPeers [][]byte + for _, p := range peers { + encPeers = append(encPeers, trans.EncodePeer(p)) + } + + // Encode the entire array + buf, err := encodeMsgPack(encPeers) + if err != nil { + panic(fmt.Errorf("failed to encode peers: %v", err)) + } + + return buf.Bytes() +} + +// decodePeers is used to deserialize a list of peers. 
+func decodePeers(buf []byte, trans Transport) []string { + // Decode the buffer first + var encPeers [][]byte + if err := decodeMsgPack(buf, &encPeers); err != nil { + panic(fmt.Errorf("failed to decode peers: %v", err)) + } + + // Deserialize each peer + var peers []string + for _, enc := range encPeers { + peers = append(peers, trans.DecodePeer(enc)) + } + + return peers +} + +// Decode reverses the encode operation on a byte slice input. +func decodeMsgPack(buf []byte, out interface{}) error { + r := bytes.NewBuffer(buf) + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(r, &hd) + return dec.Decode(out) +} + +// Encode writes an encoded object to a new bytes buffer. +func encodeMsgPack(in interface{}) (*bytes.Buffer, error) { + buf := bytes.NewBuffer(nil) + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(buf, &hd) + err := enc.Encode(in) + return buf, err +} + +// backoff is used to compute an exponential backoff +// duration. Base time is scaled by the current round, +// up to some maximum scale factor. +func backoff(base time.Duration, round, limit uint64) time.Duration { + power := min(round, limit) + for power > 2 { + base *= 2 + power-- + } + return base +} diff --git a/vendor/github.com/hashicorp/raft/util_test.go b/vendor/github.com/hashicorp/raft/util_test.go new file mode 100644 index 00000000..88b93211 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/util_test.go @@ -0,0 +1,152 @@ +package raft + +import ( + "reflect" + "regexp" + "testing" + "time" +) + +func TestRandomTimeout(t *testing.T) { + start := time.Now() + timeout := randomTimeout(time.Millisecond) + + select { + case <-timeout: + diff := time.Now().Sub(start) + if diff < time.Millisecond { + t.Fatalf("fired early") + } + case <-time.After(3 * time.Millisecond): + t.Fatalf("timeout") + } +} + +func TestNewSeed(t *testing.T) { + vals := make(map[int64]bool) + for i := 0; i < 1000; i++ { + seed := newSeed() + if _, exists := vals[seed]; exists { + t.Fatal("newSeed() return a value it'd previously returned") + } + vals[seed] = true + } +} + +func TestRandomTimeout_NoTime(t *testing.T) { + timeout := randomTimeout(0) + if timeout != nil { + t.Fatalf("expected nil channel") + } +} + +func TestMin(t *testing.T) { + if min(1, 1) != 1 { + t.Fatalf("bad min") + } + if min(2, 1) != 1 { + t.Fatalf("bad min") + } + if min(1, 2) != 1 { + t.Fatalf("bad min") + } +} + +func TestMax(t *testing.T) { + if max(1, 1) != 1 { + t.Fatalf("bad max") + } + if max(2, 1) != 2 { + t.Fatalf("bad max") + } + if max(1, 2) != 2 { + t.Fatalf("bad max") + } +} + +func TestGenerateUUID(t *testing.T) { + prev := generateUUID() + for i := 0; i < 100; i++ { + id := generateUUID() + if prev == id { + t.Fatalf("Should get a new ID!") + } + + matched, err := regexp.MatchString( + `[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}`, id) + if !matched || err != nil { + t.Fatalf("expected match %s %v %s", id, matched, err) + } + } +} + +func TestExcludePeer(t *testing.T) { + peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()} + peer := peers[2] + + after := ExcludePeer(peers, peer) + if len(after) != 2 { + t.Fatalf("Bad length") + } + if after[0] == peer || after[1] == peer { + t.Fatalf("should not contain peer") + } +} + +func TestPeerContained(t *testing.T) { + peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()} + + if !PeerContained(peers, peers[2]) { + t.Fatalf("Expect contained") + } + if PeerContained(peers, NewInmemAddr()) { + t.Fatalf("unexpected contained") + } +} + +func TestAddUniquePeer(t 
*testing.T) { + peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()} + after := AddUniquePeer(peers, peers[2]) + if !reflect.DeepEqual(after, peers) { + t.Fatalf("unexpected append") + } + after = AddUniquePeer(peers, NewInmemAddr()) + if len(after) != 4 { + t.Fatalf("expected append") + } +} + +func TestEncodeDecodePeers(t *testing.T) { + peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()} + _, trans := NewInmemTransport("") + + // Try to encode/decode + buf := encodePeers(peers, trans) + decoded := decodePeers(buf, trans) + + if !reflect.DeepEqual(peers, decoded) { + t.Fatalf("mismatch %v %v", peers, decoded) + } +} + +func TestBackoff(t *testing.T) { + b := backoff(10*time.Millisecond, 1, 8) + if b != 10*time.Millisecond { + t.Fatalf("bad: %v", b) + } + + b = backoff(20*time.Millisecond, 2, 8) + if b != 20*time.Millisecond { + t.Fatalf("bad: %v", b) + } + + b = backoff(10*time.Millisecond, 8, 8) + if b != 640*time.Millisecond { + t.Fatalf("bad: %v", b) + } + + b = backoff(10*time.Millisecond, 9, 8) + if b != 640*time.Millisecond { + t.Fatalf("bad: %v", b) + } +}
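For readers skimming the vendored tests above: the expected values in TestBackoff follow from the way the backoff helper in util.go doubles the base delay once per round after the second round, with the number of doublings capped by limit. Below is a minimal standalone sketch of that behaviour; it is not part of the vendored package and simply mirrors the min and backoff helpers for illustration.

package main

import (
	"fmt"
	"time"
)

// min returns the smaller of two uint64 values (mirrors util.go).
func min(a, b uint64) uint64 {
	if a <= b {
		return a
	}
	return b
}

// backoff mirrors the vendored helper: rounds 1 and 2 return the base
// unchanged, and every further round doubles it, with the number of
// doublings capped by limit.
func backoff(base time.Duration, round, limit uint64) time.Duration {
	power := min(round, limit)
	for power > 2 {
		base *= 2
		power--
	}
	return base
}

func main() {
	// With a 10ms base and limit 8: rounds 1 and 2 stay at 10ms, round 3 is
	// 20ms, round 8 reaches 640ms, and round 9 is capped at 640ms, matching
	// the values asserted in TestBackoff above.
	for _, round := range []uint64{1, 2, 3, 8, 9} {
		fmt.Printf("round %d -> %v\n", round, backoff(10*time.Millisecond, round, 8))
	}
}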