commit
c541298bc2
@ -0,0 +1,10 @@
|
|||||||
|
language: go
|
||||||
|
|
||||||
|
go:
|
||||||
|
- 1.6
|
||||||
|
- 1.7
|
||||||
|
- tip
|
||||||
|
|
||||||
|
install: make deps
|
||||||
|
script:
|
||||||
|
- make test
|
@ -0,0 +1,362 @@
|
|||||||
|
Mozilla Public License, version 2.0
|
||||||
|
|
||||||
|
1. Definitions
|
||||||
|
|
||||||
|
1.1. "Contributor"
|
||||||
|
|
||||||
|
means each individual or legal entity that creates, contributes to the
|
||||||
|
creation of, or owns Covered Software.
|
||||||
|
|
||||||
|
1.2. "Contributor Version"
|
||||||
|
|
||||||
|
means the combination of the Contributions of others (if any) used by a
|
||||||
|
Contributor and that particular Contributor's Contribution.
|
||||||
|
|
||||||
|
1.3. "Contribution"
|
||||||
|
|
||||||
|
means Covered Software of a particular Contributor.
|
||||||
|
|
||||||
|
1.4. "Covered Software"
|
||||||
|
|
||||||
|
means Source Code Form to which the initial Contributor has attached the
|
||||||
|
notice in Exhibit A, the Executable Form of such Source Code Form, and
|
||||||
|
Modifications of such Source Code Form, in each case including portions
|
||||||
|
thereof.
|
||||||
|
|
||||||
|
1.5. "Incompatible With Secondary Licenses"
|
||||||
|
means
|
||||||
|
|
||||||
|
a. that the initial Contributor has attached the notice described in
|
||||||
|
Exhibit B to the Covered Software; or
|
||||||
|
|
||||||
|
b. that the Covered Software was made available under the terms of
|
||||||
|
version 1.1 or earlier of the License, but not also under the terms of
|
||||||
|
a Secondary License.
|
||||||
|
|
||||||
|
1.6. "Executable Form"
|
||||||
|
|
||||||
|
means any form of the work other than Source Code Form.
|
||||||
|
|
||||||
|
1.7. "Larger Work"
|
||||||
|
|
||||||
|
means a work that combines Covered Software with other material, in a
|
||||||
|
separate file or files, that is not Covered Software.
|
||||||
|
|
||||||
|
1.8. "License"
|
||||||
|
|
||||||
|
means this document.
|
||||||
|
|
||||||
|
1.9. "Licensable"
|
||||||
|
|
||||||
|
means having the right to grant, to the maximum extent possible, whether
|
||||||
|
at the time of the initial grant or subsequently, any and all of the
|
||||||
|
rights conveyed by this License.
|
||||||
|
|
||||||
|
1.10. "Modifications"
|
||||||
|
|
||||||
|
means any of the following:
|
||||||
|
|
||||||
|
a. any file in Source Code Form that results from an addition to,
|
||||||
|
deletion from, or modification of the contents of Covered Software; or
|
||||||
|
|
||||||
|
b. any new file in Source Code Form that contains any Covered Software.
|
||||||
|
|
||||||
|
1.11. "Patent Claims" of a Contributor
|
||||||
|
|
||||||
|
means any patent claim(s), including without limitation, method,
|
||||||
|
process, and apparatus claims, in any patent Licensable by such
|
||||||
|
Contributor that would be infringed, but for the grant of the License,
|
||||||
|
by the making, using, selling, offering for sale, having made, import,
|
||||||
|
or transfer of either its Contributions or its Contributor Version.
|
||||||
|
|
||||||
|
1.12. "Secondary License"
|
||||||
|
|
||||||
|
means either the GNU General Public License, Version 2.0, the GNU Lesser
|
||||||
|
General Public License, Version 2.1, the GNU Affero General Public
|
||||||
|
License, Version 3.0, or any later versions of those licenses.
|
||||||
|
|
||||||
|
1.13. "Source Code Form"
|
||||||
|
|
||||||
|
means the form of the work preferred for making modifications.
|
||||||
|
|
||||||
|
1.14. "You" (or "Your")
|
||||||
|
|
||||||
|
means an individual or a legal entity exercising rights under this
|
||||||
|
License. For legal entities, "You" includes any entity that controls, is
|
||||||
|
controlled by, or is under common control with You. For purposes of this
|
||||||
|
definition, "control" means (a) the power, direct or indirect, to cause
|
||||||
|
the direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (b) ownership of more than fifty percent (50%) of the
|
||||||
|
outstanding shares or beneficial ownership of such entity.
|
||||||
|
|
||||||
|
|
||||||
|
2. License Grants and Conditions
|
||||||
|
|
||||||
|
2.1. Grants
|
||||||
|
|
||||||
|
Each Contributor hereby grants You a world-wide, royalty-free,
|
||||||
|
non-exclusive license:
|
||||||
|
|
||||||
|
a. under intellectual property rights (other than patent or trademark)
|
||||||
|
Licensable by such Contributor to use, reproduce, make available,
|
||||||
|
modify, display, perform, distribute, and otherwise exploit its
|
||||||
|
Contributions, either on an unmodified basis, with Modifications, or
|
||||||
|
as part of a Larger Work; and
|
||||||
|
|
||||||
|
b. under Patent Claims of such Contributor to make, use, sell, offer for
|
||||||
|
sale, have made, import, and otherwise transfer either its
|
||||||
|
Contributions or its Contributor Version.
|
||||||
|
|
||||||
|
2.2. Effective Date
|
||||||
|
|
||||||
|
The licenses granted in Section 2.1 with respect to any Contribution
|
||||||
|
become effective for each Contribution on the date the Contributor first
|
||||||
|
distributes such Contribution.
|
||||||
|
|
||||||
|
2.3. Limitations on Grant Scope
|
||||||
|
|
||||||
|
The licenses granted in this Section 2 are the only rights granted under
|
||||||
|
this License. No additional rights or licenses will be implied from the
|
||||||
|
distribution or licensing of Covered Software under this License.
|
||||||
|
Notwithstanding Section 2.1(b) above, no patent license is granted by a
|
||||||
|
Contributor:
|
||||||
|
|
||||||
|
a. for any code that a Contributor has removed from Covered Software; or
|
||||||
|
|
||||||
|
b. for infringements caused by: (i) Your and any other third party's
|
||||||
|
modifications of Covered Software, or (ii) the combination of its
|
||||||
|
Contributions with other software (except as part of its Contributor
|
||||||
|
Version); or
|
||||||
|
|
||||||
|
c. under Patent Claims infringed by Covered Software in the absence of
|
||||||
|
its Contributions.
|
||||||
|
|
||||||
|
This License does not grant any rights in the trademarks, service marks,
|
||||||
|
or logos of any Contributor (except as may be necessary to comply with
|
||||||
|
the notice requirements in Section 3.4).
|
||||||
|
|
||||||
|
2.4. Subsequent Licenses
|
||||||
|
|
||||||
|
No Contributor makes additional grants as a result of Your choice to
|
||||||
|
distribute the Covered Software under a subsequent version of this
|
||||||
|
License (see Section 10.2) or under the terms of a Secondary License (if
|
||||||
|
permitted under the terms of Section 3.3).
|
||||||
|
|
||||||
|
2.5. Representation
|
||||||
|
|
||||||
|
Each Contributor represents that the Contributor believes its
|
||||||
|
Contributions are its original creation(s) or it has sufficient rights to
|
||||||
|
grant the rights to its Contributions conveyed by this License.
|
||||||
|
|
||||||
|
2.6. Fair Use
|
||||||
|
|
||||||
|
This License is not intended to limit any rights You have under
|
||||||
|
applicable copyright doctrines of fair use, fair dealing, or other
|
||||||
|
equivalents.
|
||||||
|
|
||||||
|
2.7. Conditions
|
||||||
|
|
||||||
|
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in
|
||||||
|
Section 2.1.
|
||||||
|
|
||||||
|
|
||||||
|
3. Responsibilities
|
||||||
|
|
||||||
|
3.1. Distribution of Source Form
|
||||||
|
|
||||||
|
All distribution of Covered Software in Source Code Form, including any
|
||||||
|
Modifications that You create or to which You contribute, must be under
|
||||||
|
the terms of this License. You must inform recipients that the Source
|
||||||
|
Code Form of the Covered Software is governed by the terms of this
|
||||||
|
License, and how they can obtain a copy of this License. You may not
|
||||||
|
attempt to alter or restrict the recipients' rights in the Source Code
|
||||||
|
Form.
|
||||||
|
|
||||||
|
3.2. Distribution of Executable Form
|
||||||
|
|
||||||
|
If You distribute Covered Software in Executable Form then:
|
||||||
|
|
||||||
|
a. such Covered Software must also be made available in Source Code Form,
|
||||||
|
as described in Section 3.1, and You must inform recipients of the
|
||||||
|
Executable Form how they can obtain a copy of such Source Code Form by
|
||||||
|
reasonable means in a timely manner, at a charge no more than the cost
|
||||||
|
of distribution to the recipient; and
|
||||||
|
|
||||||
|
b. You may distribute such Executable Form under the terms of this
|
||||||
|
License, or sublicense it under different terms, provided that the
|
||||||
|
license for the Executable Form does not attempt to limit or alter the
|
||||||
|
recipients' rights in the Source Code Form under this License.
|
||||||
|
|
||||||
|
3.3. Distribution of a Larger Work
|
||||||
|
|
||||||
|
You may create and distribute a Larger Work under terms of Your choice,
|
||||||
|
provided that You also comply with the requirements of this License for
|
||||||
|
the Covered Software. If the Larger Work is a combination of Covered
|
||||||
|
Software with a work governed by one or more Secondary Licenses, and the
|
||||||
|
Covered Software is not Incompatible With Secondary Licenses, this
|
||||||
|
License permits You to additionally distribute such Covered Software
|
||||||
|
under the terms of such Secondary License(s), so that the recipient of
|
||||||
|
the Larger Work may, at their option, further distribute the Covered
|
||||||
|
Software under the terms of either this License or such Secondary
|
||||||
|
License(s).
|
||||||
|
|
||||||
|
3.4. Notices
|
||||||
|
|
||||||
|
You may not remove or alter the substance of any license notices
|
||||||
|
(including copyright notices, patent notices, disclaimers of warranty, or
|
||||||
|
limitations of liability) contained within the Source Code Form of the
|
||||||
|
Covered Software, except that You may alter any license notices to the
|
||||||
|
extent required to remedy known factual inaccuracies.
|
||||||
|
|
||||||
|
3.5. Application of Additional Terms
|
||||||
|
|
||||||
|
You may choose to offer, and to charge a fee for, warranty, support,
|
||||||
|
indemnity or liability obligations to one or more recipients of Covered
|
||||||
|
Software. However, You may do so only on Your own behalf, and not on
|
||||||
|
behalf of any Contributor. You must make it absolutely clear that any
|
||||||
|
such warranty, support, indemnity, or liability obligation is offered by
|
||||||
|
You alone, and You hereby agree to indemnify every Contributor for any
|
||||||
|
liability incurred by such Contributor as a result of warranty, support,
|
||||||
|
indemnity or liability terms You offer. You may include additional
|
||||||
|
disclaimers of warranty and limitations of liability specific to any
|
||||||
|
jurisdiction.
|
||||||
|
|
||||||
|
4. Inability to Comply Due to Statute or Regulation
|
||||||
|
|
||||||
|
If it is impossible for You to comply with any of the terms of this License
|
||||||
|
with respect to some or all of the Covered Software due to statute,
|
||||||
|
judicial order, or regulation then You must: (a) comply with the terms of
|
||||||
|
this License to the maximum extent possible; and (b) describe the
|
||||||
|
limitations and the code they affect. Such description must be placed in a
|
||||||
|
text file included with all distributions of the Covered Software under
|
||||||
|
this License. Except to the extent prohibited by statute or regulation,
|
||||||
|
such description must be sufficiently detailed for a recipient of ordinary
|
||||||
|
skill to be able to understand it.
|
||||||
|
|
||||||
|
5. Termination
|
||||||
|
|
||||||
|
5.1. The rights granted under this License will terminate automatically if You
|
||||||
|
fail to comply with any of its terms. However, if You become compliant,
|
||||||
|
then the rights granted under this License from a particular Contributor
|
||||||
|
are reinstated (a) provisionally, unless and until such Contributor
|
||||||
|
explicitly and finally terminates Your grants, and (b) on an ongoing
|
||||||
|
basis, if such Contributor fails to notify You of the non-compliance by
|
||||||
|
some reasonable means prior to 60 days after You have come back into
|
||||||
|
compliance. Moreover, Your grants from a particular Contributor are
|
||||||
|
reinstated on an ongoing basis if such Contributor notifies You of the
|
||||||
|
non-compliance by some reasonable means, this is the first time You have
|
||||||
|
received notice of non-compliance with this License from such
|
||||||
|
Contributor, and You become compliant prior to 30 days after Your receipt
|
||||||
|
of the notice.
|
||||||
|
|
||||||
|
5.2. If You initiate litigation against any entity by asserting a patent
|
||||||
|
infringement claim (excluding declaratory judgment actions,
|
||||||
|
counter-claims, and cross-claims) alleging that a Contributor Version
|
||||||
|
directly or indirectly infringes any patent, then the rights granted to
|
||||||
|
You by any and all Contributors for the Covered Software under Section
|
||||||
|
2.1 of this License shall terminate.
|
||||||
|
|
||||||
|
5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user
|
||||||
|
license agreements (excluding distributors and resellers) which have been
|
||||||
|
validly granted by You or Your distributors under this License prior to
|
||||||
|
termination shall survive termination.
|
||||||
|
|
||||||
|
6. Disclaimer of Warranty
|
||||||
|
|
||||||
|
Covered Software is provided under this License on an "as is" basis,
|
||||||
|
without warranty of any kind, either expressed, implied, or statutory,
|
||||||
|
including, without limitation, warranties that the Covered Software is free
|
||||||
|
of defects, merchantable, fit for a particular purpose or non-infringing.
|
||||||
|
The entire risk as to the quality and performance of the Covered Software
|
||||||
|
is with You. Should any Covered Software prove defective in any respect,
|
||||||
|
You (not any Contributor) assume the cost of any necessary servicing,
|
||||||
|
repair, or correction. This disclaimer of warranty constitutes an essential
|
||||||
|
part of this License. No use of any Covered Software is authorized under
|
||||||
|
this License except under this disclaimer.
|
||||||
|
|
||||||
|
7. Limitation of Liability
|
||||||
|
|
||||||
|
Under no circumstances and under no legal theory, whether tort (including
|
||||||
|
negligence), contract, or otherwise, shall any Contributor, or anyone who
|
||||||
|
distributes Covered Software as permitted above, be liable to You for any
|
||||||
|
direct, indirect, special, incidental, or consequential damages of any
|
||||||
|
character including, without limitation, damages for lost profits, loss of
|
||||||
|
goodwill, work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses, even if such party shall have been
|
||||||
|
informed of the possibility of such damages. This limitation of liability
|
||||||
|
shall not apply to liability for death or personal injury resulting from
|
||||||
|
such party's negligence to the extent applicable law prohibits such
|
||||||
|
limitation. Some jurisdictions do not allow the exclusion or limitation of
|
||||||
|
incidental or consequential damages, so this exclusion and limitation may
|
||||||
|
not apply to You.
|
||||||
|
|
||||||
|
8. Litigation
|
||||||
|
|
||||||
|
Any litigation relating to this License may be brought only in the courts
|
||||||
|
of a jurisdiction where the defendant maintains its principal place of
|
||||||
|
business and such litigation shall be governed by laws of that
|
||||||
|
jurisdiction, without reference to its conflict-of-law provisions. Nothing
|
||||||
|
in this Section shall prevent a party's ability to bring cross-claims or
|
||||||
|
counter-claims.
|
||||||
|
|
||||||
|
9. Miscellaneous
|
||||||
|
|
||||||
|
This License represents the complete agreement concerning the subject
|
||||||
|
matter hereof. If any provision of this License is held to be
|
||||||
|
unenforceable, such provision shall be reformed only to the extent
|
||||||
|
necessary to make it enforceable. Any law or regulation which provides that
|
||||||
|
the language of a contract shall be construed against the drafter shall not
|
||||||
|
be used to construe this License against a Contributor.
|
||||||
|
|
||||||
|
|
||||||
|
10. Versions of the License
|
||||||
|
|
||||||
|
10.1. New Versions
|
||||||
|
|
||||||
|
Mozilla Foundation is the license steward. Except as provided in Section
|
||||||
|
10.3, no one other than the license steward has the right to modify or
|
||||||
|
publish new versions of this License. Each version will be given a
|
||||||
|
distinguishing version number.
|
||||||
|
|
||||||
|
10.2. Effect of New Versions
|
||||||
|
|
||||||
|
You may distribute the Covered Software under the terms of the version
|
||||||
|
of the License under which You originally received the Covered Software,
|
||||||
|
or under the terms of any subsequent version published by the license
|
||||||
|
steward.
|
||||||
|
|
||||||
|
10.3. Modified Versions
|
||||||
|
|
||||||
|
If you create software not governed by this License, and you want to
|
||||||
|
create a new license for such software, you may create and use a
|
||||||
|
modified version of this License if you rename the license and remove
|
||||||
|
any references to the name of the license steward (except to note that
|
||||||
|
such modified license differs from this License).
|
||||||
|
|
||||||
|
10.4. Distributing Source Code Form that is Incompatible With Secondary
|
||||||
|
Licenses If You choose to distribute Source Code Form that is
|
||||||
|
Incompatible With Secondary Licenses under the terms of this version of
|
||||||
|
the License, the notice described in Exhibit B of this License must be
|
||||||
|
attached.
|
||||||
|
|
||||||
|
Exhibit A - Source Code Form License Notice
|
||||||
|
|
||||||
|
This Source Code Form is subject to the
|
||||||
|
terms of the Mozilla Public License, v.
|
||||||
|
2.0. If a copy of the MPL was not
|
||||||
|
distributed with this file, You can
|
||||||
|
obtain one at
|
||||||
|
http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
If it is not possible or desirable to put the notice in a particular file,
|
||||||
|
then You may include the notice in a location (such as a LICENSE file in a
|
||||||
|
relevant directory) where a recipient would be likely to look for such a
|
||||||
|
notice.
|
||||||
|
|
||||||
|
You may add additional accurate notices of copyright ownership.
|
||||||
|
|
||||||
|
Exhibit B - "Incompatible With Secondary Licenses" Notice
|
||||||
|
|
||||||
|
This Source Code Form is "Incompatible
|
||||||
|
With Secondary Licenses", as defined by
|
||||||
|
the Mozilla Public License, v. 2.0.
|
@ -0,0 +1,11 @@
|
|||||||
|
# Test-only dependencies of every package in the module.
# NOTE: must use $(shell ...) — a bare $(go list ...) is an undefined
# make variable reference and always expands to the empty string.
DEPS = $(shell go list -f '{{range .TestImports}}{{.}} {{end}}' ./...)

.PHONY: test deps

# Run the full test suite with a per-test timeout.
test:
	go test -timeout=30s ./...

# Fetch build deps, then test-only deps discovered via `go list`.
deps:
	go get -d -v ./...
	echo $(DEPS) | xargs -n1 go get -d
|
||||||
|
|
@ -0,0 +1,11 @@
|
|||||||
|
raft-boltdb
|
||||||
|
===========
|
||||||
|
|
||||||
|
This repository provides the `raftboltdb` package. The package exports the
|
||||||
|
`BoltStore` which is an implementation of both a `LogStore` and `StableStore`.
|
||||||
|
|
||||||
|
It is meant to be used as a backend for the `raft` [package
|
||||||
|
here](https://github.com/hashicorp/raft).
|
||||||
|
|
||||||
|
This implementation uses [BoltDB](https://github.com/boltdb/bolt). BoltDB is
|
||||||
|
a simple key/value store implemented in pure Go, and inspired by LMDB.
|
@ -0,0 +1,88 @@
|
|||||||
|
package raftboltdb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/hashicorp/raft/bench"
|
||||||
|
)
|
||||||
|
|
||||||
|
func BenchmarkBoltStore_FirstIndex(b *testing.B) {
|
||||||
|
store := testBoltStore(b)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
raftbench.FirstIndex(b, store)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkBoltStore_LastIndex(b *testing.B) {
|
||||||
|
store := testBoltStore(b)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
raftbench.LastIndex(b, store)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkBoltStore_GetLog(b *testing.B) {
|
||||||
|
store := testBoltStore(b)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
raftbench.GetLog(b, store)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkBoltStore_StoreLog(b *testing.B) {
|
||||||
|
store := testBoltStore(b)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
raftbench.StoreLog(b, store)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkBoltStore_StoreLogs(b *testing.B) {
|
||||||
|
store := testBoltStore(b)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
raftbench.StoreLogs(b, store)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkBoltStore_DeleteRange(b *testing.B) {
|
||||||
|
store := testBoltStore(b)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
raftbench.DeleteRange(b, store)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkBoltStore_Set(b *testing.B) {
|
||||||
|
store := testBoltStore(b)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
raftbench.Set(b, store)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkBoltStore_Get(b *testing.B) {
|
||||||
|
store := testBoltStore(b)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
raftbench.Get(b, store)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkBoltStore_SetUint64(b *testing.B) {
|
||||||
|
store := testBoltStore(b)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
raftbench.SetUint64(b, store)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkBoltStore_GetUint64(b *testing.B) {
|
||||||
|
store := testBoltStore(b)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
raftbench.GetUint64(b, store)
|
||||||
|
}
|
@ -0,0 +1,255 @@
|
|||||||
|
package raftboltdb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
|
||||||
|
"github.com/boltdb/bolt"
|
||||||
|
"github.com/hashicorp/raft"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// Permissions to use on the db file. This is only used if the
	// database file does not exist and needs to be created
	// (owner read/write, no access for group or others).
	dbFileMode = 0600
)
|
||||||
|
|
||||||
|
var (
	// Bucket names we perform transactions in: dbLogs holds the raft
	// log entries keyed by index, dbConf holds stable-store key/values.
	dbLogs = []byte("logs")
	dbConf = []byte("conf")

	// An error indicating a given key does not exist in the conf bucket.
	ErrKeyNotFound = errors.New("not found")
)
|
||||||
|
|
||||||
|
// BoltStore provides access to BoltDB for Raft to store and retrieve
// log entries. It also provides key/value storage, and can be used as
// a LogStore and StableStore.
type BoltStore struct {
	// conn is the underlying handle to the db.
	conn *bolt.DB

	// The path to the Bolt database file
	path string
}
|
||||||
|
|
||||||
|
// Options contains all the configuration used to open the BoltDB
type Options struct {
	// Path is the file path to the BoltDB to use
	Path string

	// BoltOptions contains any specific BoltDB options you might
	// want to specify [e.g. open timeout]
	BoltOptions *bolt.Options
}
|
||||||
|
|
||||||
|
// readOnly returns true if the contained bolt options say to open
|
||||||
|
// the DB in readOnly mode [this can be useful to tools that want
|
||||||
|
// to examine the log]
|
||||||
|
func (o *Options) readOnly() bool {
|
||||||
|
return o != nil && o.BoltOptions != nil && o.BoltOptions.ReadOnly
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewBoltStore takes a file path and returns a connected Raft backend.
|
||||||
|
func NewBoltStore(path string) (*BoltStore, error) {
|
||||||
|
return New(Options{Path: path})
|
||||||
|
}
|
||||||
|
|
||||||
|
// New uses the supplied options to open the BoltDB and prepare it for use as a raft backend.
|
||||||
|
func New(options Options) (*BoltStore, error) {
|
||||||
|
// Try to connect
|
||||||
|
handle, err := bolt.Open(options.Path, dbFileMode, options.BoltOptions)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the new store
|
||||||
|
store := &BoltStore{
|
||||||
|
conn: handle,
|
||||||
|
path: options.Path,
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the store was opened read-only, don't try and create buckets
|
||||||
|
if !options.readOnly() {
|
||||||
|
// Set up our buckets
|
||||||
|
if err := store.initialize(); err != nil {
|
||||||
|
store.Close()
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return store, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize is used to set up all of the buckets.
|
||||||
|
func (b *BoltStore) initialize() error {
|
||||||
|
tx, err := b.conn.Begin(true)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer tx.Rollback()
|
||||||
|
|
||||||
|
// Create all the buckets
|
||||||
|
if _, err := tx.CreateBucketIfNotExists(dbLogs); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if _, err := tx.CreateBucketIfNotExists(dbConf); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return tx.Commit()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close is used to gracefully close the DB connection.
// After Close, the store must not be used again.
func (b *BoltStore) Close() error {
	return b.conn.Close()
}
|
||||||
|
|
||||||
|
// FirstIndex returns the first known index from the Raft log.
|
||||||
|
func (b *BoltStore) FirstIndex() (uint64, error) {
|
||||||
|
tx, err := b.conn.Begin(false)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
defer tx.Rollback()
|
||||||
|
|
||||||
|
curs := tx.Bucket(dbLogs).Cursor()
|
||||||
|
if first, _ := curs.First(); first == nil {
|
||||||
|
return 0, nil
|
||||||
|
} else {
|
||||||
|
return bytesToUint64(first), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// LastIndex returns the last known index from the Raft log.
|
||||||
|
func (b *BoltStore) LastIndex() (uint64, error) {
|
||||||
|
tx, err := b.conn.Begin(false)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
defer tx.Rollback()
|
||||||
|
|
||||||
|
curs := tx.Bucket(dbLogs).Cursor()
|
||||||
|
if last, _ := curs.Last(); last == nil {
|
||||||
|
return 0, nil
|
||||||
|
} else {
|
||||||
|
return bytesToUint64(last), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetLog is used to retrieve a log from BoltDB at a given index.
|
||||||
|
func (b *BoltStore) GetLog(idx uint64, log *raft.Log) error {
|
||||||
|
tx, err := b.conn.Begin(false)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer tx.Rollback()
|
||||||
|
|
||||||
|
bucket := tx.Bucket(dbLogs)
|
||||||
|
val := bucket.Get(uint64ToBytes(idx))
|
||||||
|
|
||||||
|
if val == nil {
|
||||||
|
return raft.ErrLogNotFound
|
||||||
|
}
|
||||||
|
return decodeMsgPack(val, log)
|
||||||
|
}
|
||||||
|
|
||||||
|
// StoreLog is used to store a single raft log
|
||||||
|
func (b *BoltStore) StoreLog(log *raft.Log) error {
|
||||||
|
return b.StoreLogs([]*raft.Log{log})
|
||||||
|
}
|
||||||
|
|
||||||
|
// StoreLogs is used to store a set of raft logs
|
||||||
|
func (b *BoltStore) StoreLogs(logs []*raft.Log) error {
|
||||||
|
tx, err := b.conn.Begin(true)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer tx.Rollback()
|
||||||
|
|
||||||
|
for _, log := range logs {
|
||||||
|
key := uint64ToBytes(log.Index)
|
||||||
|
val, err := encodeMsgPack(log)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
bucket := tx.Bucket(dbLogs)
|
||||||
|
if err := bucket.Put(key, val.Bytes()); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return tx.Commit()
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteRange is used to delete logs within a given range inclusively.
|
||||||
|
func (b *BoltStore) DeleteRange(min, max uint64) error {
|
||||||
|
minKey := uint64ToBytes(min)
|
||||||
|
|
||||||
|
tx, err := b.conn.Begin(true)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer tx.Rollback()
|
||||||
|
|
||||||
|
curs := tx.Bucket(dbLogs).Cursor()
|
||||||
|
for k, _ := curs.Seek(minKey); k != nil; k, _ = curs.Next() {
|
||||||
|
// Handle out-of-range log index
|
||||||
|
if bytesToUint64(k) > max {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete in-range log index
|
||||||
|
if err := curs.Delete(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return tx.Commit()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set is used to set a key/value set outside of the raft log
|
||||||
|
func (b *BoltStore) Set(k, v []byte) error {
|
||||||
|
tx, err := b.conn.Begin(true)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer tx.Rollback()
|
||||||
|
|
||||||
|
bucket := tx.Bucket(dbConf)
|
||||||
|
if err := bucket.Put(k, v); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return tx.Commit()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get is used to retrieve a value from the k/v store by key
|
||||||
|
func (b *BoltStore) Get(k []byte) ([]byte, error) {
|
||||||
|
tx, err := b.conn.Begin(false)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer tx.Rollback()
|
||||||
|
|
||||||
|
bucket := tx.Bucket(dbConf)
|
||||||
|
val := bucket.Get(k)
|
||||||
|
|
||||||
|
if val == nil {
|
||||||
|
return nil, ErrKeyNotFound
|
||||||
|
}
|
||||||
|
return append([]byte(nil), val...), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetUint64 is like Set, but handles uint64 values
|
||||||
|
func (b *BoltStore) SetUint64(key []byte, val uint64) error {
|
||||||
|
return b.Set(key, uint64ToBytes(val))
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetUint64 is like Get, but handles uint64 values
|
||||||
|
func (b *BoltStore) GetUint64(key []byte) (uint64, error) {
|
||||||
|
val, err := b.Get(key)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
return bytesToUint64(val), nil
|
||||||
|
}
|
@ -0,0 +1,416 @@
|
|||||||
|
package raftboltdb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"io/ioutil"
|
||||||
|
"os"
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/boltdb/bolt"
|
||||||
|
"github.com/hashicorp/raft"
|
||||||
|
)
|
||||||
|
|
||||||
|
func testBoltStore(t testing.TB) *BoltStore {
|
||||||
|
fh, err := ioutil.TempFile("", "bolt")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
os.Remove(fh.Name())
|
||||||
|
|
||||||
|
// Successfully creates and returns a store
|
||||||
|
store, err := NewBoltStore(fh.Name())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return store
|
||||||
|
}
|
||||||
|
|
||||||
|
func testRaftLog(idx uint64, data string) *raft.Log {
|
||||||
|
return &raft.Log{
|
||||||
|
Data: []byte(data),
|
||||||
|
Index: idx,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBoltStore_Implements(t *testing.T) {
|
||||||
|
var store interface{} = &BoltStore{}
|
||||||
|
if _, ok := store.(raft.StableStore); !ok {
|
||||||
|
t.Fatalf("BoltStore does not implement raft.StableStore")
|
||||||
|
}
|
||||||
|
if _, ok := store.(raft.LogStore); !ok {
|
||||||
|
t.Fatalf("BoltStore does not implement raft.LogStore")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBoltOptionsTimeout(t *testing.T) {
|
||||||
|
fh, err := ioutil.TempFile("", "bolt")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
os.Remove(fh.Name())
|
||||||
|
defer os.Remove(fh.Name())
|
||||||
|
options := Options{
|
||||||
|
Path: fh.Name(),
|
||||||
|
BoltOptions: &bolt.Options{
|
||||||
|
Timeout: time.Second / 10,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
store, err := New(options)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
defer store.Close()
|
||||||
|
// trying to open it again should timeout
|
||||||
|
doneCh := make(chan error, 1)
|
||||||
|
go func() {
|
||||||
|
_, err := New(options)
|
||||||
|
doneCh <- err
|
||||||
|
}()
|
||||||
|
select {
|
||||||
|
case err := <-doneCh:
|
||||||
|
if err == nil || err.Error() != "timeout" {
|
||||||
|
t.Errorf("Expected timeout error but got %v", err)
|
||||||
|
}
|
||||||
|
case <-time.After(5 * time.Second):
|
||||||
|
t.Errorf("Gave up waiting for timeout response")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBoltOptionsReadOnly(t *testing.T) {
|
||||||
|
fh, err := ioutil.TempFile("", "bolt")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
defer os.Remove(fh.Name())
|
||||||
|
store, err := NewBoltStore(fh.Name())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
// Create the log
|
||||||
|
log := &raft.Log{
|
||||||
|
Data: []byte("log1"),
|
||||||
|
Index: 1,
|
||||||
|
}
|
||||||
|
// Attempt to store the log
|
||||||
|
if err := store.StoreLog(log); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
store.Close()
|
||||||
|
options := Options{
|
||||||
|
Path: fh.Name(),
|
||||||
|
BoltOptions: &bolt.Options{
|
||||||
|
Timeout: time.Second / 10,
|
||||||
|
ReadOnly: true,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
roStore, err := New(options)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
defer roStore.Close()
|
||||||
|
result := new(raft.Log)
|
||||||
|
if err := roStore.GetLog(1, result); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the log comes back the same
|
||||||
|
if !reflect.DeepEqual(log, result) {
|
||||||
|
t.Errorf("bad: %v", result)
|
||||||
|
}
|
||||||
|
// Attempt to store the log, should fail on a read-only store
|
||||||
|
err = roStore.StoreLog(log)
|
||||||
|
if err != bolt.ErrDatabaseReadOnly {
|
||||||
|
t.Errorf("expecting error %v, but got %v", bolt.ErrDatabaseReadOnly, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewBoltStore(t *testing.T) {
|
||||||
|
fh, err := ioutil.TempFile("", "bolt")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
os.Remove(fh.Name())
|
||||||
|
defer os.Remove(fh.Name())
|
||||||
|
|
||||||
|
// Successfully creates and returns a store
|
||||||
|
store, err := NewBoltStore(fh.Name())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the file was created
|
||||||
|
if store.path != fh.Name() {
|
||||||
|
t.Fatalf("unexpected file path %q", store.path)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(fh.Name()); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close the store so we can open again
|
||||||
|
if err := store.Close(); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure our tables were created
|
||||||
|
db, err := bolt.Open(fh.Name(), dbFileMode, nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
tx, err := db.Begin(true)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
if _, err := tx.CreateBucket([]byte(dbLogs)); err != bolt.ErrBucketExists {
|
||||||
|
t.Fatalf("bad: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := tx.CreateBucket([]byte(dbConf)); err != bolt.ErrBucketExists {
|
||||||
|
t.Fatalf("bad: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBoltStore_FirstIndex(t *testing.T) {
|
||||||
|
store := testBoltStore(t)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
// Should get 0 index on empty log
|
||||||
|
idx, err := store.FirstIndex()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
if idx != 0 {
|
||||||
|
t.Fatalf("bad: %v", idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set a mock raft log
|
||||||
|
logs := []*raft.Log{
|
||||||
|
testRaftLog(1, "log1"),
|
||||||
|
testRaftLog(2, "log2"),
|
||||||
|
testRaftLog(3, "log3"),
|
||||||
|
}
|
||||||
|
if err := store.StoreLogs(logs); err != nil {
|
||||||
|
t.Fatalf("bad: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fetch the first Raft index
|
||||||
|
idx, err = store.FirstIndex()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
if idx != 1 {
|
||||||
|
t.Fatalf("bad: %d", idx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBoltStore_LastIndex(t *testing.T) {
|
||||||
|
store := testBoltStore(t)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
// Should get 0 index on empty log
|
||||||
|
idx, err := store.LastIndex()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
if idx != 0 {
|
||||||
|
t.Fatalf("bad: %v", idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set a mock raft log
|
||||||
|
logs := []*raft.Log{
|
||||||
|
testRaftLog(1, "log1"),
|
||||||
|
testRaftLog(2, "log2"),
|
||||||
|
testRaftLog(3, "log3"),
|
||||||
|
}
|
||||||
|
if err := store.StoreLogs(logs); err != nil {
|
||||||
|
t.Fatalf("bad: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fetch the last Raft index
|
||||||
|
idx, err = store.LastIndex()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
if idx != 3 {
|
||||||
|
t.Fatalf("bad: %d", idx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBoltStore_GetLog(t *testing.T) {
|
||||||
|
store := testBoltStore(t)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
log := new(raft.Log)
|
||||||
|
|
||||||
|
// Should return an error on non-existent log
|
||||||
|
if err := store.GetLog(1, log); err != raft.ErrLogNotFound {
|
||||||
|
t.Fatalf("expected raft log not found error, got: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set a mock raft log
|
||||||
|
logs := []*raft.Log{
|
||||||
|
testRaftLog(1, "log1"),
|
||||||
|
testRaftLog(2, "log2"),
|
||||||
|
testRaftLog(3, "log3"),
|
||||||
|
}
|
||||||
|
if err := store.StoreLogs(logs); err != nil {
|
||||||
|
t.Fatalf("bad: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Should return the proper log
|
||||||
|
if err := store.GetLog(2, log); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(log, logs[1]) {
|
||||||
|
t.Fatalf("bad: %#v", log)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBoltStore_SetLog(t *testing.T) {
|
||||||
|
store := testBoltStore(t)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
// Create the log
|
||||||
|
log := &raft.Log{
|
||||||
|
Data: []byte("log1"),
|
||||||
|
Index: 1,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Attempt to store the log
|
||||||
|
if err := store.StoreLog(log); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Retrieve the log again
|
||||||
|
result := new(raft.Log)
|
||||||
|
if err := store.GetLog(1, result); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the log comes back the same
|
||||||
|
if !reflect.DeepEqual(log, result) {
|
||||||
|
t.Fatalf("bad: %v", result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBoltStore_SetLogs(t *testing.T) {
|
||||||
|
store := testBoltStore(t)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
// Create a set of logs
|
||||||
|
logs := []*raft.Log{
|
||||||
|
testRaftLog(1, "log1"),
|
||||||
|
testRaftLog(2, "log2"),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Attempt to store the logs
|
||||||
|
if err := store.StoreLogs(logs); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure we stored them all
|
||||||
|
result1, result2 := new(raft.Log), new(raft.Log)
|
||||||
|
if err := store.GetLog(1, result1); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(logs[0], result1) {
|
||||||
|
t.Fatalf("bad: %#v", result1)
|
||||||
|
}
|
||||||
|
if err := store.GetLog(2, result2); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(logs[1], result2) {
|
||||||
|
t.Fatalf("bad: %#v", result2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBoltStore_DeleteRange(t *testing.T) {
|
||||||
|
store := testBoltStore(t)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
// Create a set of logs
|
||||||
|
log1 := testRaftLog(1, "log1")
|
||||||
|
log2 := testRaftLog(2, "log2")
|
||||||
|
log3 := testRaftLog(3, "log3")
|
||||||
|
logs := []*raft.Log{log1, log2, log3}
|
||||||
|
|
||||||
|
// Attempt to store the logs
|
||||||
|
if err := store.StoreLogs(logs); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Attempt to delete a range of logs
|
||||||
|
if err := store.DeleteRange(1, 2); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the logs were deleted
|
||||||
|
if err := store.GetLog(1, new(raft.Log)); err != raft.ErrLogNotFound {
|
||||||
|
t.Fatalf("should have deleted log1")
|
||||||
|
}
|
||||||
|
if err := store.GetLog(2, new(raft.Log)); err != raft.ErrLogNotFound {
|
||||||
|
t.Fatalf("should have deleted log2")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBoltStore_Set_Get(t *testing.T) {
|
||||||
|
store := testBoltStore(t)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
// Returns error on non-existent key
|
||||||
|
if _, err := store.Get([]byte("bad")); err != ErrKeyNotFound {
|
||||||
|
t.Fatalf("expected not found error, got: %q", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
k, v := []byte("hello"), []byte("world")
|
||||||
|
|
||||||
|
// Try to set a k/v pair
|
||||||
|
if err := store.Set(k, v); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to read it back
|
||||||
|
val, err := store.Get(k)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
if !bytes.Equal(val, v) {
|
||||||
|
t.Fatalf("bad: %v", val)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBoltStore_SetUint64_GetUint64(t *testing.T) {
|
||||||
|
store := testBoltStore(t)
|
||||||
|
defer store.Close()
|
||||||
|
defer os.Remove(store.path)
|
||||||
|
|
||||||
|
// Returns error on non-existent key
|
||||||
|
if _, err := store.GetUint64([]byte("bad")); err != ErrKeyNotFound {
|
||||||
|
t.Fatalf("expected not found error, got: %q", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
k, v := []byte("abc"), uint64(123)
|
||||||
|
|
||||||
|
// Attempt to set the k/v pair
|
||||||
|
if err := store.SetUint64(k, v); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read back the value
|
||||||
|
val, err := store.GetUint64(k)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
if val != v {
|
||||||
|
t.Fatalf("bad: %v", val)
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,37 @@
|
|||||||
|
package raftboltdb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/binary"
|
||||||
|
|
||||||
|
"github.com/hashicorp/go-msgpack/codec"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Decode reverses the encode operation on a byte slice input
|
||||||
|
func decodeMsgPack(buf []byte, out interface{}) error {
|
||||||
|
r := bytes.NewBuffer(buf)
|
||||||
|
hd := codec.MsgpackHandle{}
|
||||||
|
dec := codec.NewDecoder(r, &hd)
|
||||||
|
return dec.Decode(out)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Encode writes an encoded object to a new bytes buffer
|
||||||
|
func encodeMsgPack(in interface{}) (*bytes.Buffer, error) {
|
||||||
|
buf := bytes.NewBuffer(nil)
|
||||||
|
hd := codec.MsgpackHandle{}
|
||||||
|
enc := codec.NewEncoder(buf, &hd)
|
||||||
|
err := enc.Encode(in)
|
||||||
|
return buf, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// bytesToUint64 decodes the first 8 bytes of b as a big-endian uint64.
// Panics if b is shorter than 8 bytes, matching binary.BigEndian.Uint64.
func bytesToUint64(b []byte) uint64 {
	v := binary.BigEndian.Uint64(b)
	return v
}
|
||||||
|
|
||||||
|
// uint64ToBytes encodes u as an 8-byte big-endian slice.
func uint64ToBytes(u uint64) []byte {
	out := make([]byte, 8)
	binary.BigEndian.PutUint64(out, u)
	return out
}
|
@ -0,0 +1,23 @@
|
|||||||
|
# Compiled Object files, Static and Dynamic libs (Shared Objects)
|
||||||
|
*.o
|
||||||
|
*.a
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Folders
|
||||||
|
_obj
|
||||||
|
_test
|
||||||
|
|
||||||
|
# Architecture specific extensions/prefixes
|
||||||
|
*.[568vq]
|
||||||
|
[568vq].out
|
||||||
|
|
||||||
|
*.cgo1.go
|
||||||
|
*.cgo2.c
|
||||||
|
_cgo_defun.c
|
||||||
|
_cgo_gotypes.go
|
||||||
|
_cgo_export.*
|
||||||
|
|
||||||
|
_testmain.go
|
||||||
|
|
||||||
|
*.exe
|
||||||
|
*.test
|
@ -0,0 +1,16 @@
|
|||||||
|
language: go
|
||||||
|
|
||||||
|
go:
|
||||||
|
- 1.4
|
||||||
|
- 1.5
|
||||||
|
- 1.6
|
||||||
|
- tip
|
||||||
|
|
||||||
|
install: make deps
|
||||||
|
script:
|
||||||
|
- make integ
|
||||||
|
|
||||||
|
notifications:
|
||||||
|
flowdock:
|
||||||
|
secure: fZrcf9rlh2IrQrlch1sHkn3YI7SKvjGnAl/zyV5D6NROe1Bbr6d3QRMuCXWWdhJHzjKmXk5rIzbqJhUc0PNF7YjxGNKSzqWMQ56KcvN1k8DzlqxpqkcA3Jbs6fXCWo2fssRtZ7hj/wOP1f5n6cc7kzHDt9dgaYJ6nO2fqNPJiTc=
|
||||||
|
|
@ -0,0 +1,17 @@
|
|||||||
|
# DEPS lists the test imports of every package. The original used
# $(go list ...), which Make treats as an (empty) variable expansion;
# $(shell ...) is required to actually run the command.
DEPS = $(shell go list -f '{{range .TestImports}}{{.}} {{end}}' ./...)

test:
	go test -timeout=30s ./...

integ: test
	INTEG_TESTS=yes go test -timeout=23s -run=Integ ./...

deps:
	go get -d -v ./...
	echo $(DEPS) | xargs -n1 go get -d

cov:
	INTEG_TESTS=yes gocov test github.com/hashicorp/raft | gocov-html > /tmp/coverage.html
	open /tmp/coverage.html

.PHONY: test cov integ deps
|
@ -0,0 +1,107 @@
|
|||||||
|
raft [![Build Status](https://travis-ci.org/hashicorp/raft.png)](https://travis-ci.org/hashicorp/raft)
|
||||||
|
====
|
||||||
|
|
||||||
|
raft is a [Go](http://www.golang.org) library that manages a replicated
|
||||||
|
log and can be used with an FSM to manage replicated state machines. It
|
||||||
|
is a library for providing [consensus](http://en.wikipedia.org/wiki/Consensus_(computer_science)).
|
||||||
|
|
||||||
|
The use cases for such a library are far-reaching as replicated state
|
||||||
|
machines are a key component of many distributed systems. They enable
|
||||||
|
building Consistent, Partition Tolerant (CP) systems, with limited
|
||||||
|
fault tolerance as well.
|
||||||
|
|
||||||
|
## Building
|
||||||
|
|
||||||
|
If you wish to build raft you'll need Go version 1.2+ installed.
|
||||||
|
|
||||||
|
Please check your installation with:
|
||||||
|
|
||||||
|
```
|
||||||
|
go version
|
||||||
|
```
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
For complete documentation, see the associated [Godoc](http://godoc.org/github.com/hashicorp/raft).
|
||||||
|
|
||||||
|
To prevent complications with cgo, the primary backend `MDBStore` is in a separate repository,
|
||||||
|
called [raft-mdb](http://github.com/hashicorp/raft-mdb). That is the recommended implementation
|
||||||
|
for the `LogStore` and `StableStore`.
|
||||||
|
|
||||||
|
A pure Go backend using [BoltDB](https://github.com/boltdb/bolt) is also available called
|
||||||
|
[raft-boltdb](https://github.com/hashicorp/raft-boltdb). It can also be used as a `LogStore`
|
||||||
|
and `StableStore`.
|
||||||
|
|
||||||
|
## Tagged Releases
|
||||||
|
|
||||||
|
As of September 2017, Hashicorp will start using tags for this library to clearly indicate
|
||||||
|
major version updates. We recommend you vendor your application's dependency on this library.
|
||||||
|
|
||||||
|
* v0.1.0 is the original stable version of the library that was in master and has been maintained
|
||||||
|
with no breaking API changes. This was in use by Consul prior to version 0.7.0.
|
||||||
|
|
||||||
|
* v1.0.0 takes the changes that were staged in the library-v2-stage-one branch. This version
|
||||||
|
manages server identities using a UUID, so introduces some breaking API changes. It also versions
|
||||||
|
the Raft protocol, and requires some special steps when interoperating with Raft servers running
|
||||||
|
older versions of the library (see the detailed comment in config.go about version compatibility).
|
||||||
|
You can reference https://github.com/hashicorp/consul/pull/2222 for an idea of what was required
|
||||||
|
to port Consul to these new interfaces.
|
||||||
|
|
||||||
|
This version includes some new features as well, including non voting servers, a new address
|
||||||
|
provider abstraction in the transport layer, and more resilient snapshots.
|
||||||
|
|
||||||
|
## Protocol
|
||||||
|
|
||||||
|
raft is based on ["Raft: In Search of an Understandable Consensus Algorithm"](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf)
|
||||||
|
|
||||||
|
A high level overview of the Raft protocol is described below, but for details please read the full
|
||||||
|
[Raft paper](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf)
|
||||||
|
followed by the raft source. Any questions about the raft protocol should be sent to the
|
||||||
|
[raft-dev mailing list](https://groups.google.com/forum/#!forum/raft-dev).
|
||||||
|
|
||||||
|
### Protocol Description
|
||||||
|
|
||||||
|
Raft nodes are always in one of three states: follower, candidate or leader. All
|
||||||
|
nodes initially start out as a follower. In this state, nodes can accept log entries
|
||||||
|
from a leader and cast votes. If no entries are received for some time, nodes
|
||||||
|
self-promote to the candidate state. In the candidate state nodes request votes from
|
||||||
|
their peers. If a candidate receives a quorum of votes, then it is promoted to a leader.
|
||||||
|
The leader must accept new log entries and replicate to all the other followers.
|
||||||
|
In addition, if stale reads are not acceptable, all queries must also be performed on
|
||||||
|
the leader.
|
||||||
|
|
||||||
|
Once a cluster has a leader, it is able to accept new log entries. A client can
|
||||||
|
request that a leader append a new log entry, which is an opaque binary blob to
|
||||||
|
Raft. The leader then writes the entry to durable storage and attempts to replicate
|
||||||
|
to a quorum of followers. Once the log entry is considered *committed*, it can be
|
||||||
|
*applied* to a finite state machine. The finite state machine is application specific,
|
||||||
|
and is implemented using an interface.
|
||||||
|
|
||||||
|
An obvious question relates to the unbounded nature of a replicated log. Raft provides
|
||||||
|
a mechanism by which the current state is snapshotted, and the log is compacted. Because
|
||||||
|
of the FSM abstraction, restoring the state of the FSM must result in the same state
|
||||||
|
as a replay of old logs. This allows Raft to capture the FSM state at a point in time,
|
||||||
|
and then remove all the logs that were used to reach that state. This is performed automatically
|
||||||
|
without user intervention, and prevents unbounded disk usage as well as minimizing
|
||||||
|
time spent replaying logs.
|
||||||
|
|
||||||
|
Lastly, there is the issue of updating the peer set when new servers are joining
|
||||||
|
or existing servers are leaving. As long as a quorum of nodes is available, this
|
||||||
|
is not an issue as Raft provides mechanisms to dynamically update the peer set.
|
||||||
|
If a quorum of nodes is unavailable, then this becomes a very challenging issue.
|
||||||
|
For example, suppose there are only 2 peers, A and B. The quorum size is also
|
||||||
|
2, meaning both nodes must agree to commit a log entry. If either A or B fails,
|
||||||
|
it is now impossible to reach quorum. This means the cluster is unable to add,
|
||||||
|
or remove a node, or commit any additional log entries. This results in *unavailability*.
|
||||||
|
At this point, manual intervention would be required to remove either A or B,
|
||||||
|
and to restart the remaining node in bootstrap mode.
|
||||||
|
|
||||||
|
A Raft cluster of 3 nodes can tolerate a single node failure, while a cluster
|
||||||
|
of 5 can tolerate 2 node failures. The recommended configuration is to either
|
||||||
|
run 3 or 5 raft servers. This maximizes availability without
|
||||||
|
greatly sacrificing performance.
|
||||||
|
|
||||||
|
In terms of performance, Raft is comparable to Paxos. Assuming stable leadership,
|
||||||
|
committing a log entry requires a single round trip to half of the cluster.
|
||||||
|
Thus performance is bound by disk I/O and network latency.
|
||||||
|
|
@ -0,0 +1,171 @@
|
|||||||
|
package raftbench
|
||||||
|
|
||||||
|
// raftbench provides common benchmarking functions which can be used by
|
||||||
|
// anything which implements the raft.LogStore and raft.StableStore interfaces.
|
||||||
|
// All functions accept these interfaces and perform benchmarking. This
|
||||||
|
// makes comparing backend performance easier by sharing the tests.
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/hashicorp/raft"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func FirstIndex(b *testing.B, store raft.LogStore) {
|
||||||
|
// Create some fake data
|
||||||
|
var logs []*raft.Log
|
||||||
|
for i := 1; i < 10; i++ {
|
||||||
|
logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")})
|
||||||
|
}
|
||||||
|
if err := store.StoreLogs(logs); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
b.ResetTimer()
|
||||||
|
|
||||||
|
// Run FirstIndex a number of times
|
||||||
|
for n := 0; n < b.N; n++ {
|
||||||
|
store.FirstIndex()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func LastIndex(b *testing.B, store raft.LogStore) {
|
||||||
|
// Create some fake data
|
||||||
|
var logs []*raft.Log
|
||||||
|
for i := 1; i < 10; i++ {
|
||||||
|
logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")})
|
||||||
|
}
|
||||||
|
if err := store.StoreLogs(logs); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
b.ResetTimer()
|
||||||
|
|
||||||
|
// Run LastIndex a number of times
|
||||||
|
for n := 0; n < b.N; n++ {
|
||||||
|
store.LastIndex()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetLog(b *testing.B, store raft.LogStore) {
|
||||||
|
// Create some fake data
|
||||||
|
var logs []*raft.Log
|
||||||
|
for i := 1; i < 10; i++ {
|
||||||
|
logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")})
|
||||||
|
}
|
||||||
|
if err := store.StoreLogs(logs); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
b.ResetTimer()
|
||||||
|
|
||||||
|
// Run GetLog a number of times
|
||||||
|
for n := 0; n < b.N; n++ {
|
||||||
|
if err := store.GetLog(5, new(raft.Log)); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func StoreLog(b *testing.B, store raft.LogStore) {
|
||||||
|
// Run StoreLog a number of times
|
||||||
|
for n := 0; n < b.N; n++ {
|
||||||
|
log := &raft.Log{Index: uint64(n), Data: []byte("data")}
|
||||||
|
if err := store.StoreLog(log); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func StoreLogs(b *testing.B, store raft.LogStore) {
|
||||||
|
// Run StoreLogs a number of times. We want to set multiple logs each
|
||||||
|
// run, so we create 3 logs with incrementing indexes for each iteration.
|
||||||
|
for n := 0; n < b.N; n++ {
|
||||||
|
b.StopTimer()
|
||||||
|
offset := 3 * (n + 1)
|
||||||
|
logs := []*raft.Log{
|
||||||
|
&raft.Log{Index: uint64(offset - 2), Data: []byte("data")},
|
||||||
|
&raft.Log{Index: uint64(offset - 1), Data: []byte("data")},
|
||||||
|
&raft.Log{Index: uint64(offset), Data: []byte("data")},
|
||||||
|
}
|
||||||
|
b.StartTimer()
|
||||||
|
|
||||||
|
if err := store.StoreLogs(logs); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func DeleteRange(b *testing.B, store raft.LogStore) {
|
||||||
|
// Create some fake data. In this case, we create 3 new log entries for each
|
||||||
|
// test case, and separate them by index in multiples of 10. This allows
|
||||||
|
// some room so that we can test deleting ranges with "extra" logs to
|
||||||
|
// to ensure we stop going to the database once our max index is hit.
|
||||||
|
var logs []*raft.Log
|
||||||
|
for n := 0; n < b.N; n++ {
|
||||||
|
offset := 10 * n
|
||||||
|
for i := offset; i < offset+3; i++ {
|
||||||
|
logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := store.StoreLogs(logs); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
b.ResetTimer()
|
||||||
|
|
||||||
|
// Delete a range of the data
|
||||||
|
for n := 0; n < b.N; n++ {
|
||||||
|
offset := 10 * n
|
||||||
|
if err := store.DeleteRange(uint64(offset), uint64(offset+9)); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func Set(b *testing.B, store raft.StableStore) {
|
||||||
|
// Run Set a number of times
|
||||||
|
for n := 0; n < b.N; n++ {
|
||||||
|
if err := store.Set([]byte{byte(n)}, []byte("val")); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func Get(b *testing.B, store raft.StableStore) {
|
||||||
|
// Create some fake data
|
||||||
|
for i := 1; i < 10; i++ {
|
||||||
|
if err := store.Set([]byte{byte(i)}, []byte("val")); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.ResetTimer()
|
||||||
|
|
||||||
|
// Run Get a number of times
|
||||||
|
for n := 0; n < b.N; n++ {
|
||||||
|
if _, err := store.Get([]byte{0x05}); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func SetUint64(b *testing.B, store raft.StableStore) {
|
||||||
|
// Run SetUint64 a number of times
|
||||||
|
for n := 0; n < b.N; n++ {
|
||||||
|
if err := store.SetUint64([]byte{byte(n)}, uint64(n)); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetUint64(b *testing.B, store raft.StableStore) {
|
||||||
|
// Create some fake data
|
||||||
|
for i := 0; i < 10; i++ {
|
||||||
|
if err := store.SetUint64([]byte{byte(i)}, uint64(i)); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.ResetTimer()
|
||||||
|
|
||||||
|
// Run GetUint64 a number of times
|
||||||
|
for n := 0; n < b.N; n++ {
|
||||||
|
if _, err := store.Get([]byte{0x05}); err != nil {
|
||||||
|
b.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,84 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
// AppendEntriesRequest is the command used to append entries to the
|
||||||
|
// replicated log.
|
||||||
|
type AppendEntriesRequest struct {
|
||||||
|
// Provide the current term and leader
|
||||||
|
Term uint64
|
||||||
|
Leader []byte
|
||||||
|
|
||||||
|
// Provide the previous entries for integrity checking
|
||||||
|
PrevLogEntry uint64
|
||||||
|
PrevLogTerm uint64
|
||||||
|
|
||||||
|
// New entries to commit
|
||||||
|
Entries []*Log
|
||||||
|
|
||||||
|
// Commit index on the leader
|
||||||
|
LeaderCommitIndex uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
// AppendEntriesResponse is the response returned from an
// AppendEntriesRequest.
type AppendEntriesResponse struct {
	// Term carries a newer term if the leader is out of date.
	Term uint64

	// LastLog is a hint to help accelerate rebuilding slow nodes.
	LastLog uint64

	// Success is false when a conflicting entry prevented the append.
	Success bool

	// NoRetryBackoff signals that although the request did not succeed,
	// the next attempt need not wait or back off.
	NoRetryBackoff bool
}
|
||||||
|
|
||||||
|
// RequestVoteRequest is the command used by a candidate to ask a Raft peer
// for a vote in an election.
type RequestVoteRequest struct {
	// Term and Candidate identify the requester.
	Term      uint64
	Candidate []byte

	// LastLogIndex and LastLogTerm are used to ensure election safety.
	LastLogIndex uint64
	LastLogTerm  uint64
}
|
||||||
|
|
||||||
|
// RequestVoteResponse is the response returned from a RequestVoteRequest.
type RequestVoteResponse struct {
	// Term carries a newer term if the leader is out of date.
	Term uint64

	// Peers returns the peer set, so a node can shut down on removal.
	Peers []byte

	// Granted reports whether the vote was granted.
	Granted bool
}
|
||||||
|
|
||||||
|
// InstallSnapshotRequest is the command sent to a Raft peer to bootstrap
// its log (and state machine) from a snapshot on another peer.
type InstallSnapshotRequest struct {
	// Term and Leader identify the sender.
	Term   uint64
	Leader []byte

	// LastLogIndex and LastLogTerm are the last index/term included in
	// the snapshot.
	LastLogIndex uint64
	LastLogTerm  uint64

	// Peers is the peer set captured in the snapshot.
	Peers []byte

	// Size is the size of the snapshot in bytes.
	Size int64
}
|
||||||
|
|
||||||
|
// InstallSnapshotResponse is the response returned from an
// InstallSnapshotRequest.
type InstallSnapshotResponse struct {
	// Term carries a newer term if the sender is out of date.
	Term uint64
	// Success reports whether the snapshot was accepted.
	Success bool
}
|
@ -0,0 +1,136 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Config provides any necessary configuration to
// the Raft server.
type Config struct {
	// HeartbeatTimeout specifies the time in follower state without
	// a leader before we attempt an election.
	HeartbeatTimeout time.Duration

	// ElectionTimeout specifies the time in candidate state without
	// a leader before we attempt an election.
	ElectionTimeout time.Duration

	// CommitTimeout controls the time without an Apply() operation
	// before we heartbeat to ensure a timely commit. Due to random
	// staggering, may be delayed as much as 2x this value.
	CommitTimeout time.Duration

	// MaxAppendEntries controls the maximum number of append entries
	// to send at once. We want to strike a balance between efficiency
	// and avoiding waste if the follower is going to reject because of
	// an inconsistent log.
	MaxAppendEntries int

	// If we are a member of a cluster, and RemovePeer is invoked for the
	// local node, then we forget all peers and transition into the follower state.
	// If ShutdownOnRemove is set, we additionally shut down Raft. Otherwise,
	// we can become a leader of a cluster containing only this node.
	ShutdownOnRemove bool

	// DisableBootstrapAfterElect is used to turn off EnableSingleNode
	// after the node is elected. This is used to prevent self-election
	// if the node is removed from the Raft cluster via RemovePeer. Setting
	// it to false will keep the bootstrap mode, allowing the node to self-elect
	// and potentially bootstrap a separate cluster.
	DisableBootstrapAfterElect bool

	// TrailingLogs controls how many logs we leave after a snapshot. This is
	// used so that we can quickly replay logs on a follower instead of being
	// forced to send an entire snapshot.
	TrailingLogs uint64

	// SnapshotInterval controls how often we check if we should perform a snapshot.
	// We randomly stagger between this value and 2x this value to avoid the entire
	// cluster from performing a snapshot at once.
	SnapshotInterval time.Duration

	// SnapshotThreshold controls how many outstanding logs there must be before
	// we perform a snapshot. This is to prevent excessive snapshots when we can
	// just replay a small set of logs.
	SnapshotThreshold uint64

	// EnableSingleNode allows for a single node mode of operation. This
	// is false by default, which prevents a lone node from electing itself
	// as leader.
	EnableSingleNode bool

	// LeaderLeaseTimeout is used to control how long the "lease" lasts
	// for being the leader without being able to contact a quorum
	// of nodes. If we reach this interval without contact, we will
	// step down as leader.
	LeaderLeaseTimeout time.Duration

	// StartAsLeader forces Raft to start in the leader state. This should
	// never be used except for testing purposes, as it can cause a split-brain.
	StartAsLeader bool

	// NotifyCh is used to provide a channel that will be notified of leadership
	// changes. Raft will block writing to this channel, so it should either be
	// buffered or aggressively consumed.
	NotifyCh chan<- bool

	// LogOutput is used as a sink for logs, unless Logger is specified.
	// Defaults to os.Stderr.
	LogOutput io.Writer

	// Logger is a user-provided logger. If nil, a logger writing to LogOutput
	// is used.
	Logger *log.Logger
}
|
||||||
|
|
||||||
|
// DefaultConfig returns a Config with usable defaults.
|
||||||
|
func DefaultConfig() *Config {
|
||||||
|
return &Config{
|
||||||
|
HeartbeatTimeout: 1000 * time.Millisecond,
|
||||||
|
ElectionTimeout: 1000 * time.Millisecond,
|
||||||
|
CommitTimeout: 50 * time.Millisecond,
|
||||||
|
MaxAppendEntries: 64,
|
||||||
|
ShutdownOnRemove: true,
|
||||||
|
DisableBootstrapAfterElect: true,
|
||||||
|
TrailingLogs: 10240,
|
||||||
|
SnapshotInterval: 120 * time.Second,
|
||||||
|
SnapshotThreshold: 8192,
|
||||||
|
EnableSingleNode: false,
|
||||||
|
LeaderLeaseTimeout: 500 * time.Millisecond,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ValidateConfig is used to validate a sane configuration
|
||||||
|
func ValidateConfig(config *Config) error {
|
||||||
|
if config.HeartbeatTimeout < 5*time.Millisecond {
|
||||||
|
return fmt.Errorf("Heartbeat timeout is too low")
|
||||||
|
}
|
||||||
|
if config.ElectionTimeout < 5*time.Millisecond {
|
||||||
|
return fmt.Errorf("Election timeout is too low")
|
||||||
|
}
|
||||||
|
if config.CommitTimeout < time.Millisecond {
|
||||||
|
return fmt.Errorf("Commit timeout is too low")
|
||||||
|
}
|
||||||
|
if config.MaxAppendEntries <= 0 {
|
||||||
|
return fmt.Errorf("MaxAppendEntries must be positive")
|
||||||
|
}
|
||||||
|
if config.MaxAppendEntries > 1024 {
|
||||||
|
return fmt.Errorf("MaxAppendEntries is too large")
|
||||||
|
}
|
||||||
|
if config.SnapshotInterval < 5*time.Millisecond {
|
||||||
|
return fmt.Errorf("Snapshot interval is too low")
|
||||||
|
}
|
||||||
|
if config.LeaderLeaseTimeout < 5*time.Millisecond {
|
||||||
|
return fmt.Errorf("Leader lease timeout is too low")
|
||||||
|
}
|
||||||
|
if config.LeaderLeaseTimeout > config.HeartbeatTimeout {
|
||||||
|
return fmt.Errorf("Leader lease timeout cannot be larger than heartbeat timeout")
|
||||||
|
}
|
||||||
|
if config.ElectionTimeout < config.HeartbeatTimeout {
|
||||||
|
return fmt.Errorf("Election timeout must be equal or greater than Heartbeat Timeout")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
@ -0,0 +1,48 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
)
|
||||||
|
|
||||||
|
// DiscardSnapshotStore is used to successfully snapshot while
// always discarding the snapshot. This is useful for when the
// log should be truncated but no snapshot should be retained.
// This should never be used for production use, and is only
// suitable for testing.
type DiscardSnapshotStore struct{}

// DiscardSnapshotSink is the no-op sink handed out by
// DiscardSnapshotStore; it throws away everything written to it.
type DiscardSnapshotSink struct{}

// NewDiscardSnapshotStore is used to create a new DiscardSnapshotStore.
func NewDiscardSnapshotStore() *DiscardSnapshotStore {
	return &DiscardSnapshotStore{}
}
|
||||||
|
|
||||||
|
func (d *DiscardSnapshotStore) Create(index, term uint64, peers []byte) (SnapshotSink, error) {
|
||||||
|
return &DiscardSnapshotSink{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *DiscardSnapshotStore) List() ([]*SnapshotMeta, error) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *DiscardSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) {
|
||||||
|
return nil, nil, fmt.Errorf("open is not supported")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *DiscardSnapshotSink) Write(b []byte) (int, error) {
|
||||||
|
return len(b), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *DiscardSnapshotSink) Close() error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *DiscardSnapshotSink) ID() string {
|
||||||
|
return "discard"
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *DiscardSnapshotSink) Cancel() error {
|
||||||
|
return nil
|
||||||
|
}
|
@ -0,0 +1,17 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestDiscardSnapshotStoreImpl(t *testing.T) {
|
||||||
|
var impl interface{} = &DiscardSnapshotStore{}
|
||||||
|
if _, ok := impl.(SnapshotStore); !ok {
|
||||||
|
t.Fatalf("DiscardSnapshotStore not a SnapshotStore")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiscardSnapshotSinkImpl(t *testing.T) {
|
||||||
|
var impl interface{} = &DiscardSnapshotSink{}
|
||||||
|
if _, ok := impl.(SnapshotSink); !ok {
|
||||||
|
t.Fatalf("DiscardSnapshotSink not a SnapshotSink")
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,513 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"bytes"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"hash"
|
||||||
|
"hash/crc64"
|
||||||
|
"io"
|
||||||
|
"io/ioutil"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	testPath      = "permTest"  // scratch file used by the permissions probe
	snapPath      = "snapshots" // subdirectory holding all snapshots
	metaFilePath  = "meta.json" // per-snapshot metadata file
	stateFilePath = "state.bin" // per-snapshot state machine data
	tmpSuffix     = ".tmp"      // marks an in-flight (not yet committed) snapshot dir
)
|
||||||
|
|
||||||
|
// FileSnapshotStore implements the SnapshotStore interface and allows
|
||||||
|
// snapshots to be made on the local disk.
|
||||||
|
type FileSnapshotStore struct {
|
||||||
|
path string
|
||||||
|
retain int
|
||||||
|
logger *log.Logger
|
||||||
|
}
|
||||||
|
|
||||||
|
type snapMetaSlice []*fileSnapshotMeta
|
||||||
|
|
||||||
|
// FileSnapshotSink implements SnapshotSink with a file.
|
||||||
|
type FileSnapshotSink struct {
|
||||||
|
store *FileSnapshotStore
|
||||||
|
logger *log.Logger
|
||||||
|
dir string
|
||||||
|
parentDir string
|
||||||
|
meta fileSnapshotMeta
|
||||||
|
|
||||||
|
stateFile *os.File
|
||||||
|
stateHash hash.Hash64
|
||||||
|
buffered *bufio.Writer
|
||||||
|
|
||||||
|
closed bool
|
||||||
|
}
|
||||||
|
|
||||||
|
// fileSnapshotMeta is stored on disk. We also put a CRC
|
||||||
|
// on disk so that we can verify the snapshot.
|
||||||
|
type fileSnapshotMeta struct {
|
||||||
|
SnapshotMeta
|
||||||
|
CRC []byte
|
||||||
|
}
|
||||||
|
|
||||||
|
// bufferedFile is returned when we open a snapshot. This way
// reads are buffered and the file still gets closed.
type bufferedFile struct {
	bh *bufio.Reader
	fh *os.File
}

// Read delegates to the buffered reader.
func (b *bufferedFile) Read(p []byte) (n int, err error) {
	return b.bh.Read(p)
}

// Close releases the underlying file handle.
func (b *bufferedFile) Close() error {
	return b.fh.Close()
}
|
||||||
|
|
||||||
|
// NewFileSnapshotStoreWithLogger creates a new FileSnapshotStore based
|
||||||
|
// on a base directory. The `retain` parameter controls how many
|
||||||
|
// snapshots are retained. Must be at least 1.
|
||||||
|
func NewFileSnapshotStoreWithLogger(base string, retain int, logger *log.Logger) (*FileSnapshotStore, error) {
|
||||||
|
if retain < 1 {
|
||||||
|
return nil, fmt.Errorf("must retain at least one snapshot")
|
||||||
|
}
|
||||||
|
if logger == nil {
|
||||||
|
logger = log.New(os.Stderr, "", log.LstdFlags)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure our path exists
|
||||||
|
path := filepath.Join(base, snapPath)
|
||||||
|
if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) {
|
||||||
|
return nil, fmt.Errorf("snapshot path not accessible: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Setup the store
|
||||||
|
store := &FileSnapshotStore{
|
||||||
|
path: path,
|
||||||
|
retain: retain,
|
||||||
|
logger: logger,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Do a permissions test
|
||||||
|
if err := store.testPermissions(); err != nil {
|
||||||
|
return nil, fmt.Errorf("permissions test failed: %v", err)
|
||||||
|
}
|
||||||
|
return store, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewFileSnapshotStore creates a new FileSnapshotStore based
|
||||||
|
// on a base directory. The `retain` parameter controls how many
|
||||||
|
// snapshots are retained. Must be at least 1.
|
||||||
|
func NewFileSnapshotStore(base string, retain int, logOutput io.Writer) (*FileSnapshotStore, error) {
|
||||||
|
if logOutput == nil {
|
||||||
|
logOutput = os.Stderr
|
||||||
|
}
|
||||||
|
return NewFileSnapshotStoreWithLogger(base, retain, log.New(logOutput, "", log.LstdFlags))
|
||||||
|
}
|
||||||
|
|
||||||
|
// testPermissions tries to touch a file in our path to see if it works.
|
||||||
|
func (f *FileSnapshotStore) testPermissions() error {
|
||||||
|
path := filepath.Join(f.path, testPath)
|
||||||
|
fh, err := os.Create(path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err = fh.Close(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err = os.Remove(path); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// snapshotName generates a name for the snapshot of the form
// "<term>-<index>-<millis>", so lexical/numeric ordering follows Raft order.
func snapshotName(term, index uint64) string {
	msec := time.Now().UnixNano() / int64(time.Millisecond)
	return fmt.Sprintf("%d-%d-%d", term, index, msec)
}
|
||||||
|
|
||||||
|
// Create is used to start a new snapshot
|
||||||
|
func (f *FileSnapshotStore) Create(index, term uint64, peers []byte) (SnapshotSink, error) {
|
||||||
|
// Create a new path
|
||||||
|
name := snapshotName(term, index)
|
||||||
|
path := filepath.Join(f.path, name+tmpSuffix)
|
||||||
|
f.logger.Printf("[INFO] snapshot: Creating new snapshot at %s", path)
|
||||||
|
|
||||||
|
// Make the directory
|
||||||
|
if err := os.MkdirAll(path, 0755); err != nil {
|
||||||
|
f.logger.Printf("[ERR] snapshot: Failed to make snapshot directory: %v", err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the sink
|
||||||
|
sink := &FileSnapshotSink{
|
||||||
|
store: f,
|
||||||
|
logger: f.logger,
|
||||||
|
dir: path,
|
||||||
|
parentDir: f.path,
|
||||||
|
meta: fileSnapshotMeta{
|
||||||
|
SnapshotMeta: SnapshotMeta{
|
||||||
|
ID: name,
|
||||||
|
Index: index,
|
||||||
|
Term: term,
|
||||||
|
Peers: peers,
|
||||||
|
},
|
||||||
|
CRC: nil,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write out the meta data
|
||||||
|
if err := sink.writeMeta(); err != nil {
|
||||||
|
f.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Open the state file
|
||||||
|
statePath := filepath.Join(path, stateFilePath)
|
||||||
|
fh, err := os.Create(statePath)
|
||||||
|
if err != nil {
|
||||||
|
f.logger.Printf("[ERR] snapshot: Failed to create state file: %v", err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
sink.stateFile = fh
|
||||||
|
|
||||||
|
// Create a CRC64 hash
|
||||||
|
sink.stateHash = crc64.New(crc64.MakeTable(crc64.ECMA))
|
||||||
|
|
||||||
|
// Wrap both the hash and file in a MultiWriter with buffering
|
||||||
|
multi := io.MultiWriter(sink.stateFile, sink.stateHash)
|
||||||
|
sink.buffered = bufio.NewWriter(multi)
|
||||||
|
|
||||||
|
// Done
|
||||||
|
return sink, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// List returns available snapshots in the store.
|
||||||
|
func (f *FileSnapshotStore) List() ([]*SnapshotMeta, error) {
|
||||||
|
// Get the eligible snapshots
|
||||||
|
snapshots, err := f.getSnapshots()
|
||||||
|
if err != nil {
|
||||||
|
f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var snapMeta []*SnapshotMeta
|
||||||
|
for _, meta := range snapshots {
|
||||||
|
snapMeta = append(snapMeta, &meta.SnapshotMeta)
|
||||||
|
if len(snapMeta) == f.retain {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return snapMeta, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// getSnapshots returns all the known snapshots.
|
||||||
|
func (f *FileSnapshotStore) getSnapshots() ([]*fileSnapshotMeta, error) {
|
||||||
|
// Get the eligible snapshots
|
||||||
|
snapshots, err := ioutil.ReadDir(f.path)
|
||||||
|
if err != nil {
|
||||||
|
f.logger.Printf("[ERR] snapshot: Failed to scan snapshot dir: %v", err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Populate the metadata
|
||||||
|
var snapMeta []*fileSnapshotMeta
|
||||||
|
for _, snap := range snapshots {
|
||||||
|
// Ignore any files
|
||||||
|
if !snap.IsDir() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ignore any temporary snapshots
|
||||||
|
dirName := snap.Name()
|
||||||
|
if strings.HasSuffix(dirName, tmpSuffix) {
|
||||||
|
f.logger.Printf("[WARN] snapshot: Found temporary snapshot: %v", dirName)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to read the meta data
|
||||||
|
meta, err := f.readMeta(dirName)
|
||||||
|
if err != nil {
|
||||||
|
f.logger.Printf("[WARN] snapshot: Failed to read metadata for %v: %v", dirName, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Append, but only return up to the retain count
|
||||||
|
snapMeta = append(snapMeta, meta)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort the snapshot, reverse so we get new -> old
|
||||||
|
sort.Sort(sort.Reverse(snapMetaSlice(snapMeta)))
|
||||||
|
|
||||||
|
return snapMeta, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// readMeta is used to read the meta data for a given named backup
|
||||||
|
func (f *FileSnapshotStore) readMeta(name string) (*fileSnapshotMeta, error) {
|
||||||
|
// Open the meta file
|
||||||
|
metaPath := filepath.Join(f.path, name, metaFilePath)
|
||||||
|
fh, err := os.Open(metaPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer fh.Close()
|
||||||
|
|
||||||
|
// Buffer the file IO
|
||||||
|
buffered := bufio.NewReader(fh)
|
||||||
|
|
||||||
|
// Read in the JSON
|
||||||
|
meta := &fileSnapshotMeta{}
|
||||||
|
dec := json.NewDecoder(buffered)
|
||||||
|
if err := dec.Decode(meta); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return meta, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Open takes a snapshot ID and returns a ReadCloser for that snapshot.
|
||||||
|
func (f *FileSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) {
|
||||||
|
// Get the metadata
|
||||||
|
meta, err := f.readMeta(id)
|
||||||
|
if err != nil {
|
||||||
|
f.logger.Printf("[ERR] snapshot: Failed to get meta data to open snapshot: %v", err)
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Open the state file
|
||||||
|
statePath := filepath.Join(f.path, id, stateFilePath)
|
||||||
|
fh, err := os.Open(statePath)
|
||||||
|
if err != nil {
|
||||||
|
f.logger.Printf("[ERR] snapshot: Failed to open state file: %v", err)
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a CRC64 hash
|
||||||
|
stateHash := crc64.New(crc64.MakeTable(crc64.ECMA))
|
||||||
|
|
||||||
|
// Compute the hash
|
||||||
|
_, err = io.Copy(stateHash, fh)
|
||||||
|
if err != nil {
|
||||||
|
f.logger.Printf("[ERR] snapshot: Failed to read state file: %v", err)
|
||||||
|
fh.Close()
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify the hash
|
||||||
|
computed := stateHash.Sum(nil)
|
||||||
|
if bytes.Compare(meta.CRC, computed) != 0 {
|
||||||
|
f.logger.Printf("[ERR] snapshot: CRC checksum failed (stored: %v computed: %v)",
|
||||||
|
meta.CRC, computed)
|
||||||
|
fh.Close()
|
||||||
|
return nil, nil, fmt.Errorf("CRC mismatch")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Seek to the start
|
||||||
|
if _, err := fh.Seek(0, 0); err != nil {
|
||||||
|
f.logger.Printf("[ERR] snapshot: State file seek failed: %v", err)
|
||||||
|
fh.Close()
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return a buffered file
|
||||||
|
buffered := &bufferedFile{
|
||||||
|
bh: bufio.NewReader(fh),
|
||||||
|
fh: fh,
|
||||||
|
}
|
||||||
|
|
||||||
|
return &meta.SnapshotMeta, buffered, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReapSnapshots reaps any snapshots beyond the retain count.
|
||||||
|
func (f *FileSnapshotStore) ReapSnapshots() error {
|
||||||
|
snapshots, err := f.getSnapshots()
|
||||||
|
if err != nil {
|
||||||
|
f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := f.retain; i < len(snapshots); i++ {
|
||||||
|
path := filepath.Join(f.path, snapshots[i].ID)
|
||||||
|
f.logger.Printf("[INFO] snapshot: reaping snapshot %v", path)
|
||||||
|
if err := os.RemoveAll(path); err != nil {
|
||||||
|
f.logger.Printf("[ERR] snapshot: Failed to reap snapshot %v: %v", path, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ID returns the ID of the snapshot, can be used with Open()
|
||||||
|
// after the snapshot is finalized.
|
||||||
|
func (s *FileSnapshotSink) ID() string {
|
||||||
|
return s.meta.ID
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write is used to append to the state file. We write to the
|
||||||
|
// buffered IO object to reduce the amount of context switches.
|
||||||
|
func (s *FileSnapshotSink) Write(b []byte) (int, error) {
|
||||||
|
return s.buffered.Write(b)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close is used to indicate a successful end.
|
||||||
|
func (s *FileSnapshotSink) Close() error {
|
||||||
|
// Make sure close is idempotent
|
||||||
|
if s.closed {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
s.closed = true
|
||||||
|
|
||||||
|
// Close the open handles
|
||||||
|
if err := s.finalize(); err != nil {
|
||||||
|
s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err)
|
||||||
|
if delErr := os.RemoveAll(s.dir); delErr != nil {
|
||||||
|
s.logger.Printf("[ERR] snapshot: Failed to delete temporary snapshot at path %v: %v", s.dir, delErr)
|
||||||
|
return delErr
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write out the meta data
|
||||||
|
if err := s.writeMeta(); err != nil {
|
||||||
|
s.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Move the directory into place
|
||||||
|
newPath := strings.TrimSuffix(s.dir, tmpSuffix)
|
||||||
|
if err := os.Rename(s.dir, newPath); err != nil {
|
||||||
|
s.logger.Printf("[ERR] snapshot: Failed to move snapshot into place: %v", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if runtime.GOOS != "windows" { //skipping fsync for directory entry edits on Windows, only needed for *nix style file systems
|
||||||
|
parentFH, err := os.Open(s.parentDir)
|
||||||
|
defer parentFH.Close()
|
||||||
|
if err != nil {
|
||||||
|
s.logger.Printf("[ERR] snapshot: Failed to open snapshot parent directory %v, error: %v", s.parentDir, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err = parentFH.Sync(); err != nil {
|
||||||
|
s.logger.Printf("[ERR] snapshot: Failed syncing parent directory %v, error: %v", s.parentDir, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reap any old snapshots
|
||||||
|
if err := s.store.ReapSnapshots(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cancel is used to indicate an unsuccessful end.
|
||||||
|
func (s *FileSnapshotSink) Cancel() error {
|
||||||
|
// Make sure close is idempotent
|
||||||
|
if s.closed {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
s.closed = true
|
||||||
|
|
||||||
|
// Close the open handles
|
||||||
|
if err := s.finalize(); err != nil {
|
||||||
|
s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Attempt to remove all artifacts
|
||||||
|
return os.RemoveAll(s.dir)
|
||||||
|
}
|
||||||
|
|
||||||
|
// finalize is used to close all of our resources.
|
||||||
|
func (s *FileSnapshotSink) finalize() error {
|
||||||
|
// Flush any remaining data
|
||||||
|
if err := s.buffered.Flush(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sync to force fsync to disk
|
||||||
|
if err := s.stateFile.Sync(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the file size
|
||||||
|
stat, statErr := s.stateFile.Stat()
|
||||||
|
|
||||||
|
// Close the file
|
||||||
|
if err := s.stateFile.Close(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set the file size, check after we close
|
||||||
|
if statErr != nil {
|
||||||
|
return statErr
|
||||||
|
}
|
||||||
|
s.meta.Size = stat.Size()
|
||||||
|
|
||||||
|
// Set the CRC
|
||||||
|
s.meta.CRC = s.stateHash.Sum(nil)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeMeta is used to write out the metadata we have.
|
||||||
|
func (s *FileSnapshotSink) writeMeta() error {
|
||||||
|
// Open the meta file
|
||||||
|
metaPath := filepath.Join(s.dir, metaFilePath)
|
||||||
|
fh, err := os.Create(metaPath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer fh.Close()
|
||||||
|
|
||||||
|
// Buffer the file IO
|
||||||
|
buffered := bufio.NewWriter(fh)
|
||||||
|
|
||||||
|
// Write out as JSON
|
||||||
|
enc := json.NewEncoder(buffered)
|
||||||
|
if err := enc.Encode(&s.meta); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err = buffered.Flush(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err = fh.Sync(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Implement the sort interface for []*fileSnapshotMeta.
|
||||||
|
func (s snapMetaSlice) Len() int {
|
||||||
|
return len(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s snapMetaSlice) Less(i, j int) bool {
|
||||||
|
if s[i].Term != s[j].Term {
|
||||||
|
return s[i].Term < s[j].Term
|
||||||
|
}
|
||||||
|
if s[i].Index != s[j].Index {
|
||||||
|
return s[i].Index < s[j].Index
|
||||||
|
}
|
||||||
|
return s[i].ID < s[j].ID
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s snapMetaSlice) Swap(i, j int) {
|
||||||
|
s[i], s[j] = s[j], s[i]
|
||||||
|
}
|
@ -0,0 +1,343 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"io"
|
||||||
|
"io/ioutil"
|
||||||
|
"os"
|
||||||
|
"runtime"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func FileSnapTest(t *testing.T) (string, *FileSnapshotStore) {
|
||||||
|
// Create a test dir
|
||||||
|
dir, err := ioutil.TempDir("", "raft")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v ", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
return dir, snap
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFileSnapshotStoreImpl(t *testing.T) {
|
||||||
|
var impl interface{} = &FileSnapshotStore{}
|
||||||
|
if _, ok := impl.(SnapshotStore); !ok {
|
||||||
|
t.Fatalf("FileSnapshotStore not a SnapshotStore")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFileSnapshotSinkImpl(t *testing.T) {
|
||||||
|
var impl interface{} = &FileSnapshotSink{}
|
||||||
|
if _, ok := impl.(SnapshotSink); !ok {
|
||||||
|
t.Fatalf("FileSnapshotSink not a SnapshotSink")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFileSS_CreateSnapshotMissingParentDir(t *testing.T) {
|
||||||
|
parent, err := ioutil.TempDir("", "raft")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v ", err)
|
||||||
|
}
|
||||||
|
defer os.RemoveAll(parent)
|
||||||
|
|
||||||
|
dir, err := ioutil.TempDir(parent, "raft")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v ", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
os.RemoveAll(parent)
|
||||||
|
peers := []byte("all my lovely friends")
|
||||||
|
_, err = snap.Create(10, 3, peers)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("should not fail when using non existing parent")
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
func TestFileSS_CreateSnapshot(t *testing.T) {
|
||||||
|
// Create a test dir
|
||||||
|
dir, err := ioutil.TempDir("", "raft")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v ", err)
|
||||||
|
}
|
||||||
|
defer os.RemoveAll(dir)
|
||||||
|
|
||||||
|
snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check no snapshots
|
||||||
|
snaps, err := snap.List()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
if len(snaps) != 0 {
|
||||||
|
t.Fatalf("did not expect any snapshots: %v", snaps)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new sink
|
||||||
|
peers := []byte("all my lovely friends")
|
||||||
|
sink, err := snap.Create(10, 3, peers)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// The sink is not done, should not be in a list!
|
||||||
|
snaps, err = snap.List()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
if len(snaps) != 0 {
|
||||||
|
t.Fatalf("did not expect any snapshots: %v", snaps)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write to the sink
|
||||||
|
_, err = sink.Write([]byte("first\n"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
_, err = sink.Write([]byte("second\n"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Done!
|
||||||
|
err = sink.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Should have a snapshot!
|
||||||
|
snaps, err = snap.List()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
if len(snaps) != 1 {
|
||||||
|
t.Fatalf("expect a snapshots: %v", snaps)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check the latest
|
||||||
|
latest := snaps[0]
|
||||||
|
if latest.Index != 10 {
|
||||||
|
t.Fatalf("bad snapshot: %v", *latest)
|
||||||
|
}
|
||||||
|
if latest.Term != 3 {
|
||||||
|
t.Fatalf("bad snapshot: %v", *latest)
|
||||||
|
}
|
||||||
|
if bytes.Compare(latest.Peers, peers) != 0 {
|
||||||
|
t.Fatalf("bad snapshot: %v", *latest)
|
||||||
|
}
|
||||||
|
if latest.Size != 13 {
|
||||||
|
t.Fatalf("bad snapshot: %v", *latest)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read the snapshot
|
||||||
|
_, r, err := snap.Open(latest.ID)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read out everything
|
||||||
|
var buf bytes.Buffer
|
||||||
|
if _, err := io.Copy(&buf, r); err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
if err := r.Close(); err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure a match
|
||||||
|
if bytes.Compare(buf.Bytes(), []byte("first\nsecond\n")) != 0 {
|
||||||
|
t.Fatalf("content mismatch")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFileSS_CancelSnapshot(t *testing.T) {
|
||||||
|
// Create a test dir
|
||||||
|
dir, err := ioutil.TempDir("", "raft")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v ", err)
|
||||||
|
}
|
||||||
|
defer os.RemoveAll(dir)
|
||||||
|
|
||||||
|
snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new sink
|
||||||
|
peers := []byte("all my lovely friends")
|
||||||
|
sink, err := snap.Create(10, 3, peers)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cancel the snapshot! Should delete
|
||||||
|
err = sink.Cancel()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// The sink is canceled, should not be in a list!
|
||||||
|
snaps, err := snap.List()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
if len(snaps) != 0 {
|
||||||
|
t.Fatalf("did not expect any snapshots: %v", snaps)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFileSS_Retention(t *testing.T) {
|
||||||
|
// Create a test dir
|
||||||
|
dir, err := ioutil.TempDir("", "raft")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v ", err)
|
||||||
|
}
|
||||||
|
defer os.RemoveAll(dir)
|
||||||
|
|
||||||
|
snap, err := NewFileSnapshotStoreWithLogger(dir, 2, newTestLogger(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new sink
|
||||||
|
peers := []byte("all my lovely friends")
|
||||||
|
|
||||||
|
// Create a few snapshots
|
||||||
|
for i := 10; i < 15; i++ {
|
||||||
|
sink, err := snap.Create(uint64(i), 3, peers)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
err = sink.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Should only have 2 listed!
|
||||||
|
snaps, err := snap.List()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
if len(snaps) != 2 {
|
||||||
|
t.Fatalf("expect 2 snapshots: %v", snaps)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check they are the latest
|
||||||
|
if snaps[0].Index != 14 {
|
||||||
|
t.Fatalf("bad snap: %#v", *snaps[0])
|
||||||
|
}
|
||||||
|
if snaps[1].Index != 13 {
|
||||||
|
t.Fatalf("bad snap: %#v", *snaps[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFileSS_BadPerm(t *testing.T) {
|
||||||
|
if runtime.GOOS == "windows" {
|
||||||
|
t.Skip("skipping file permission test on windows")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a temp dir
|
||||||
|
dir1, err := ioutil.TempDir("", "raft")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
defer os.RemoveAll(dir1)
|
||||||
|
|
||||||
|
// Create a sub dir and remove all permissions
|
||||||
|
dir2, err := ioutil.TempDir(dir1, "badperm")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
if err := os.Chmod(dir2, 000); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
defer os.Chmod(dir2, 777) // Set perms back for delete
|
||||||
|
|
||||||
|
// Should fail
|
||||||
|
if _, err := NewFileSnapshotStore(dir2, 3, nil); err == nil {
|
||||||
|
t.Fatalf("should fail to use dir with bad perms")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestFileSS_MissingParentDir verifies that the snapshot store creates
// any missing parent directories instead of failing when pointed at a
// path that no longer exists.
func TestFileSS_MissingParentDir(t *testing.T) {
	parent, err := ioutil.TempDir("", "raft")
	if err != nil {
		t.Fatalf("err: %v ", err)
	}
	defer os.RemoveAll(parent)

	dir, err := ioutil.TempDir(parent, "raft")
	if err != nil {
		t.Fatalf("err: %v ", err)
	}

	// Delete the whole tree so dir's parent no longer exists.
	os.RemoveAll(parent)

	// The store should recreate the path rather than error out.
	_, err = NewFileSnapshotStore(dir, 3, nil)
	if err != nil {
		t.Fatalf("should not fail when using non existing parent")
	}
}
|
||||||
|
|
||||||
|
func TestFileSS_Ordering(t *testing.T) {
|
||||||
|
// Create a test dir
|
||||||
|
dir, err := ioutil.TempDir("", "raft")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v ", err)
|
||||||
|
}
|
||||||
|
defer os.RemoveAll(dir)
|
||||||
|
|
||||||
|
snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new sink
|
||||||
|
peers := []byte("all my lovely friends")
|
||||||
|
|
||||||
|
sink, err := snap.Create(130350, 5, peers)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
err = sink.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
sink, err = snap.Create(204917, 36, peers)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
err = sink.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Should only have 2 listed!
|
||||||
|
snaps, err := snap.List()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
if len(snaps) != 2 {
|
||||||
|
t.Fatalf("expect 2 snapshots: %v", snaps)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check they are ordered
|
||||||
|
if snaps[0].Term != 36 {
|
||||||
|
t.Fatalf("bad snap: %#v", *snaps[0])
|
||||||
|
}
|
||||||
|
if snaps[1].Term != 5 {
|
||||||
|
t.Fatalf("bad snap: %#v", *snaps[1])
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,40 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"io"
|
||||||
|
)
|
||||||
|
|
||||||
|
// FSM provides an interface that can be implemented by
// clients to make use of the replicated log.
type FSM interface {
	// Apply log is invoked once a log entry is committed.
	// It returns a value which will be made available in the
	// ApplyFuture returned by Raft.Apply method if that
	// method was called on the same Raft node as the FSM.
	Apply(*Log) interface{}

	// Snapshot is used to support log compaction. This call should
	// return an FSMSnapshot which can be used to save a point-in-time
	// snapshot of the FSM. Apply and Snapshot are not called in multiple
	// threads, but Apply will be called concurrently with Persist. This means
	// the FSM should be implemented in a fashion that allows for concurrent
	// updates while a snapshot is happening.
	Snapshot() (FSMSnapshot, error)

	// Restore is used to restore an FSM from a snapshot. It is not called
	// concurrently with any other command. The FSM must discard all previous
	// state before applying the snapshot contents.
	Restore(io.ReadCloser) error
}
|
||||||
|
|
||||||
|
// FSMSnapshot is returned by an FSM in response to a Snapshot call.
// It must be safe to invoke FSMSnapshot methods with concurrent
// calls to Apply.
type FSMSnapshot interface {
	// Persist should dump all necessary state to the WriteCloser 'sink',
	// and call sink.Close() when finished or call sink.Cancel() on error.
	Persist(sink SnapshotSink) error

	// Release is invoked when we are finished with the snapshot,
	// allowing any held resources to be freed.
	Release()
}
|
@ -0,0 +1,203 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Future is used to represent an action that may occur in the future.
type Future interface {
	// Error blocks until the future arrives and then
	// returns the error status of the future.
	// This may be called any number of times - all
	// calls will return the same value.
	// Note that it is not OK to call this method
	// twice concurrently on the same Future instance.
	Error() error
}
|
||||||
|
|
||||||
|
// ApplyFuture is used for Apply() and may return the FSM response.
type ApplyFuture interface {
	Future

	// Response returns the FSM response as returned
	// by the FSM.Apply method. This must not be called
	// until after the Error method has returned.
	Response() interface{}

	// Index holds the index of the newly applied log entry.
	// This must not be called
	// until after the Error method has returned.
	Index() uint64
}
|
||||||
|
|
||||||
|
// errorFuture is used to return a static error.
// It trivially satisfies ApplyFuture: Error always yields the stored
// error, while Response and Index return zero values.
type errorFuture struct {
	err error
}

// Error returns the fixed error this future was created with.
func (e errorFuture) Error() error {
	return e.err
}

// Response always returns nil; an errorFuture carries no FSM result.
func (e errorFuture) Response() interface{} {
	return nil
}

// Index always returns 0; no log entry is associated with the error.
func (e errorFuture) Index() uint64 {
	return 0
}
|
||||||
|
|
||||||
|
// deferError can be embedded to allow a future
// to provide an error in the future.
type deferError struct {
	err       error      // cached result once received
	errCh     chan error // carries the eventual result; closed after the send
	responded bool       // guards against responding twice
}

// init must be called before the future is handed out; it allocates the
// result channel. The buffer of 1 lets respond complete without a waiter.
func (d *deferError) init() {
	d.errCh = make(chan error, 1)
}

// Error blocks until respond has been called and returns the result.
// Subsequent calls return the cached value without blocking.
func (d *deferError) Error() error {
	if d.err != nil {
		// Note that when we've received a nil error, this
		// won't trigger, but the channel is closed after
		// send so we'll still return nil below.
		return d.err
	}
	if d.errCh == nil {
		panic("waiting for response on nil channel")
	}
	d.err = <-d.errCh
	return d.err
}

// respond delivers the result exactly once. Calls after the first, or
// on an uninitialized future, are silently ignored.
func (d *deferError) respond(err error) {
	if d.errCh == nil {
		return
	}
	if d.responded {
		return
	}
	d.errCh <- err
	close(d.errCh)
	d.responded = true
}
|
||||||
|
|
||||||
|
// logFuture is used to apply a log entry and waits until
// the log is considered committed.
type logFuture struct {
	deferError
	log      Log          // the entry being replicated
	policy   quorumPolicy // commitment rule for this entry
	response interface{}  // FSM result, set once applied
	dispatch time.Time    // dispatch timestamp; presumably for latency tracking — confirm at call sites
}

// Response returns the FSM's result for this entry. Only valid after
// Error has returned.
func (l *logFuture) Response() interface{} {
	return l.response
}

// Index returns the log index assigned to this entry.
func (l *logFuture) Index() uint64 {
	return l.log.Index
}
|
||||||
|
|
||||||
|
// peerFuture is used when applying a peer-set change; it carries the
// new peer list alongside the embedded completion error.
type peerFuture struct {
	deferError
	peers []string
}

// shutdownFuture is used to track the completion of a Raft shutdown.
type shutdownFuture struct {
	raft *Raft
}

// Error blocks until the Raft instance has fully shut down, then closes
// the transport if it supports closing. A nil raft means there is
// nothing to wait for.
func (s *shutdownFuture) Error() error {
	if s.raft == nil {
		return nil
	}
	s.raft.waitShutdown()
	if closeable, ok := s.raft.trans.(WithClose); ok {
		closeable.Close()
	}
	return nil
}
|
||||||
|
|
||||||
|
// snapshotFuture is used for waiting on a snapshot to complete.
type snapshotFuture struct {
	deferError
}

// reqSnapshotFuture is used for requesting a snapshot start.
// It is only used internally.
type reqSnapshotFuture struct {
	deferError

	// snapshot details provided by the FSM runner before responding
	index    uint64
	term     uint64
	peers    []string
	snapshot FSMSnapshot
}

// restoreFuture is used for requesting an FSM to perform a
// snapshot restore. Used internally only.
type restoreFuture struct {
	deferError
	ID string // identifier of the snapshot to restore
}
|
||||||
|
|
||||||
|
// verifyFuture is used to verify the current node is still
// the leader. This is to prevent a stale read.
type verifyFuture struct {
	deferError
	notifyCh   chan *verifyFuture // receives this future once resolved; set to nil after notify
	quorumSize int                // confirmations required to verify leadership
	votes      int                // leader confirmations collected so far
	voteLock   sync.Mutex         // protects votes and notifyCh
}
|
||||||
|
|
||||||
|
// vote is used to respond to a verifyFuture.
|
||||||
|
// This may block when responding on the notifyCh.
|
||||||
|
func (v *verifyFuture) vote(leader bool) {
|
||||||
|
v.voteLock.Lock()
|
||||||
|
defer v.voteLock.Unlock()
|
||||||
|
|
||||||
|
// Guard against having notified already
|
||||||
|
if v.notifyCh == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if leader {
|
||||||
|
v.votes++
|
||||||
|
if v.votes >= v.quorumSize {
|
||||||
|
v.notifyCh <- v
|
||||||
|
v.notifyCh = nil
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
v.notifyCh <- v
|
||||||
|
v.notifyCh = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// appendFuture is used for waiting on a pipelined append
// entries RPC.
type appendFuture struct {
	deferError
	start time.Time              // when the request was dispatched
	args  *AppendEntriesRequest  // the pipelined request
	resp  *AppendEntriesResponse // the peer's reply, set before respond
}

// Start returns the time the request was dispatched.
func (a *appendFuture) Start() time.Time {
	return a.start
}

// Request returns the AppendEntries request being pipelined.
func (a *appendFuture) Request() *AppendEntriesRequest {
	return a.args
}

// Response returns the peer's reply. Only valid after Error returns.
func (a *appendFuture) Response() *AppendEntriesResponse {
	return a.resp
}
|
@ -0,0 +1,42 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestDeferFutureSuccess(t *testing.T) {
|
||||||
|
var f deferError
|
||||||
|
f.init()
|
||||||
|
f.respond(nil)
|
||||||
|
if err := f.Error(); err != nil {
|
||||||
|
t.Fatalf("unexpected error result; got %#v want nil", err)
|
||||||
|
}
|
||||||
|
if err := f.Error(); err != nil {
|
||||||
|
t.Fatalf("unexpected error result; got %#v want nil", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDeferFutureError(t *testing.T) {
|
||||||
|
want := errors.New("x")
|
||||||
|
var f deferError
|
||||||
|
f.init()
|
||||||
|
f.respond(want)
|
||||||
|
if got := f.Error(); got != want {
|
||||||
|
t.Fatalf("unexpected error result; got %#v want %#v", got, want)
|
||||||
|
}
|
||||||
|
if got := f.Error(); got != want {
|
||||||
|
t.Fatalf("unexpected error result; got %#v want %#v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDeferFutureConcurrent(t *testing.T) {
|
||||||
|
// Food for the race detector.
|
||||||
|
want := errors.New("x")
|
||||||
|
var f deferError
|
||||||
|
f.init()
|
||||||
|
go f.respond(want)
|
||||||
|
if got := f.Error(); got != want {
|
||||||
|
t.Errorf("unexpected error result; got %#v want %#v", got, want)
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,213 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"container/list"
|
||||||
|
"sync"
|
||||||
|
)
|
||||||
|
|
||||||
|
// QuorumPolicy allows individual logFutures to have different
// commitment rules while still using the inflight mechanism.
type quorumPolicy interface {
	// Commit records that one peer has confirmed the entry and reports
	// whether that confirmation is enough to satisfy the commitment rules.
	Commit() bool

	// IsCommitted reports whether the entry has already been committed,
	// without recording a new confirmation.
	IsCommitted() bool
}
|
||||||
|
|
||||||
|
// MajorityQuorum is used by Apply transactions and requires
// a simple majority of nodes.
type majorityQuorum struct {
	count       int // confirmations received so far
	votesNeeded int // confirmations required for commitment
}

// newMajorityQuorum builds a policy that commits once a strict majority
// of clusterSize nodes have confirmed.
func newMajorityQuorum(clusterSize int) *majorityQuorum {
	return &majorityQuorum{
		votesNeeded: clusterSize/2 + 1,
	}
}

// Commit records one confirmation and reports whether the quorum has
// now been reached.
func (m *majorityQuorum) Commit() bool {
	m.count++
	return m.IsCommitted()
}

// IsCommitted reports whether enough confirmations have been recorded.
func (m *majorityQuorum) IsCommitted() bool {
	return m.count >= m.votesNeeded
}
|
||||||
|
|
||||||
|
// Inflight is used to track operations that are still in-flight.
type inflight struct {
	sync.Mutex
	committed  *list.List            // entries committed but not yet consumed, in order
	commitCh   chan struct{}         // notified (non-blocking) when entries commit
	minCommit  uint64                // lowest in-flight index; 0 when none
	maxCommit  uint64                // highest in-flight index; 0 when none
	operations map[uint64]*logFuture // in-flight entries keyed by log index
	stopCh     chan struct{}         // closed on Cancel to unblock pending commits
}

// NewInflight returns an inflight struct that notifies
// the provided channel when logs are finished committing.
func newInflight(commitCh chan struct{}) *inflight {
	return &inflight{
		committed:  list.New(),
		commitCh:   commitCh,
		minCommit:  0,
		maxCommit:  0,
		operations: make(map[uint64]*logFuture),
		stopCh:     make(chan struct{}),
	}
}
|
||||||
|
|
||||||
|
// Start is used to mark a logFuture as being inflight. It
// also commits the entry, as it is assumed the leader is
// starting.
func (i *inflight) Start(l *logFuture) {
	i.Lock()
	defer i.Unlock()
	i.start(l)
}

// StartAll is used to mark a list of logFuture's as being
// inflight. It also commits each entry as the leader is
// assumed to be starting.
func (i *inflight) StartAll(logs []*logFuture) {
	i.Lock()
	defer i.Unlock()
	for _, l := range logs {
		i.start(l)
	}
}

// start is used to mark a single entry as inflight,
// must be invoked with the lock held.
func (i *inflight) start(l *logFuture) {
	idx := l.log.Index
	i.operations[idx] = l

	// Track the window of outstanding indexes.
	if idx > i.maxCommit {
		i.maxCommit = idx
	}
	if i.minCommit == 0 {
		i.minCommit = idx
	}
	// Count the leader's own confirmation immediately.
	i.commit(idx)
}
|
||||||
|
|
||||||
|
// Cancel is used to cancel all in-flight operations.
// This is done when the leader steps down, and all futures
// are sent the given error.
func (i *inflight) Cancel(err error) {
	// Close the channel first to unblock any pending commits
	close(i.stopCh)

	// Lock after close to avoid deadlock
	i.Lock()
	defer i.Unlock()

	// Respond to all inflight operations
	for _, op := range i.operations {
		op.respond(err)
	}

	// Clear all the committed but not processed
	for e := i.committed.Front(); e != nil; e = e.Next() {
		e.Value.(*logFuture).respond(err)
	}

	// Clear the map
	i.operations = make(map[uint64]*logFuture)

	// Clear the list of committed
	i.committed = list.New()

	// Close the commmitCh
	// NOTE(review): commitCh is supplied by the caller in newInflight;
	// closing it here assumes this tracker owns it — confirm no other
	// sender uses the channel after Cancel.
	close(i.commitCh)

	// Reset indexes
	i.minCommit = 0
	i.maxCommit = 0
}
|
||||||
|
|
||||||
|
// Committed returns all the committed operations in order.
// The internal list is swapped for a fresh one, so the caller takes
// sole ownership of the returned list.
func (i *inflight) Committed() (l *list.List) {
	i.Lock()
	l, i.committed = i.committed, list.New()
	i.Unlock()
	return l
}

// Commit is used by leader replication routines to indicate that
// a follower was finished committing a log to disk.
func (i *inflight) Commit(index uint64) {
	i.Lock()
	defer i.Unlock()
	i.commit(index)
}
|
||||||
|
|
||||||
|
// CommitRange is used to commit a range of indexes inclusively.
// It is optimized to avoid commits for indexes that are not tracked.
func (i *inflight) CommitRange(minIndex, maxIndex uint64) {
	i.Lock()
	defer i.Unlock()

	// Update the minimum index: indexes below minCommit are no longer
	// tracked, so skip straight past them.
	minIndex = max(i.minCommit, minIndex)

	// Commit each index
	for idx := minIndex; idx <= maxIndex; idx++ {
		i.commit(idx)
	}
}
|
||||||
|
|
||||||
|
// commit is used to commit a single index. Must be called with the lock held.
// It records one confirmation for the entry at index; once the entry's quorum
// policy is satisfied AND the entry is the minimum in-flight index, it (and
// any already-satisfied successors) are moved onto the committed list in
// index order, then the commit channel is notified.
func (i *inflight) commit(index uint64) {
	op, ok := i.operations[index]
	if !ok {
		// Ignore if not in the map, as it may be committed already
		return
	}

	// Check if we've satisfied the commit
	if !op.policy.Commit() {
		return
	}

	// Cannot commit if this is not the minimum inflight. This can happen
	// if the quorum size changes, meaning a previous commit requires a larger
	// quorum than this commit. We MUST block until the previous log is committed,
	// otherwise logs will be applied out of order.
	if index != i.minCommit {
		return
	}

NOTIFY:
	// Add the operation to the committed list
	i.committed.PushBack(op)

	// Stop tracking since it is committed
	delete(i.operations, index)

	// Update the indexes
	if index == i.maxCommit {
		// Window drained; reset so the next start re-seeds both bounds.
		i.minCommit = 0
		i.maxCommit = 0

	} else {
		i.minCommit++
	}

	// Check if the next in-flight operation is ready
	if i.minCommit != 0 {
		op = i.operations[i.minCommit]
		if op.policy.IsCommitted() {
			index = i.minCommit
			goto NOTIFY
		}
	}

	// Async notify of ready operations
	asyncNotifyCh(i.commitCh)
}
|
@ -0,0 +1,150 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestInflight_StartCommit exercises the basic quorum flow: with a
// cluster of 5 (3 votes needed), Start counts one vote, so two more
// Commit calls are required before the entry shows up as committed.
func TestInflight_StartCommit(t *testing.T) {
	commitCh := make(chan struct{}, 1)
	in := newInflight(commitCh)

	// Commit a transaction as being in flight
	l := &logFuture{log: Log{Index: 1}}
	l.policy = newMajorityQuorum(5)
	in.Start(l)

	// Commit 3 times
	in.Commit(1)
	if in.Committed().Len() != 0 {
		t.Fatalf("should not be commited")
	}

	in.Commit(1)
	if in.Committed().Len() != 1 {
		t.Fatalf("should be commited")
	}

	// Already committed but should work anyways
	in.Commit(1)
}
|
||||||
|
|
||||||
|
// TestInflight_Cancel verifies that canceling the tracker resolves all
// outstanding futures with the supplied error.
func TestInflight_Cancel(t *testing.T) {
	commitCh := make(chan struct{}, 1)
	in := newInflight(commitCh)

	// Commit a transaction as being in flight
	l := &logFuture{
		log: Log{Index: 1},
	}
	l.init()
	l.policy = newMajorityQuorum(3)
	in.Start(l)

	// Cancel with an error
	err := fmt.Errorf("error 1")
	in.Cancel(err)

	// Should get an error return
	if l.Error() != err {
		t.Fatalf("expected error")
	}
}
|
||||||
|
|
||||||
|
// TestInflight_StartAll checks the batched StartAll path: three entries
// started together all commit once the overlapping CommitRange calls
// supply enough confirmations per entry.
func TestInflight_StartAll(t *testing.T) {
	commitCh := make(chan struct{}, 1)
	in := newInflight(commitCh)

	// Commit a few transaction as being in flight
	l1 := &logFuture{log: Log{Index: 2}}
	l1.policy = newMajorityQuorum(5)
	l2 := &logFuture{log: Log{Index: 3}}
	l2.policy = newMajorityQuorum(5)
	l3 := &logFuture{log: Log{Index: 4}}
	l3.policy = newMajorityQuorum(5)

	// Start all the entries
	in.StartAll([]*logFuture{l1, l2, l3})

	// Commit ranges
	in.CommitRange(1, 5)
	in.CommitRange(1, 4)
	in.CommitRange(1, 10)

	// Should get 3 back
	if in.Committed().Len() != 3 {
		t.Fatalf("expected all 3 to commit")
	}
}
|
||||||
|
|
||||||
|
// TestInflight_CommitRange mirrors TestInflight_StartAll but starts
// each entry individually; repeated overlapping ranges must still
// commit all three entries exactly once.
func TestInflight_CommitRange(t *testing.T) {
	commitCh := make(chan struct{}, 1)
	in := newInflight(commitCh)

	// Commit a few transaction as being in flight
	l1 := &logFuture{log: Log{Index: 2}}
	l1.policy = newMajorityQuorum(5)
	in.Start(l1)

	l2 := &logFuture{log: Log{Index: 3}}
	l2.policy = newMajorityQuorum(5)
	in.Start(l2)

	l3 := &logFuture{log: Log{Index: 4}}
	l3.policy = newMajorityQuorum(5)
	in.Start(l3)

	// Commit ranges
	in.CommitRange(1, 5)
	in.CommitRange(1, 4)
	in.CommitRange(1, 10)

	// Should get 3 back
	if in.Committed().Len() != 3 {
		t.Fatalf("expected all 3 to commit")
	}
}
|
||||||
|
|
||||||
|
// Should panic if we commit non contiguously!
// NOTE(review): despite the comment above, nothing here actually
// panics — the test verifies ordering: index 3 reaches its quorum first
// but is held back until index 2 commits, then both drain in index
// order. Confirm whether the comment is stale.
func TestInflight_NonContiguous(t *testing.T) {
	commitCh := make(chan struct{}, 1)
	in := newInflight(commitCh)

	// Commit a few transaction as being in flight
	l1 := &logFuture{log: Log{Index: 2}}
	l1.policy = newMajorityQuorum(5)
	in.Start(l1)

	l2 := &logFuture{log: Log{Index: 3}}
	l2.policy = newMajorityQuorum(5)
	in.Start(l2)

	in.Commit(3)
	in.Commit(3)
	in.Commit(3) // panic!

	if in.Committed().Len() != 0 {
		t.Fatalf("should not commit")
	}

	in.Commit(2)
	in.Commit(2)
	in.Commit(2) // panic!

	committed := in.Committed()
	if committed.Len() != 2 {
		t.Fatalf("should commit both")
	}

	// Entries must drain in index order: 2 first, then 3.
	current := committed.Front()
	l := current.Value.(*logFuture)
	if l.log.Index != 2 {
		t.Fatalf("bad: %v", *l)
	}

	current = current.Next()
	l = current.Value.(*logFuture)
	if l.log.Index != 3 {
		t.Fatalf("bad: %v", *l)
	}
}
|
@ -0,0 +1,116 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync"
|
||||||
|
)
|
||||||
|
|
||||||
|
// InmemStore implements the LogStore and StableStore interface.
// It should NOT EVER be used for production. It is used only for
// unit tests. Use the MDBStore implementation instead.
type InmemStore struct {
	l         sync.RWMutex // guards all fields below
	lowIndex  uint64       // lowest stored log index; 0 when empty
	highIndex uint64       // highest stored log index; 0 when empty
	logs      map[uint64]*Log
	kv        map[string][]byte // StableStore byte values
	kvInt     map[string]uint64 // StableStore uint64 values
}
|
||||||
|
|
||||||
|
// NewInmemStore returns a new in-memory backend. Do not ever
// use for production. Only for testing.
// The zero low/high indexes mean the log starts out empty.
func NewInmemStore() *InmemStore {
	i := &InmemStore{
		logs:  make(map[uint64]*Log),
		kv:    make(map[string][]byte),
		kvInt: make(map[string]uint64),
	}
	return i
}
|
||||||
|
|
||||||
|
// FirstIndex implements the LogStore interface.
// Returns 0 when no logs have been stored.
func (i *InmemStore) FirstIndex() (uint64, error) {
	i.l.RLock()
	defer i.l.RUnlock()
	return i.lowIndex, nil
}

// LastIndex implements the LogStore interface.
// Returns 0 when no logs have been stored.
func (i *InmemStore) LastIndex() (uint64, error) {
	i.l.RLock()
	defer i.l.RUnlock()
	return i.highIndex, nil
}
|
||||||
|
|
||||||
|
// GetLog implements the LogStore interface.
// The stored entry is copied into log; ErrLogNotFound is returned for
// an unknown index.
func (i *InmemStore) GetLog(index uint64, log *Log) error {
	i.l.RLock()
	defer i.l.RUnlock()
	l, ok := i.logs[index]
	if !ok {
		return ErrLogNotFound
	}
	*log = *l
	return nil
}
|
||||||
|
|
||||||
|
// StoreLog implements the LogStore interface.
func (i *InmemStore) StoreLog(log *Log) error {
	return i.StoreLogs([]*Log{log})
}

// StoreLogs implements the LogStore interface.
// Entries are indexed by their Index field; lowIndex is seeded by the
// first ever write and highIndex tracks the maximum index seen.
func (i *InmemStore) StoreLogs(logs []*Log) error {
	i.l.Lock()
	defer i.l.Unlock()
	for _, l := range logs {
		i.logs[l.Index] = l
		if i.lowIndex == 0 {
			i.lowIndex = l.Index
		}
		if l.Index > i.highIndex {
			i.highIndex = l.Index
		}
	}
	return nil
}
|
||||||
|
|
||||||
|
// DeleteRange implements the LogStore interface.
// NOTE(review): lowIndex is unconditionally set to max+1, which assumes
// the deleted range is a prefix of the log — confirm callers never
// delete a middle or suffix range, or highIndex/lowIndex can go stale.
func (i *InmemStore) DeleteRange(min, max uint64) error {
	i.l.Lock()
	defer i.l.Unlock()
	for j := min; j <= max; j++ {
		delete(i.logs, j)
	}
	i.lowIndex = max + 1
	return nil
}
|
||||||
|
|
||||||
|
// Set implements the StableStore interface.
func (i *InmemStore) Set(key []byte, val []byte) error {
	i.l.Lock()
	defer i.l.Unlock()
	i.kv[string(key)] = val
	return nil
}

// Get implements the StableStore interface.
// A missing key yields a nil slice and no error.
func (i *InmemStore) Get(key []byte) ([]byte, error) {
	i.l.RLock()
	defer i.l.RUnlock()
	return i.kv[string(key)], nil
}

// SetUint64 implements the StableStore interface.
func (i *InmemStore) SetUint64(key []byte, val uint64) error {
	i.l.Lock()
	defer i.l.Unlock()
	i.kvInt[string(key)] = val
	return nil
}

// GetUint64 implements the StableStore interface.
// A missing key yields 0 and no error.
func (i *InmemStore) GetUint64(key []byte) (uint64, error) {
	i.l.RLock()
	defer i.l.RUnlock()
	return i.kvInt[string(key)], nil
}
|
@ -0,0 +1,324 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NewInmemAddr returns a new in-memory addr with
// a randomly generated UUID as the ID.
func NewInmemAddr() string {
	return generateUUID()
}
|
||||||
|
|
||||||
|
// inmemPipeline is used to pipeline requests for the in-mem transport.
type inmemPipeline struct {
	trans    *InmemTransport
	peer     *InmemTransport
	peerAddr string

	doneCh       chan AppendFuture           // completed futures for the consumer
	inprogressCh chan *inmemPipelineInflight // requests awaiting a response

	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex // guards shutdown / shutdownCh
}
|
||||||
|
|
||||||
|
// inmemPipelineInflight pairs a pipelined append future with the
// channel its response will arrive on.
type inmemPipelineInflight struct {
	future *appendFuture
	respCh <-chan RPCResponse
}
|
||||||
|
|
||||||
|
// InmemTransport Implements the Transport interface, to allow Raft to be
// tested in-memory without going over a network.
type InmemTransport struct {
	sync.RWMutex
	consumerCh chan RPC                   // incoming RPCs for this node
	localAddr  string                     // this transport's address
	peers      map[string]*InmemTransport // connected peers keyed by address
	pipelines  []*inmemPipeline           // open pipelines
	timeout    time.Duration              // per-RPC deadline used by makeRPC
}
|
||||||
|
|
||||||
|
// NewInmemTransport is used to initialize a new transport
// and generates a random local address if none is specified.
// The returned string is the transport's local address.
func NewInmemTransport(addr string) (string, *InmemTransport) {
	if addr == "" {
		addr = NewInmemAddr()
	}
	trans := &InmemTransport{
		consumerCh: make(chan RPC, 16),
		localAddr:  addr,
		peers:      make(map[string]*InmemTransport),
		timeout:    50 * time.Millisecond, // default per-RPC deadline
	}
	return addr, trans
}
|
||||||
|
|
||||||
|
// SetHeartbeatHandler is used to set optional fast-path for
// heartbeats, not supported for this transport. The callback is
// intentionally ignored.
func (i *InmemTransport) SetHeartbeatHandler(cb func(RPC)) {
}

// Consumer implements the Transport interface.
// RPCs addressed to this node are received from the returned channel.
func (i *InmemTransport) Consumer() <-chan RPC {
	return i.consumerCh
}

// LocalAddr implements the Transport interface.
func (i *InmemTransport) LocalAddr() string {
	return i.localAddr
}
|
||||||
|
|
||||||
|
// AppendEntriesPipeline returns an interface that can be used to pipeline
// AppendEntries requests.
func (i *InmemTransport) AppendEntriesPipeline(target string) (AppendPipeline, error) {
	// Look up the peer under the read lock only.
	i.RLock()
	peer, ok := i.peers[target]
	i.RUnlock()
	if !ok {
		return nil, fmt.Errorf("failed to connect to peer: %v", target)
	}
	pipeline := newInmemPipeline(i, peer, target)
	// Track the pipeline so it can be torn down with the transport.
	i.Lock()
	i.pipelines = append(i.pipelines, pipeline)
	i.Unlock()
	return pipeline, nil
}
|
||||||
|
|
||||||
|
// AppendEntries implements the Transport interface.
// The RPC is delivered synchronously to the peer's consumer channel and
// the reply is copied into resp.
func (i *InmemTransport) AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error {
	rpcResp, err := i.makeRPC(target, args, nil, i.timeout)
	if err != nil {
		return err
	}

	// Copy the result back
	out := rpcResp.Response.(*AppendEntriesResponse)
	*resp = *out
	return nil
}
|
||||||
|
|
||||||
|
// RequestVote implements the Transport interface. It issues a
// synchronous RPC to the target and copies the peer's response into resp.
func (i *InmemTransport) RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error {
	rpcResp, err := i.makeRPC(target, args, nil, i.timeout)
	if err != nil {
		return err
	}

	// Copy the result back
	out := rpcResp.Response.(*RequestVoteResponse)
	*resp = *out
	return nil
}
|
||||||
|
|
||||||
|
// InstallSnapshot implements the Transport interface. Snapshot data is
// passed via the reader; a 10x timeout is used since snapshot transfer
// is expected to take much longer than a normal RPC.
func (i *InmemTransport) InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error {
	rpcResp, err := i.makeRPC(target, args, data, 10*i.timeout)
	if err != nil {
		return err
	}

	// Copy the result back
	out := rpcResp.Response.(*InstallSnapshotResponse)
	*resp = *out
	return nil
}
|
||||||
|
|
||||||
|
func (i *InmemTransport) makeRPC(target string, args interface{}, r io.Reader, timeout time.Duration) (rpcResp RPCResponse, err error) {
|
||||||
|
i.RLock()
|
||||||
|
peer, ok := i.peers[target]
|
||||||
|
i.RUnlock()
|
||||||
|
|
||||||
|
if !ok {
|
||||||
|
err = fmt.Errorf("failed to connect to peer: %v", target)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send the RPC over
|
||||||
|
respCh := make(chan RPCResponse)
|
||||||
|
peer.consumerCh <- RPC{
|
||||||
|
Command: args,
|
||||||
|
Reader: r,
|
||||||
|
RespChan: respCh,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for a response
|
||||||
|
select {
|
||||||
|
case rpcResp = <-respCh:
|
||||||
|
if rpcResp.Error != nil {
|
||||||
|
err = rpcResp.Error
|
||||||
|
}
|
||||||
|
case <-time.After(timeout):
|
||||||
|
err = fmt.Errorf("command timed out")
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// EncodePeer implements the Transport interface. It uses the UUID as the
// address directly.
func (i *InmemTransport) EncodePeer(p string) []byte {
	return []byte(p)
}
|
||||||
|
|
||||||
|
// DecodePeer implements the Transport interface. It wraps the UUID in an
// InmemAddr.
func (i *InmemTransport) DecodePeer(buf []byte) string {
	return string(buf)
}
|
||||||
|
|
||||||
|
// Connect is used to connect this transport to another transport for
// a given peer name. This allows for local routing. The supplied
// Transport must be an *InmemTransport (asserted, panics otherwise).
func (i *InmemTransport) Connect(peer string, t Transport) {
	trans := t.(*InmemTransport)
	i.Lock()
	defer i.Unlock()
	i.peers[peer] = trans
}
|
||||||
|
|
||||||
|
// Disconnect is used to remove the ability to route to a given peer.
// Any pipelines established to that peer are closed and removed.
func (i *InmemTransport) Disconnect(peer string) {
	i.Lock()
	defer i.Unlock()
	delete(i.peers, peer)

	// Disconnect any pipelines. Swap-with-last removal: a matching entry
	// is replaced by the final element (which is nil'd out so the slice
	// does not retain it) and idx-- re-examines the swapped-in element.
	n := len(i.pipelines)
	for idx := 0; idx < n; idx++ {
		if i.pipelines[idx].peerAddr == peer {
			i.pipelines[idx].Close()
			i.pipelines[idx], i.pipelines[n-1] = i.pipelines[n-1], nil
			idx--
			n--
		}
	}
	i.pipelines = i.pipelines[:n]
}
|
||||||
|
|
||||||
|
// DisconnectAll is used to remove all routes to peers and close every
// open pipeline.
func (i *InmemTransport) DisconnectAll() {
	i.Lock()
	defer i.Unlock()
	i.peers = make(map[string]*InmemTransport)

	// Handle pipelines
	for _, pipeline := range i.pipelines {
		pipeline.Close()
	}
	i.pipelines = nil
}
|
||||||
|
|
||||||
|
// Close is used to permanently disable the transport. Implemented by
// disconnecting from every peer; always returns nil.
func (i *InmemTransport) Close() error {
	i.DisconnectAll()
	return nil
}
|
||||||
|
|
||||||
|
// newInmemPipeline constructs a pipeline from trans to peer (registered
// under addr) and starts the background goroutine that decodes responses.
// The goroutine exits when the pipeline's shutdownCh is closed.
func newInmemPipeline(trans *InmemTransport, peer *InmemTransport, addr string) *inmemPipeline {
	i := &inmemPipeline{
		trans:        trans,
		peer:         peer,
		peerAddr:     addr,
		doneCh:       make(chan AppendFuture, 16),
		inprogressCh: make(chan *inmemPipelineInflight, 16),
		shutdownCh:   make(chan struct{}),
	}
	go i.decodeResponses()
	return i
}
|
||||||
|
|
||||||
|
// decodeResponses runs in its own goroutine, draining in-flight requests
// in submission order, copying each peer response into the matching
// future, and delivering completed futures to doneCh. Each in-flight
// request gets its own timeout (if the transport has one configured).
// Every blocking point also selects on shutdownCh so Close unblocks it.
func (i *inmemPipeline) decodeResponses() {
	timeout := i.trans.timeout
	for {
		select {
		case inp := <-i.inprogressCh:
			// Only arm a timeout when one is configured; a nil channel
			// never fires, effectively disabling that case.
			var timeoutCh <-chan time.Time
			if timeout > 0 {
				timeoutCh = time.After(timeout)
			}

			select {
			case rpcResp := <-inp.respCh:
				// Copy the result back
				*inp.future.resp = *rpcResp.Response.(*AppendEntriesResponse)
				inp.future.respond(rpcResp.Error)

				select {
				case i.doneCh <- inp.future:
				case <-i.shutdownCh:
					return
				}

			case <-timeoutCh:
				inp.future.respond(fmt.Errorf("command timed out"))
				select {
				case i.doneCh <- inp.future:
				case <-i.shutdownCh:
					return
				}

			case <-i.shutdownCh:
				return
			}
		case <-i.shutdownCh:
			return
		}
	}
}
|
||||||
|
|
||||||
|
// AppendEntries enqueues an AppendEntries RPC on the pipeline and returns
// a future that completes when decodeResponses receives the peer's reply.
// Both the enqueue to the peer and the handoff to the decoder respect the
// transport timeout and pipeline shutdown.
func (i *inmemPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) {
	// Create a new future
	future := &appendFuture{
		start: time.Now(),
		args:  args,
		resp:  resp,
	}
	future.init()

	// Handle a timeout (nil channel never fires when timeout is zero)
	var timeout <-chan time.Time
	if i.trans.timeout > 0 {
		timeout = time.After(i.trans.timeout)
	}

	// Send the RPC over. Buffered so a late response never blocks the peer.
	respCh := make(chan RPCResponse, 1)
	rpc := RPC{
		Command:  args,
		RespChan: respCh,
	}
	select {
	case i.peer.consumerCh <- rpc:
	case <-timeout:
		return nil, fmt.Errorf("command enqueue timeout")
	case <-i.shutdownCh:
		return nil, ErrPipelineShutdown
	}

	// Send to be decoded
	select {
	case i.inprogressCh <- &inmemPipelineInflight{future, respCh}:
		return future, nil
	case <-i.shutdownCh:
		return nil, ErrPipelineShutdown
	}
}
|
||||||
|
|
||||||
|
// Consumer returns the channel on which completed AppendEntries futures
// are delivered, in the order they were submitted.
func (i *inmemPipeline) Consumer() <-chan AppendFuture {
	return i.doneCh
}
|
||||||
|
|
||||||
|
// Close shuts the pipeline down, unblocking the decoder goroutine and
// any in-flight senders. Idempotent: subsequent calls are no-ops.
func (i *inmemPipeline) Close() error {
	i.shutdownLock.Lock()
	defer i.shutdownLock.Unlock()
	if i.shutdown {
		return nil
	}

	i.shutdown = true
	close(i.shutdownCh)
	return nil
}
|
@ -0,0 +1,18 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestInmemTransportImpl is a compile-time-style check that
// InmemTransport satisfies the Transport, LoopbackTransport, and
// WithPeers interfaces.
func TestInmemTransportImpl(t *testing.T) {
	var inm interface{} = &InmemTransport{}
	if _, ok := inm.(Transport); !ok {
		t.Fatalf("InmemTransport is not a Transport")
	}
	if _, ok := inm.(LoopbackTransport); !ok {
		t.Fatalf("InmemTransport is not a Loopback Transport")
	}
	if _, ok := inm.(WithPeers); !ok {
		t.Fatalf("InmemTransport is not a WithPeers Transport")
	}
}
|
@ -0,0 +1,336 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"io/ioutil"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// CheckInteg will skip a test if integration testing is not enabled.
func CheckInteg(t *testing.T) {
	if !IsInteg() {
		t.SkipNow()
	}
}
|
||||||
|
|
||||||
|
// IsInteg returns a boolean telling you if we're in integ testing mode.
|
||||||
|
func IsInteg() bool {
|
||||||
|
return os.Getenv("INTEG_TESTS") != ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// RaftEnv bundles everything needed to run one raft node in an
// integration test: configuration, storage, snapshot store, peer store,
// transport, the Raft instance itself, and a per-node logger.
type RaftEnv struct {
	dir      string // temp directory holding snapshots and peer data
	conf     *Config
	fsm      *MockFSM
	store    *InmemStore // serves as both log store and stable store
	snapshot *FileSnapshotStore
	peers    *JSONPeers
	trans    *NetworkTransport
	raft     *Raft
	logger   *log.Logger
}
|
||||||
|
|
||||||
|
// Release shuts down and cleans up any stored data, its not restartable after this
func (r *RaftEnv) Release() {
	r.Shutdown()
	os.RemoveAll(r.dir)
}
|
||||||
|
|
||||||
|
// Shutdown shuts down raft & transport, but keeps track of its data, its restartable
// after a Shutdown() by calling Start()
func (r *RaftEnv) Shutdown() {
	r.logger.Printf("[WARN] Shutdown node at %v", r.raft.localAddr)
	f := r.raft.Shutdown()
	// Shutdown failing indicates a broken test environment; abort hard.
	if err := f.Error(); err != nil {
		panic(err)
	}
	r.trans.Close()
}
|
||||||
|
|
||||||
|
// Restart will start a raft node that was previously Shutdown(),
// rebinding a fresh TCP transport to the node's previous address and
// reusing its existing stores and snapshot data.
func (r *RaftEnv) Restart(t *testing.T) {
	trans, err := NewTCPTransport(r.raft.localAddr, nil, 2, time.Second, nil)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	r.trans = trans
	r.logger.Printf("[INFO] Starting node at %v", trans.LocalAddr())
	raft, err := NewRaft(r.conf, r.fsm, r.store, r.store, r.snapshot, r.peers, r.trans)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	r.raft = raft
}
|
||||||
|
|
||||||
|
// MakeRaft builds a fully wired raft node for integration testing: a
// temp directory, in-memory log/stable store, file snapshot store, JSON
// peer store, and a real TCP transport bound to a random localhost port.
// A nil conf falls back to the shared in-memory test config.
func MakeRaft(t *testing.T, conf *Config) *RaftEnv {
	// Set the config
	if conf == nil {
		conf = inmemConfig(t)
	}

	dir, err := ioutil.TempDir("", "raft")
	if err != nil {
		t.Fatalf("err: %v ", err)
	}

	stable := NewInmemStore()

	snap, err := NewFileSnapshotStore(dir, 3, nil)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	env := &RaftEnv{
		conf:     conf,
		dir:      dir,
		store:    stable,
		snapshot: snap,
		fsm:      &MockFSM{},
	}

	trans, err := NewTCPTransport("127.0.0.1:0", nil, 2, time.Second, nil)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	// Prefix log lines with the node's address so interleaved output
	// from multiple nodes stays readable.
	env.logger = log.New(os.Stdout, trans.LocalAddr()+" :", log.Lmicroseconds)
	env.trans = trans

	env.peers = NewJSONPeers(dir, trans)

	env.logger.Printf("[INFO] Starting node at %v", trans.LocalAddr())
	conf.Logger = env.logger
	// The InmemStore serves as both the log store and the stable store.
	raft, err := NewRaft(conf, env.fsm, stable, stable, snap, env.peers, trans)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	env.raft = raft
	return env
}
|
||||||
|
|
||||||
|
func WaitFor(env *RaftEnv, state RaftState) error {
|
||||||
|
limit := time.Now().Add(200 * time.Millisecond)
|
||||||
|
for env.raft.State() != state {
|
||||||
|
if time.Now().Before(limit) {
|
||||||
|
time.Sleep(10 * time.Millisecond)
|
||||||
|
} else {
|
||||||
|
return fmt.Errorf("failed to transition to state %v", state)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func WaitForAny(state RaftState, envs []*RaftEnv) (*RaftEnv, error) {
|
||||||
|
limit := time.Now().Add(200 * time.Millisecond)
|
||||||
|
CHECK:
|
||||||
|
for _, env := range envs {
|
||||||
|
if env.raft.State() == state {
|
||||||
|
return env, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if time.Now().Before(limit) {
|
||||||
|
goto WAIT
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("failed to find node in %v state", state)
|
||||||
|
WAIT:
|
||||||
|
time.Sleep(10 * time.Millisecond)
|
||||||
|
goto CHECK
|
||||||
|
}
|
||||||
|
|
||||||
|
// WaitFuture blocks on the future's result, panicking (to abort the test
// with a stack trace) if it does not complete within 200ms.
func WaitFuture(f Future, t *testing.T) error {
	timer := time.AfterFunc(200*time.Millisecond, func() {
		panic(fmt.Errorf("timeout waiting for future %v", f))
	})
	defer timer.Stop()
	return f.Error()
}
|
||||||
|
|
||||||
|
// NoErr fails the test immediately if err is non-nil.
func NoErr(err error, t *testing.T) {
	if err != nil {
		t.Fatalf("err: %v", err)
	}
}
|
||||||
|
|
||||||
|
func CheckConsistent(envs []*RaftEnv, t *testing.T) {
|
||||||
|
limit := time.Now().Add(400 * time.Millisecond)
|
||||||
|
first := envs[0]
|
||||||
|
first.fsm.Lock()
|
||||||
|
defer first.fsm.Unlock()
|
||||||
|
var err error
|
||||||
|
CHECK:
|
||||||
|
l1 := len(first.fsm.logs)
|
||||||
|
for i := 1; i < len(envs); i++ {
|
||||||
|
env := envs[i]
|
||||||
|
env.fsm.Lock()
|
||||||
|
l2 := len(env.fsm.logs)
|
||||||
|
if l1 != l2 {
|
||||||
|
err = fmt.Errorf("log length mismatch %d %d", l1, l2)
|
||||||
|
env.fsm.Unlock()
|
||||||
|
goto ERR
|
||||||
|
}
|
||||||
|
for idx, log := range first.fsm.logs {
|
||||||
|
other := env.fsm.logs[idx]
|
||||||
|
if bytes.Compare(log, other) != 0 {
|
||||||
|
err = fmt.Errorf("log entry %d mismatch between %s/%s : '%s' / '%s'", idx, first.raft.localAddr, env.raft.localAddr, log, other)
|
||||||
|
env.fsm.Unlock()
|
||||||
|
goto ERR
|
||||||
|
}
|
||||||
|
}
|
||||||
|
env.fsm.Unlock()
|
||||||
|
}
|
||||||
|
return
|
||||||
|
ERR:
|
||||||
|
if time.Now().After(limit) {
|
||||||
|
t.Fatalf("%v", err)
|
||||||
|
}
|
||||||
|
first.fsm.Unlock()
|
||||||
|
time.Sleep(20 * time.Millisecond)
|
||||||
|
first.fsm.Lock()
|
||||||
|
goto CHECK
|
||||||
|
}
|
||||||
|
|
||||||
|
// logBytes returns a log entry at least sz bytes long carrying the
// prefix "test i ", padded out with 'x' bytes as needed.
func logBytes(i, sz int) []byte {
	var b bytes.Buffer
	fmt.Fprintf(&b, "test %d ", i)
	if pad := sz - b.Len(); pad > 0 {
		b.Write(bytes.Repeat([]byte{'x'}, pad))
	}
	return b.Bytes()
}
|
||||||
|
|
||||||
|
// Tests Raft by creating a cluster, growing it to 5 nodes while
// causing various stressful conditions: snapshots, a follower falling
// behind a compacted log, node removals, and leader death.
func TestRaft_Integ(t *testing.T) {
	CheckInteg(t)
	conf := DefaultConfig()
	conf.HeartbeatTimeout = 50 * time.Millisecond
	conf.ElectionTimeout = 50 * time.Millisecond
	conf.LeaderLeaseTimeout = 50 * time.Millisecond
	conf.CommitTimeout = 5 * time.Millisecond
	conf.SnapshotThreshold = 100
	conf.TrailingLogs = 10
	conf.EnableSingleNode = true

	// Create a single node
	env1 := MakeRaft(t, conf)
	NoErr(WaitFor(env1, Leader), t)

	// totalApplied tracks how many entries every FSM should end up with.
	totalApplied := 0
	applyAndWait := func(leader *RaftEnv, n int, sz int) {
		// Do some commits
		var futures []ApplyFuture
		for i := 0; i < n; i++ {
			futures = append(futures, leader.raft.Apply(logBytes(i, sz), 0))
		}
		for _, f := range futures {
			NoErr(WaitFuture(f, t), t)
			leader.logger.Printf("[DEBUG] Applied at %d, size %d", f.Index(), sz)
		}
		totalApplied += n
	}
	// Do some commits
	applyAndWait(env1, 100, 10)

	// Do a snapshot
	NoErr(WaitFuture(env1.raft.Snapshot(), t), t)

	// Join a few nodes!
	var envs []*RaftEnv
	for i := 0; i < 4; i++ {
		env := MakeRaft(t, conf)
		addr := env.trans.LocalAddr()
		NoErr(WaitFuture(env1.raft.AddPeer(addr), t), t)
		envs = append(envs, env)
	}

	// Wait for a leader
	leader, err := WaitForAny(Leader, append([]*RaftEnv{env1}, envs...))
	NoErr(err, t)

	// Do some more commits
	applyAndWait(leader, 100, 10)

	// snapshot the leader
	NoErr(WaitFuture(leader.raft.Snapshot(), t), t)

	CheckConsistent(append([]*RaftEnv{env1}, envs...), t)

	// shutdown a follower
	disconnected := envs[len(envs)-1]
	disconnected.Shutdown()

	// Do some more commits [make sure the resulting snapshot will be a reasonable size]
	applyAndWait(leader, 100, 10000)

	// snapshot the leader [leaders log should be compacted past the disconnected follower log now]
	NoErr(WaitFuture(leader.raft.Snapshot(), t), t)

	// Unfortunately we need to wait for the leader to start backing off RPCs to the down follower
	// such that when the follower comes back up it'll run an election before it gets an rpc from
	// the leader
	time.Sleep(time.Second * 5)

	// start the now out of date follower back up
	disconnected.Restart(t)

	// wait for it to get caught up
	timeout := time.Now().Add(time.Second * 10)
	for disconnected.raft.getLastApplied() < leader.raft.getLastApplied() {
		time.Sleep(time.Millisecond)
		if time.Now().After(timeout) {
			t.Fatalf("Gave up waiting for follower to get caught up to leader")
		}
	}

	CheckConsistent(append([]*RaftEnv{env1}, envs...), t)

	// Shoot two nodes in the head!
	rm1, rm2 := envs[0], envs[1]
	rm1.Release()
	rm2.Release()
	envs = envs[2:]
	time.Sleep(10 * time.Millisecond)

	// Wait for a leader
	leader, err = WaitForAny(Leader, append([]*RaftEnv{env1}, envs...))
	NoErr(err, t)

	// Do some more commits
	applyAndWait(leader, 100, 10)

	// Join a few new nodes!
	for i := 0; i < 2; i++ {
		env := MakeRaft(t, conf)
		addr := env.trans.LocalAddr()
		NoErr(WaitFuture(leader.raft.AddPeer(addr), t), t)
		envs = append(envs, env)
	}

	// Remove the old nodes
	NoErr(WaitFuture(leader.raft.RemovePeer(rm1.raft.localAddr), t), t)
	NoErr(WaitFuture(leader.raft.RemovePeer(rm2.raft.localAddr), t), t)

	// Shoot the leader
	env1.Release()
	time.Sleep(3 * conf.HeartbeatTimeout)

	// Wait for a leader
	leader, err = WaitForAny(Leader, envs)
	NoErr(err, t)

	allEnvs := append([]*RaftEnv{env1}, envs...)
	CheckConsistent(allEnvs, t)

	if len(env1.fsm.logs) != totalApplied {
		t.Fatalf("should apply %d logs! %d", totalApplied, len(env1.fsm.logs))
	}

	for _, e := range envs {
		e.Release()
	}
}
|
@ -0,0 +1,67 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
// LogType describes various types of log entries.
type LogType uint8

const (
	// LogCommand is applied to a user FSM.
	LogCommand LogType = iota

	// LogNoop is used to assert leadership.
	LogNoop

	// LogAddPeer is used to add a new peer.
	LogAddPeer

	// LogRemovePeer is used to remove an existing peer.
	LogRemovePeer

	// LogBarrier is used to ensure all preceding operations have been
	// applied to the FSM. It is similar to LogNoop, but instead of returning
	// once committed, it only returns once the FSM manager acks it. Otherwise
	// it is possible there are operations committed but not yet applied to
	// the FSM.
	LogBarrier
)
|
||||||
|
|
||||||
|
// Log entries are replicated to all members of the Raft cluster
// and form the heart of the replicated state machine.
type Log struct {
	// Index holds the index of the log entry.
	Index uint64

	// Term holds the election term of the log entry.
	Term uint64

	// Type holds the type of the log entry.
	Type LogType

	// Data holds the log entry's type-specific data.
	Data []byte

	// peer is not exported since it is not transmitted, only used
	// internally to construct the Data field.
	peer string
}
|
||||||
|
|
||||||
|
// LogStore is used to provide an interface for storing
// and retrieving logs in a durable fashion.
type LogStore interface {
	// FirstIndex returns the first index written. 0 for no entries.
	FirstIndex() (uint64, error)

	// LastIndex returns the last index written. 0 for no entries.
	LastIndex() (uint64, error)

	// GetLog gets a log entry at a given index.
	GetLog(index uint64, log *Log) error

	// StoreLog stores a log entry.
	StoreLog(log *Log) error

	// StoreLogs stores multiple log entries.
	StoreLogs(logs []*Log) error

	// DeleteRange deletes a range of log entries. The range is inclusive.
	DeleteRange(min, max uint64) error
}
|
@ -0,0 +1,79 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"sync"
|
||||||
|
)
|
||||||
|
|
||||||
|
// LogCache wraps any LogStore implementation to provide an
// in-memory ring buffer. This is used to cache access to
// the recently written entries. For implementations that do not
// cache themselves, this can provide a substantial boost by
// avoiding disk I/O on recent entries.
type LogCache struct {
	store LogStore // the durable backing store; all writes pass through

	cache []*Log // ring buffer indexed by log Index modulo capacity
	l     sync.RWMutex
}
|
||||||
|
|
||||||
|
// NewLogCache is used to create a new LogCache with the
// given capacity and backend store. Returns an error if capacity
// is not positive.
func NewLogCache(capacity int, store LogStore) (*LogCache, error) {
	if capacity <= 0 {
		return nil, fmt.Errorf("capacity must be positive")
	}
	c := &LogCache{
		store: store,
		cache: make([]*Log, capacity),
	}
	return c, nil
}
|
||||||
|
|
||||||
|
// GetLog fetches the entry at idx, serving it from the ring buffer when
// the cached slot actually holds that index, and falling back to the
// backing store otherwise.
func (c *LogCache) GetLog(idx uint64, log *Log) error {
	// Check the buffer for an entry
	c.l.RLock()
	cached := c.cache[idx%uint64(len(c.cache))]
	c.l.RUnlock()

	// Check if entry is valid: the slot may hold a different index that
	// hashed to the same position, so verify before using it.
	if cached != nil && cached.Index == idx {
		*log = *cached
		return nil
	}

	// Forward request on cache miss
	return c.store.GetLog(idx, log)
}
|
||||||
|
|
||||||
|
// StoreLog stores a single entry; delegates to StoreLogs.
func (c *LogCache) StoreLog(log *Log) error {
	return c.StoreLogs([]*Log{log})
}
|
||||||
|
|
||||||
|
// StoreLogs caches the entries in the ring buffer and then writes them
// through to the backing store.
func (c *LogCache) StoreLogs(logs []*Log) error {
	// Insert the logs into the ring buffer
	c.l.Lock()
	for _, l := range logs {
		c.cache[l.Index%uint64(len(c.cache))] = l
	}
	c.l.Unlock()

	return c.store.StoreLogs(logs)
}
|
||||||
|
|
||||||
|
// FirstIndex delegates to the backing store; the cache holds no
// authoritative index bounds.
func (c *LogCache) FirstIndex() (uint64, error) {
	return c.store.FirstIndex()
}
|
||||||
|
|
||||||
|
// LastIndex delegates to the backing store; the cache holds no
// authoritative index bounds.
func (c *LogCache) LastIndex() (uint64, error) {
	return c.store.LastIndex()
}
|
||||||
|
|
||||||
|
// DeleteRange drops the entire cache (cheaper than computing which
// slots fall in [min, max]) and then deletes the range in the store.
func (c *LogCache) DeleteRange(min, max uint64) error {
	// Invalidate the cache on deletes
	c.l.Lock()
	c.cache = make([]*Log, len(c.cache))
	c.l.Unlock()

	return c.store.DeleteRange(min, max)
}
|
@ -0,0 +1,88 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestLogCache exercises the LogCache against an InmemStore: index
// passthrough, miss fallback, write-through, and cache invalidation on
// DeleteRange.
func TestLogCache(t *testing.T) {
	store := NewInmemStore()
	c, _ := NewLogCache(16, store)

	// Insert into the in-mem store (bypassing the cache, so reads of
	// these entries are guaranteed cache misses).
	for i := 0; i < 32; i++ {
		log := &Log{Index: uint64(i) + 1}
		store.StoreLog(log)
	}

	// Check the indexes
	if idx, _ := c.FirstIndex(); idx != 1 {
		t.Fatalf("bad: %d", idx)
	}
	if idx, _ := c.LastIndex(); idx != 32 {
		t.Fatalf("bad: %d", idx)
	}

	// Try get log with a miss
	var out Log
	err := c.GetLog(1, &out)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out.Index != 1 {
		t.Fatalf("bad: %#v", out)
	}

	// Store logs
	l1 := &Log{Index: 33}
	l2 := &Log{Index: 34}
	err = c.StoreLogs([]*Log{l1, l2})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	if idx, _ := c.LastIndex(); idx != 34 {
		t.Fatalf("bad: %d", idx)
	}

	// Check that it wrote-through
	err = store.GetLog(33, &out)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	err = store.GetLog(34, &out)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Delete in the backend
	err = store.DeleteRange(33, 34)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should be in the ring buffer
	err = c.GetLog(33, &out)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	err = c.GetLog(34, &out)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Purge the ring buffer
	err = c.DeleteRange(33, 34)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should not be in the ring buffer
	err = c.GetLog(33, &out)
	if err != ErrLogNotFound {
		t.Fatalf("err: %v", err)
	}
	err = c.GetLog(34, &out)
	if err != ErrLogNotFound {
		t.Fatalf("err: %v", err)
	}
}
|
@ -0,0 +1,622 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log"
|
||||||
|
"net"
|
||||||
|
"os"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/hashicorp/go-msgpack/codec"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// RPC message-type bytes framing each request on the wire.
	rpcAppendEntries uint8 = iota
	rpcRequestVote
	rpcInstallSnapshot

	// DefaultTimeoutScale is the default TimeoutScale in a NetworkTransport.
	DefaultTimeoutScale = 256 * 1024 // 256KB

	// rpcMaxPipeline controls the maximum number of outstanding
	// AppendEntries RPC calls.
	rpcMaxPipeline = 128
)
|
||||||
|
|
||||||
|
var (
	// ErrTransportShutdown is returned when operations on a transport are
	// invoked after it's been terminated.
	ErrTransportShutdown = errors.New("transport shutdown")

	// ErrPipelineShutdown is returned when the pipeline is closed.
	ErrPipelineShutdown = errors.New("append pipeline closed")
)
|
||||||
|
|
||||||
|
/*

NetworkTransport provides a network based transport that can be
used to communicate with Raft on remote machines. It requires
an underlying stream layer to provide a stream abstraction, which can
be simple TCP, TLS, etc.

This transport is very simple and lightweight. Each RPC request is
framed by sending a byte that indicates the message type, followed
by the MsgPack encoded request.

The response is an error string followed by the response object,
both are encoded using MsgPack.

InstallSnapshot is special, in that after the RPC request we stream
the entire state. That socket is not re-used as the connection state
is not known if there is an error.

*/
type NetworkTransport struct {
	// connPool caches idle outbound connections per target address.
	connPool     map[string][]*netConn
	connPoolLock sync.Mutex

	// consumeCh delivers inbound RPCs to the consumer.
	consumeCh chan RPC

	// heartbeatFn, when set, is the fast-path handler for heartbeats.
	heartbeatFn     func(RPC)
	heartbeatFnLock sync.Mutex

	logger *log.Logger

	// maxPool bounds the number of pooled connections per target.
	maxPool int

	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex

	stream StreamLayer

	timeout time.Duration
	// TimeoutScale divides the snapshot size to scale the I/O deadline
	// for InstallSnapshot (see NewNetworkTransport).
	TimeoutScale int
}
|
||||||
|
|
||||||
|
// StreamLayer is used with the NetworkTransport to provide
// the low level stream abstraction.
type StreamLayer interface {
	net.Listener

	// Dial is used to create a new outgoing connection
	Dial(address string, timeout time.Duration) (net.Conn, error)
}
|
||||||
|
|
||||||
|
// netConn bundles a raw connection to a target with its buffered
// reader/writer and MsgPack encoder/decoder.
type netConn struct {
	target string
	conn   net.Conn
	r      *bufio.Reader
	w      *bufio.Writer
	dec    *codec.Decoder
	enc    *codec.Encoder
}
|
||||||
|
|
||||||
|
// Release closes the underlying connection.
func (n *netConn) Release() error {
	return n.conn.Close()
}
|
||||||
|
|
||||||
|
// netPipeline pipelines AppendEntries RPCs over a single dedicated
// connection.
type netPipeline struct {
	conn  *netConn
	trans *NetworkTransport

	doneCh       chan AppendFuture  // completed futures, in order
	inprogressCh chan *appendFuture // requests awaiting a response

	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex
}
|
||||||
|
|
||||||
|
// NewNetworkTransport creates a new network transport with the given dialer
// and listener. The maxPool controls how many connections we will pool. The
// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply
// the timeout by (SnapshotSize / TimeoutScale).
func NewNetworkTransport(
	stream StreamLayer,
	maxPool int,
	timeout time.Duration,
	logOutput io.Writer,
) *NetworkTransport {
	if logOutput == nil {
		logOutput = os.Stderr
	}
	return NewNetworkTransportWithLogger(stream, maxPool, timeout, log.New(logOutput, "", log.LstdFlags))
}
|
||||||
|
|
||||||
|
// NewNetworkTransportWithLogger creates a new network transport with the given dialer
// and listener. The maxPool controls how many connections we will pool. The
// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply
// the timeout by (SnapshotSize / TimeoutScale). A nil logger defaults
// to stderr. The accept loop is started before returning.
func NewNetworkTransportWithLogger(
	stream StreamLayer,
	maxPool int,
	timeout time.Duration,
	logger *log.Logger,
) *NetworkTransport {
	if logger == nil {
		logger = log.New(os.Stderr, "", log.LstdFlags)
	}
	trans := &NetworkTransport{
		connPool:     make(map[string][]*netConn),
		consumeCh:    make(chan RPC),
		logger:       logger,
		maxPool:      maxPool,
		shutdownCh:   make(chan struct{}),
		stream:       stream,
		timeout:      timeout,
		TimeoutScale: DefaultTimeoutScale,
	}
	go trans.listen()
	return trans
}
|
||||||
|
|
||||||
|
// SetHeartbeatHandler is used to setup a heartbeat handler
|
||||||
|
// as a fast-pass. This is to avoid head-of-line blocking from
|
||||||
|
// disk IO.
|
||||||
|
func (n *NetworkTransport) SetHeartbeatHandler(cb func(rpc RPC)) {
|
||||||
|
n.heartbeatFnLock.Lock()
|
||||||
|
defer n.heartbeatFnLock.Unlock()
|
||||||
|
n.heartbeatFn = cb
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close is used to stop the network transport.
|
||||||
|
func (n *NetworkTransport) Close() error {
|
||||||
|
n.shutdownLock.Lock()
|
||||||
|
defer n.shutdownLock.Unlock()
|
||||||
|
|
||||||
|
if !n.shutdown {
|
||||||
|
close(n.shutdownCh)
|
||||||
|
n.stream.Close()
|
||||||
|
n.shutdown = true
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Consumer implements the Transport interface.
|
||||||
|
func (n *NetworkTransport) Consumer() <-chan RPC {
|
||||||
|
return n.consumeCh
|
||||||
|
}
|
||||||
|
|
||||||
|
// LocalAddr implements the Transport interface.
|
||||||
|
func (n *NetworkTransport) LocalAddr() string {
|
||||||
|
return n.stream.Addr().String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsShutdown is used to check if the transport is shutdown.
|
||||||
|
func (n *NetworkTransport) IsShutdown() bool {
|
||||||
|
select {
|
||||||
|
case <-n.shutdownCh:
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// getExistingConn is used to grab a pooled connection.
|
||||||
|
func (n *NetworkTransport) getPooledConn(target string) *netConn {
|
||||||
|
n.connPoolLock.Lock()
|
||||||
|
defer n.connPoolLock.Unlock()
|
||||||
|
|
||||||
|
conns, ok := n.connPool[target]
|
||||||
|
if !ok || len(conns) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var conn *netConn
|
||||||
|
num := len(conns)
|
||||||
|
conn, conns[num-1] = conns[num-1], nil
|
||||||
|
n.connPool[target] = conns[:num-1]
|
||||||
|
return conn
|
||||||
|
}
|
||||||
|
|
||||||
|
// getConn is used to get a connection from the pool.
|
||||||
|
func (n *NetworkTransport) getConn(target string) (*netConn, error) {
|
||||||
|
// Check for a pooled conn
|
||||||
|
if conn := n.getPooledConn(target); conn != nil {
|
||||||
|
return conn, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dial a new connection
|
||||||
|
conn, err := n.stream.Dial(target, n.timeout)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wrap the conn
|
||||||
|
netConn := &netConn{
|
||||||
|
target: target,
|
||||||
|
conn: conn,
|
||||||
|
r: bufio.NewReader(conn),
|
||||||
|
w: bufio.NewWriter(conn),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Setup encoder/decoders
|
||||||
|
netConn.dec = codec.NewDecoder(netConn.r, &codec.MsgpackHandle{})
|
||||||
|
netConn.enc = codec.NewEncoder(netConn.w, &codec.MsgpackHandle{})
|
||||||
|
|
||||||
|
// Done
|
||||||
|
return netConn, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// returnConn returns a connection back to the pool.
|
||||||
|
func (n *NetworkTransport) returnConn(conn *netConn) {
|
||||||
|
n.connPoolLock.Lock()
|
||||||
|
defer n.connPoolLock.Unlock()
|
||||||
|
|
||||||
|
key := conn.target
|
||||||
|
conns, _ := n.connPool[key]
|
||||||
|
|
||||||
|
if !n.IsShutdown() && len(conns) < n.maxPool {
|
||||||
|
n.connPool[key] = append(conns, conn)
|
||||||
|
} else {
|
||||||
|
conn.Release()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// AppendEntriesPipeline returns an interface that can be used to pipeline
|
||||||
|
// AppendEntries requests.
|
||||||
|
func (n *NetworkTransport) AppendEntriesPipeline(target string) (AppendPipeline, error) {
|
||||||
|
// Get a connection
|
||||||
|
conn, err := n.getConn(target)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the pipeline
|
||||||
|
return newNetPipeline(n, conn), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// AppendEntries implements the Transport interface.
|
||||||
|
func (n *NetworkTransport) AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error {
|
||||||
|
return n.genericRPC(target, rpcAppendEntries, args, resp)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RequestVote implements the Transport interface.
|
||||||
|
func (n *NetworkTransport) RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error {
|
||||||
|
return n.genericRPC(target, rpcRequestVote, args, resp)
|
||||||
|
}
|
||||||
|
|
||||||
|
// genericRPC handles a simple request/response RPC.
|
||||||
|
func (n *NetworkTransport) genericRPC(target string, rpcType uint8, args interface{}, resp interface{}) error {
|
||||||
|
// Get a conn
|
||||||
|
conn, err := n.getConn(target)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set a deadline
|
||||||
|
if n.timeout > 0 {
|
||||||
|
conn.conn.SetDeadline(time.Now().Add(n.timeout))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send the RPC
|
||||||
|
if err = sendRPC(conn, rpcType, args); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode the response
|
||||||
|
canReturn, err := decodeResponse(conn, resp)
|
||||||
|
if canReturn {
|
||||||
|
n.returnConn(conn)
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// InstallSnapshot implements the Transport interface. The request header is
// sent first, followed by the raw snapshot bytes streamed from data. The
// connection is never pooled afterwards because the stream position cannot
// be trusted once bulk data has been transferred.
func (n *NetworkTransport) InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error {
	// Get a conn, always close for InstallSnapshot
	conn, err := n.getConn(target)
	if err != nil {
		return err
	}
	defer conn.Release()

	// Set a deadline, scaled by request size so large snapshots get
	// proportionally more time; never shorter than the base timeout.
	if n.timeout > 0 {
		timeout := n.timeout * time.Duration(args.Size/int64(n.TimeoutScale))
		if timeout < n.timeout {
			timeout = n.timeout
		}
		conn.conn.SetDeadline(time.Now().Add(timeout))
	}

	// Send the RPC header (type byte + encoded request).
	if err = sendRPC(conn, rpcInstallSnapshot, args); err != nil {
		return err
	}

	// Stream the state; the receiver reads exactly args.Size bytes.
	if _, err = io.Copy(conn.w, data); err != nil {
		return err
	}

	// Flush the buffered writer so the peer sees everything.
	if err = conn.w.Flush(); err != nil {
		return err
	}

	// Decode the response, do not return conn (released by the defer).
	_, err = decodeResponse(conn, resp)
	return err
}
|
||||||
|
|
||||||
|
// EncodePeer implements the Transport interface.
|
||||||
|
func (n *NetworkTransport) EncodePeer(p string) []byte {
|
||||||
|
return []byte(p)
|
||||||
|
}
|
||||||
|
|
||||||
|
// DecodePeer implements the Transport interface.
|
||||||
|
func (n *NetworkTransport) DecodePeer(buf []byte) string {
|
||||||
|
return string(buf)
|
||||||
|
}
|
||||||
|
|
||||||
|
// listen is used to handling incoming connections.
|
||||||
|
func (n *NetworkTransport) listen() {
|
||||||
|
for {
|
||||||
|
// Accept incoming connections
|
||||||
|
conn, err := n.stream.Accept()
|
||||||
|
if err != nil {
|
||||||
|
if n.IsShutdown() {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
n.logger.Printf("[ERR] raft-net: Failed to accept connection: %v", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
n.logger.Printf("[DEBUG] raft-net: %v accepted connection from: %v", n.LocalAddr(), conn.RemoteAddr())
|
||||||
|
|
||||||
|
// Handle the connection in dedicated routine
|
||||||
|
go n.handleConn(conn)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleConn is used to handle an inbound connection for its lifespan.
|
||||||
|
func (n *NetworkTransport) handleConn(conn net.Conn) {
|
||||||
|
defer conn.Close()
|
||||||
|
r := bufio.NewReader(conn)
|
||||||
|
w := bufio.NewWriter(conn)
|
||||||
|
dec := codec.NewDecoder(r, &codec.MsgpackHandle{})
|
||||||
|
enc := codec.NewEncoder(w, &codec.MsgpackHandle{})
|
||||||
|
|
||||||
|
for {
|
||||||
|
if err := n.handleCommand(r, dec, enc); err != nil {
|
||||||
|
if err != io.EOF {
|
||||||
|
n.logger.Printf("[ERR] raft-net: Failed to decode incoming command: %v", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := w.Flush(); err != nil {
|
||||||
|
n.logger.Printf("[ERR] raft-net: Failed to flush response: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleCommand is used to decode and dispatch a single command. The wire
// format is a one-byte RPC type followed by a msgpack-encoded request; the
// response is an error string followed by the encoded response body.
func (n *NetworkTransport) handleCommand(r *bufio.Reader, dec *codec.Decoder, enc *codec.Encoder) error {
	// Get the rpc type
	rpcType, err := r.ReadByte()
	if err != nil {
		return err
	}

	// Create the RPC object. respCh is buffered so the responder never
	// blocks even if we bail out on shutdown.
	respCh := make(chan RPCResponse, 1)
	rpc := RPC{
		RespChan: respCh,
	}

	// Decode the command
	isHeartbeat := false
	switch rpcType {
	case rpcAppendEntries:
		var req AppendEntriesRequest
		if err := dec.Decode(&req); err != nil {
			return err
		}
		rpc.Command = &req

		// Check if this is a heartbeat: a valid term/leader with no log
		// entries and zeroed log/commit indexes.
		if req.Term != 0 && req.Leader != nil &&
			req.PrevLogEntry == 0 && req.PrevLogTerm == 0 &&
			len(req.Entries) == 0 && req.LeaderCommitIndex == 0 {
			isHeartbeat = true
		}

	case rpcRequestVote:
		var req RequestVoteRequest
		if err := dec.Decode(&req); err != nil {
			return err
		}
		rpc.Command = &req

	case rpcInstallSnapshot:
		var req InstallSnapshotRequest
		if err := dec.Decode(&req); err != nil {
			return err
		}
		rpc.Command = &req
		// The snapshot payload follows the header on the same stream;
		// expose exactly req.Size bytes of it to the consumer.
		rpc.Reader = io.LimitReader(r, req.Size)

	default:
		return fmt.Errorf("unknown rpc type %d", rpcType)
	}

	// Check for heartbeat fast-path: invoke the handler inline and skip
	// the consume channel entirely to avoid head-of-line blocking.
	if isHeartbeat {
		n.heartbeatFnLock.Lock()
		fn := n.heartbeatFn
		n.heartbeatFnLock.Unlock()
		if fn != nil {
			fn(rpc)
			goto RESP
		}
	}

	// Dispatch the RPC to the consumer, unless we are shutting down.
	select {
	case n.consumeCh <- rpc:
	case <-n.shutdownCh:
		return ErrTransportShutdown
	}

	// Wait for response
RESP:
	select {
	case resp := <-respCh:
		// Send the error first (empty string means success).
		respErr := ""
		if resp.Error != nil {
			respErr = resp.Error.Error()
		}
		if err := enc.Encode(respErr); err != nil {
			return err
		}

		// Send the response body.
		if err := enc.Encode(resp.Response); err != nil {
			return err
		}
	case <-n.shutdownCh:
		return ErrTransportShutdown
	}
	return nil
}
|
||||||
|
|
||||||
|
// decodeResponse is used to decode an RPC response and reports whether
|
||||||
|
// the connection can be reused.
|
||||||
|
func decodeResponse(conn *netConn, resp interface{}) (bool, error) {
|
||||||
|
// Decode the error if any
|
||||||
|
var rpcError string
|
||||||
|
if err := conn.dec.Decode(&rpcError); err != nil {
|
||||||
|
conn.Release()
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode the response
|
||||||
|
if err := conn.dec.Decode(resp); err != nil {
|
||||||
|
conn.Release()
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Format an error if any
|
||||||
|
if rpcError != "" {
|
||||||
|
return true, fmt.Errorf(rpcError)
|
||||||
|
}
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// sendRPC is used to encode and send the RPC.
|
||||||
|
func sendRPC(conn *netConn, rpcType uint8, args interface{}) error {
|
||||||
|
// Write the request type
|
||||||
|
if err := conn.w.WriteByte(rpcType); err != nil {
|
||||||
|
conn.Release()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send the request
|
||||||
|
if err := conn.enc.Encode(args); err != nil {
|
||||||
|
conn.Release()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Flush
|
||||||
|
if err := conn.w.Flush(); err != nil {
|
||||||
|
conn.Release()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// newNetPipeline is used to construct a netPipeline from a given
|
||||||
|
// transport and connection.
|
||||||
|
func newNetPipeline(trans *NetworkTransport, conn *netConn) *netPipeline {
|
||||||
|
n := &netPipeline{
|
||||||
|
conn: conn,
|
||||||
|
trans: trans,
|
||||||
|
doneCh: make(chan AppendFuture, rpcMaxPipeline),
|
||||||
|
inprogressCh: make(chan *appendFuture, rpcMaxPipeline),
|
||||||
|
shutdownCh: make(chan struct{}),
|
||||||
|
}
|
||||||
|
go n.decodeResponses()
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
|
||||||
|
// decodeResponses is a long running routine that decodes the responses
// sent on the connection, in the same order the requests were pipelined.
// It exits when the pipeline is shut down.
func (n *netPipeline) decodeResponses() {
	timeout := n.trans.timeout
	for {
		select {
		case future := <-n.inprogressCh:
			// Each response gets a fresh read deadline.
			if timeout > 0 {
				n.conn.conn.SetReadDeadline(time.Now().Add(timeout))
			}

			_, err := decodeResponse(n.conn, future.resp)
			future.respond(err)
			// Hand the completed future to the consumer, unless the
			// pipeline shuts down first.
			select {
			case n.doneCh <- future:
			case <-n.shutdownCh:
				return
			}
		case <-n.shutdownCh:
			return
		}
	}
}
|
||||||
|
|
||||||
|
// AppendEntries is used to pipeline a new append entries request. The
// request is written immediately; the matching response is decoded later by
// decodeResponses and surfaced via the returned future.
func (n *netPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) {
	// Create a new future tracking this request.
	future := &appendFuture{
		start: time.Now(),
		args:  args,
		resp:  resp,
	}
	future.init()

	// Add a send timeout
	if timeout := n.trans.timeout; timeout > 0 {
		n.conn.conn.SetWriteDeadline(time.Now().Add(timeout))
	}

	// Send the RPC
	if err := sendRPC(n.conn, rpcAppendEntries, future.args); err != nil {
		return nil, err
	}

	// Hand-off for decoding, this can also cause back-pressure
	// to prevent too many inflight requests (inprogressCh is bounded
	// by rpcMaxPipeline).
	select {
	case n.inprogressCh <- future:
		return future, nil
	case <-n.shutdownCh:
		return nil, ErrPipelineShutdown
	}
}
|
||||||
|
|
||||||
|
// Consumer returns a channel that can be used to consume complete futures.
|
||||||
|
func (n *netPipeline) Consumer() <-chan AppendFuture {
|
||||||
|
return n.doneCh
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close is used to shutdown the pipeline connection. It is idempotent.
// (The comment previously said "Closed", which did not match the method name.)
func (n *netPipeline) Close() error {
	n.shutdownLock.Lock()
	defer n.shutdownLock.Unlock()
	if n.shutdown {
		return nil
	}

	// Release the connection
	n.conn.Release()

	// Mark shut down and wake the decoder goroutine.
	n.shutdown = true
	close(n.shutdownCh)
	return nil
}
|
@ -0,0 +1,449 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
	"bytes"
	"io"
	"reflect"
	"sync"
	"testing"
	"time"
)
|
||||||
|
|
||||||
|
func TestNetworkTransport_StartStop(t *testing.T) {
|
||||||
|
trans, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
trans.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestNetworkTransport_Heartbeat_FastPath verifies that a heartbeat-shaped
// AppendEntries request is delivered to the registered heartbeat handler
// instead of the consumer channel.
func TestNetworkTransport_Heartbeat_FastPath(t *testing.T) {
	// Transport 1 is consumer
	trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	defer trans1.Close()

	// Make the RPC request. Term+Leader set with everything else zero
	// makes this match the transport's heartbeat detection.
	args := AppendEntriesRequest{
		Term:   10,
		Leader: []byte("cartman"),
	}
	resp := AppendEntriesResponse{
		Term:    4,
		LastLog: 90,
		Success: true,
	}

	invoked := false
	fastpath := func(rpc RPC) {
		// Verify the command
		// NOTE(review): this handler runs on the transport's connection
		// goroutine; t.Fatalf from a non-test goroutine is unreliable and
		// the unsynchronized write to `invoked` is a data race under
		// -race — consider t.Errorf plus an atomic/channel instead.
		req := rpc.Command.(*AppendEntriesRequest)
		if !reflect.DeepEqual(req, &args) {
			t.Fatalf("command mismatch: %#v %#v", *req, args)
		}

		rpc.Respond(&resp, nil)
		invoked = true
	}
	trans1.SetHeartbeatHandler(fastpath)

	// Transport 2 makes outbound request
	trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	defer trans2.Close()

	var out AppendEntriesResponse
	if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Verify the response
	if !reflect.DeepEqual(resp, out) {
		t.Fatalf("command mismatch: %#v %#v", resp, out)
	}

	// Ensure fast-path is used
	if !invoked {
		t.Fatalf("fast-path not used")
	}
}
|
||||||
|
|
||||||
|
// TestNetworkTransport_AppendEntries does a full round-trip AppendEntries
// RPC between two transports and checks the request and response survive
// encoding intact.
func TestNetworkTransport_AppendEntries(t *testing.T) {
	// Transport 1 is consumer
	trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	defer trans1.Close()
	rpcCh := trans1.Consumer()

	// Make the RPC request
	args := AppendEntriesRequest{
		Term:         10,
		Leader:       []byte("cartman"),
		PrevLogEntry: 100,
		PrevLogTerm:  4,
		Entries: []*Log{
			&Log{
				Index: 101,
				Term:  4,
				Type:  LogNoop,
			},
		},
		LeaderCommitIndex: 90,
	}
	resp := AppendEntriesResponse{
		Term:    4,
		LastLog: 90,
		Success: true,
	}

	// Listen for a request
	// NOTE(review): t.Fatalf inside this goroutine is not guaranteed to
	// stop the test; t.Errorf + return would be safer.
	go func() {
		select {
		case rpc := <-rpcCh:
			// Verify the command
			req := rpc.Command.(*AppendEntriesRequest)
			if !reflect.DeepEqual(req, &args) {
				t.Fatalf("command mismatch: %#v %#v", *req, args)
			}

			rpc.Respond(&resp, nil)

		case <-time.After(200 * time.Millisecond):
			t.Fatalf("timeout")
		}
	}()

	// Transport 2 makes outbound request
	trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	defer trans2.Close()

	var out AppendEntriesResponse
	if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Verify the response
	if !reflect.DeepEqual(resp, out) {
		t.Fatalf("command mismatch: %#v %#v", resp, out)
	}
}
|
||||||
|
|
||||||
|
// TestNetworkTransport_AppendEntriesPipeline pipelines ten AppendEntries
// requests over one connection and verifies all ten futures complete with
// the expected response.
func TestNetworkTransport_AppendEntriesPipeline(t *testing.T) {
	// Transport 1 is consumer
	trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	defer trans1.Close()
	rpcCh := trans1.Consumer()

	// Make the RPC request
	args := AppendEntriesRequest{
		Term:         10,
		Leader:       []byte("cartman"),
		PrevLogEntry: 100,
		PrevLogTerm:  4,
		Entries: []*Log{
			&Log{
				Index: 101,
				Term:  4,
				Type:  LogNoop,
			},
		},
		LeaderCommitIndex: 90,
	}
	resp := AppendEntriesResponse{
		Term:    4,
		LastLog: 90,
		Success: true,
	}

	// Listen for a request; respond to each of the ten pipelined RPCs.
	// NOTE(review): t.Fatalf inside this goroutine is not guaranteed to
	// stop the test; t.Errorf + return would be safer.
	go func() {
		for i := 0; i < 10; i++ {
			select {
			case rpc := <-rpcCh:
				// Verify the command
				req := rpc.Command.(*AppendEntriesRequest)
				if !reflect.DeepEqual(req, &args) {
					t.Fatalf("command mismatch: %#v %#v", *req, args)
				}
				rpc.Respond(&resp, nil)

			case <-time.After(200 * time.Millisecond):
				t.Fatalf("timeout")
			}
		}
	}()

	// Transport 2 makes outbound request
	trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	defer trans2.Close()

	pipeline, err := trans2.AppendEntriesPipeline(trans1.LocalAddr())
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	defer pipeline.Close()
	// Fire all ten requests without waiting for responses.
	for i := 0; i < 10; i++ {
		out := new(AppendEntriesResponse)
		if _, err := pipeline.AppendEntries(&args, out); err != nil {
			t.Fatalf("err: %v", err)
		}
	}

	// Drain the completed futures and verify each response.
	respCh := pipeline.Consumer()
	for i := 0; i < 10; i++ {
		select {
		case ready := <-respCh:
			// Verify the response
			if !reflect.DeepEqual(&resp, ready.Response()) {
				t.Fatalf("command mismatch: %#v %#v", &resp, ready.Response())
			}
		case <-time.After(200 * time.Millisecond):
			t.Fatalf("timeout")
		}
	}
}
|
||||||
|
|
||||||
|
// TestNetworkTransport_RequestVote does a full round-trip RequestVote RPC
// between two transports.
func TestNetworkTransport_RequestVote(t *testing.T) {
	// Transport 1 is consumer
	trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	defer trans1.Close()
	rpcCh := trans1.Consumer()

	// Make the RPC request
	args := RequestVoteRequest{
		Term:         20,
		Candidate:    []byte("butters"),
		LastLogIndex: 100,
		LastLogTerm:  19,
	}
	resp := RequestVoteResponse{
		Term:    100,
		Peers:   []byte("blah"),
		Granted: false,
	}

	// Listen for a request
	// NOTE(review): t.Fatalf inside this goroutine is not guaranteed to
	// stop the test; t.Errorf + return would be safer.
	go func() {
		select {
		case rpc := <-rpcCh:
			// Verify the command
			req := rpc.Command.(*RequestVoteRequest)
			if !reflect.DeepEqual(req, &args) {
				t.Fatalf("command mismatch: %#v %#v", *req, args)
			}

			rpc.Respond(&resp, nil)

		case <-time.After(200 * time.Millisecond):
			t.Fatalf("timeout")
		}
	}()

	// Transport 2 makes outbound request
	trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	defer trans2.Close()

	var out RequestVoteResponse
	if err := trans2.RequestVote(trans1.LocalAddr(), &args, &out); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Verify the response
	if !reflect.DeepEqual(resp, out) {
		t.Fatalf("command mismatch: %#v %#v", resp, out)
	}
}
|
||||||
|
|
||||||
|
func TestNetworkTransport_InstallSnapshot(t *testing.T) {
|
||||||
|
// Transport 1 is consumer
|
||||||
|
trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
defer trans1.Close()
|
||||||
|
rpcCh := trans1.Consumer()
|
||||||
|
|
||||||
|
// Make the RPC request
|
||||||
|
args := InstallSnapshotRequest{
|
||||||
|
Term: 10,
|
||||||
|
Leader: []byte("kyle"),
|
||||||
|
LastLogIndex: 100,
|
||||||
|
LastLogTerm: 9,
|
||||||
|
Peers: []byte("blah blah"),
|
||||||
|
Size: 10,
|
||||||
|
}
|
||||||
|
resp := InstallSnapshotResponse{
|
||||||
|
Term: 10,
|
||||||
|
Success: true,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Listen for a request
|
||||||
|
go func() {
|
||||||
|
select {
|
||||||
|
case rpc := <-rpcCh:
|
||||||
|
// Verify the command
|
||||||
|
req := rpc.Command.(*InstallSnapshotRequest)
|
||||||
|
if !reflect.DeepEqual(req, &args) {
|
||||||
|
t.Fatalf("command mismatch: %#v %#v", *req, args)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to read the bytes
|
||||||
|
buf := make([]byte, 10)
|
||||||
|
rpc.Reader.Read(buf)
|
||||||
|
|
||||||
|
// Compare
|
||||||
|
if bytes.Compare(buf, []byte("0123456789")) != 0 {
|
||||||
|
t.Fatalf("bad buf %v", buf)
|
||||||
|
}
|
||||||
|
|
||||||
|
rpc.Respond(&resp, nil)
|
||||||
|
|
||||||
|
case <-time.After(200 * time.Millisecond):
|
||||||
|
t.Fatalf("timeout")
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Transport 2 makes outbound request
|
||||||
|
trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
defer trans2.Close()
|
||||||
|
|
||||||
|
// Create a buffer
|
||||||
|
buf := bytes.NewBuffer([]byte("0123456789"))
|
||||||
|
|
||||||
|
var out InstallSnapshotResponse
|
||||||
|
if err := trans2.InstallSnapshot(trans1.LocalAddr(), &args, &out, buf); err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify the response
|
||||||
|
if !reflect.DeepEqual(resp, out) {
|
||||||
|
t.Fatalf("command mismatch: %#v %#v", resp, out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNetworkTransport_EncodeDecode(t *testing.T) {
|
||||||
|
// Transport 1 is consumer
|
||||||
|
trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
defer trans1.Close()
|
||||||
|
|
||||||
|
local := trans1.LocalAddr()
|
||||||
|
enc := trans1.EncodePeer(local)
|
||||||
|
dec := trans1.DecodePeer(enc)
|
||||||
|
|
||||||
|
if dec != local {
|
||||||
|
t.Fatalf("enc/dec fail: %v %v", dec, local)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNetworkTransport_PooledConn(t *testing.T) {
|
||||||
|
// Transport 1 is consumer
|
||||||
|
trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
defer trans1.Close()
|
||||||
|
rpcCh := trans1.Consumer()
|
||||||
|
|
||||||
|
// Make the RPC request
|
||||||
|
args := AppendEntriesRequest{
|
||||||
|
Term: 10,
|
||||||
|
Leader: []byte("cartman"),
|
||||||
|
PrevLogEntry: 100,
|
||||||
|
PrevLogTerm: 4,
|
||||||
|
Entries: []*Log{
|
||||||
|
&Log{
|
||||||
|
Index: 101,
|
||||||
|
Term: 4,
|
||||||
|
Type: LogNoop,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
LeaderCommitIndex: 90,
|
||||||
|
}
|
||||||
|
resp := AppendEntriesResponse{
|
||||||
|
Term: 4,
|
||||||
|
LastLog: 90,
|
||||||
|
Success: true,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Listen for a request
|
||||||
|
go func() {
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case rpc := <-rpcCh:
|
||||||
|
// Verify the command
|
||||||
|
req := rpc.Command.(*AppendEntriesRequest)
|
||||||
|
if !reflect.DeepEqual(req, &args) {
|
||||||
|
t.Fatalf("command mismatch: %#v %#v", *req, args)
|
||||||
|
}
|
||||||
|
rpc.Respond(&resp, nil)
|
||||||
|
|
||||||
|
case <-time.After(200 * time.Millisecond):
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Transport 2 makes outbound request, 3 conn pool
|
||||||
|
trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 3, time.Second, newTestLogger(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
defer trans2.Close()
|
||||||
|
|
||||||
|
// Create wait group
|
||||||
|
wg := &sync.WaitGroup{}
|
||||||
|
wg.Add(5)
|
||||||
|
|
||||||
|
appendFunc := func() {
|
||||||
|
defer wg.Done()
|
||||||
|
var out AppendEntriesResponse
|
||||||
|
if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify the response
|
||||||
|
if !reflect.DeepEqual(resp, out) {
|
||||||
|
t.Fatalf("command mismatch: %#v %#v", resp, out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to do parallel appends, should stress the conn pool
|
||||||
|
for i := 0; i < 5; i++ {
|
||||||
|
go appendFunc()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for the routines to finish
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
// Check the conn pool size
|
||||||
|
addr := trans1.LocalAddr()
|
||||||
|
if len(trans2.connPool[addr]) != 3 {
|
||||||
|
t.Fatalf("Expected 2 pooled conns!")
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,122 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync/atomic"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Observation is sent along the given channel to observers when an event occurs.
type Observation struct {
	// Raft holds the Raft instance generating the observation.
	Raft *Raft
	// Data holds observation-specific data. Possible types are
	// *RequestVoteRequest, RaftState and LeaderObservation.
	Data interface{}
}
|
||||||
|
|
||||||
|
// LeaderObservation is used in Observation.Data when leadership changes.
type LeaderObservation struct {
	// Leader is the address of the new leader (empty when leadership is lost).
	Leader string
}
|
||||||
|
|
||||||
|
// nextObserverID is used to provide a unique ID for each observer to aid in
// deregistration. Incremented atomically by NewObserver.
var nextObserverID uint64
|
||||||
|
|
||||||
|
// FilterFn is a function that can be registered in order to filter observations.
// The function reports whether the observation should be included - if
// it returns false, the observation will be filtered out.
type FilterFn func(o *Observation) bool
|
||||||
|
|
||||||
|
// Observer describes what to do with a given observation.
type Observer struct {
	// numObserved and numDropped are performance counters for this observer.
	// 64 bit types must be 64 bit aligned to use with atomic operations on
	// 32 bit platforms, so keep them at the top of the struct.
	numObserved uint64
	numDropped  uint64

	// channel receives observations.
	channel chan Observation

	// blocking, if true, will cause Raft to block when sending an observation
	// to this observer. This should generally be set to false.
	blocking bool

	// filter will be called to determine if an observation should be sent to
	// the channel.
	filter FilterFn

	// id is the ID of this observer in the Raft map.
	id uint64
}
|
||||||
|
|
||||||
|
// NewObserver creates a new observer that can be registered
|
||||||
|
// to make observations on a Raft instance. Observations
|
||||||
|
// will be sent on the given channel if they satisfy the
|
||||||
|
// given filter.
|
||||||
|
//
|
||||||
|
// If blocking is true, the observer will block when it can't
|
||||||
|
// send on the channel, otherwise it may discard events.
|
||||||
|
func NewObserver(channel chan Observation, blocking bool, filter FilterFn) *Observer {
|
||||||
|
return &Observer{
|
||||||
|
channel: channel,
|
||||||
|
blocking: blocking,
|
||||||
|
filter: filter,
|
||||||
|
id: atomic.AddUint64(&nextObserverID, 1),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetNumObserved returns the number of observations.
|
||||||
|
func (or *Observer) GetNumObserved() uint64 {
|
||||||
|
return atomic.LoadUint64(&or.numObserved)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetNumDropped returns the number of dropped observations due to blocking.
|
||||||
|
func (or *Observer) GetNumDropped() uint64 {
|
||||||
|
return atomic.LoadUint64(&or.numDropped)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RegisterObserver registers a new observer.
|
||||||
|
func (r *Raft) RegisterObserver(or *Observer) {
|
||||||
|
r.observersLock.Lock()
|
||||||
|
defer r.observersLock.Unlock()
|
||||||
|
r.observers[or.id] = or
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeregisterObserver deregisters an observer.
|
||||||
|
func (r *Raft) DeregisterObserver(or *Observer) {
|
||||||
|
r.observersLock.Lock()
|
||||||
|
defer r.observersLock.Unlock()
|
||||||
|
delete(r.observers, or.id)
|
||||||
|
}
|
||||||
|
|
||||||
|
// observe sends an observation to every observer.
func (r *Raft) observe(o interface{}) {
	// In general observers should not block. But in any case this isn't
	// disastrous as we only hold a read lock, which merely prevents
	// registration / deregistration of observers.
	r.observersLock.RLock()
	defer r.observersLock.RUnlock()
	for _, or := range r.observers {
		// It's wasteful to do this in the loop, but for the common case
		// where there are no observers we won't create any objects.
		ob := Observation{Raft: r, Data: o}
		// Skip observers whose filter rejects this observation.
		if or.filter != nil && !or.filter(&ob) {
			continue
		}
		// Skip observers with no destination channel.
		if or.channel == nil {
			continue
		}
		if or.blocking {
			// Blocking observers wait for the receiver; this can stall
			// the caller while the read lock is held.
			or.channel <- ob
			atomic.AddUint64(&or.numObserved, 1)
		} else {
			// Non-blocking: drop the observation (counting it) if the
			// receiver isn't ready.
			select {
			case or.channel <- ob:
				atomic.AddUint64(&or.numObserved, 1)
			default:
				atomic.AddUint64(&or.numDropped, 1)
			}
		}
	}
}
|
@ -0,0 +1,122 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/json"
|
||||||
|
"io/ioutil"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sync"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// jsonPeerPath is the file name, inside the base directory passed to
	// NewJSONPeers, used to persist the peer list.
	jsonPeerPath = "peers.json"
)
|
||||||
|
|
||||||
|
// PeerStore provides an interface for persistent storage and
// retrieval of peers. We use a separate interface than StableStore
// since the peers may need to be edited by a human operator. For example,
// in a two node cluster, the failure of either node requires human intervention
// since consensus is impossible.
type PeerStore interface {
	// Peers returns the list of known peers.
	Peers() ([]string, error)

	// SetPeers sets the list of known peers. This is invoked when a peer is
	// added or removed.
	SetPeers([]string) error
}
|
||||||
|
|
||||||
|
// StaticPeers is used to provide a static list of peers.
type StaticPeers struct {
	// StaticPeers is the in-memory peer list returned by Peers.
	StaticPeers []string
	// l guards StaticPeers.
	l sync.Mutex
}
|
||||||
|
|
||||||
|
// Peers implements the PeerStore interface.
|
||||||
|
func (s *StaticPeers) Peers() ([]string, error) {
|
||||||
|
s.l.Lock()
|
||||||
|
peers := s.StaticPeers
|
||||||
|
s.l.Unlock()
|
||||||
|
return peers, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetPeers implements the PeerStore interface.
|
||||||
|
func (s *StaticPeers) SetPeers(p []string) error {
|
||||||
|
s.l.Lock()
|
||||||
|
s.StaticPeers = p
|
||||||
|
s.l.Unlock()
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// JSONPeers is used to provide peer persistence on disk in the form
// of a JSON file. This allows human operators to manipulate the file.
type JSONPeers struct {
	// l guards reads and writes of the backing file.
	l sync.Mutex
	// path is the full path of the JSON file (base dir + jsonPeerPath).
	path string
	// trans is used to encode/decode peer addresses for storage.
	trans Transport
}
|
||||||
|
|
||||||
|
// NewJSONPeers creates a new JSONPeers store. Requires a transport
|
||||||
|
// to handle the serialization of network addresses.
|
||||||
|
func NewJSONPeers(base string, trans Transport) *JSONPeers {
|
||||||
|
path := filepath.Join(base, jsonPeerPath)
|
||||||
|
store := &JSONPeers{
|
||||||
|
path: path,
|
||||||
|
trans: trans,
|
||||||
|
}
|
||||||
|
return store
|
||||||
|
}
|
||||||
|
|
||||||
|
// Peers implements the PeerStore interface.
|
||||||
|
func (j *JSONPeers) Peers() ([]string, error) {
|
||||||
|
j.l.Lock()
|
||||||
|
defer j.l.Unlock()
|
||||||
|
|
||||||
|
// Read the file
|
||||||
|
buf, err := ioutil.ReadFile(j.path)
|
||||||
|
if err != nil && !os.IsNotExist(err) {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for no peers
|
||||||
|
if len(buf) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode the peers
|
||||||
|
var peerSet []string
|
||||||
|
dec := json.NewDecoder(bytes.NewReader(buf))
|
||||||
|
if err := dec.Decode(&peerSet); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Deserialize each peer
|
||||||
|
var peers []string
|
||||||
|
for _, p := range peerSet {
|
||||||
|
peers = append(peers, j.trans.DecodePeer([]byte(p)))
|
||||||
|
}
|
||||||
|
return peers, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetPeers implements the PeerStore interface.
|
||||||
|
func (j *JSONPeers) SetPeers(peers []string) error {
|
||||||
|
j.l.Lock()
|
||||||
|
defer j.l.Unlock()
|
||||||
|
|
||||||
|
// Encode each peer
|
||||||
|
var peerSet []string
|
||||||
|
for _, p := range peers {
|
||||||
|
peerSet = append(peerSet, string(j.trans.EncodePeer(p)))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert to JSON
|
||||||
|
var buf bytes.Buffer
|
||||||
|
enc := json.NewEncoder(&buf)
|
||||||
|
if err := enc.Encode(peerSet); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write out as JSON
|
||||||
|
return ioutil.WriteFile(j.path, buf.Bytes(), 0755)
|
||||||
|
}
|
@ -0,0 +1,44 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"io/ioutil"
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestJSONPeers exercises the JSONPeers store end to end: a read of an
// empty store, a SetPeers write, and a read back of the stored peers.
func TestJSONPeers(t *testing.T) {
	// Create a test dir
	dir, err := ioutil.TempDir("", "raft")
	if err != nil {
		t.Fatalf("err: %v ", err)
	}
	defer os.RemoveAll(dir)

	// Create the store
	_, trans := NewInmemTransport("")
	store := NewJSONPeers(dir, trans)

	// Try a read, should get nothing
	peers, err := store.Peers()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if len(peers) != 0 {
		t.Fatalf("peers: %v", peers)
	}

	// Initialize some peers
	newPeers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()}
	if err := store.SetPeers(newPeers); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Try a read, should get the peers back
	peers, err = store.Peers()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if len(peers) != 3 {
		t.Fatalf("peers: %v", peers)
	}
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,522 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/armon/go-metrics"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// maxFailureScale and failureWait feed the backoff() helper to throttle
	// retries after replication failures; presumably the wait grows with
	// consecutive failures up to the scale cap — confirm against backoff().
	maxFailureScale = 12
	failureWait     = 10 * time.Millisecond
)
|
||||||
|
|
||||||
|
var (
	// ErrLogNotFound indicates a given log entry is not available.
	ErrLogNotFound = errors.New("log not found")

	// ErrPipelineReplicationNotSupported can be returned by the transport to
	// signal that pipeline replication is not supported in general, and that
	// no error message should be produced.
	ErrPipelineReplicationNotSupported = errors.New("pipeline replication not supported")
)
|
||||||
|
|
||||||
|
// followerReplication tracks the state of replicating logs to a single
// follower, and is owned by the leader's replication goroutines.
type followerReplication struct {
	// peer is the address of the follower being replicated to.
	peer string
	// inflight has logs marked committed via CommitRange after the
	// follower acknowledges them.
	inflight *inflight

	// stopCh stops replication; a non-zero index sent on it requests a
	// best-effort replication up to that index before exiting.
	stopCh chan uint64
	// triggerCh is signaled whenever new entries may need replicating.
	triggerCh chan struct{}

	// currentTerm is the term this replication session was started in.
	currentTerm uint64
	// matchIndex is the highest index known to be replicated to the peer.
	matchIndex uint64
	// nextIndex is the next log index to send to the peer.
	nextIndex uint64

	// lastContact records the last successful response from the peer;
	// guarded by lastContactLock.
	lastContact     time.Time
	lastContactLock sync.RWMutex

	// failures counts consecutive RPC failures, used for backoff.
	failures uint64

	// notifyCh forces an immediate heartbeat; notify holds pending verify
	// futures to vote on, guarded by notifyLock.
	notifyCh   chan struct{}
	notify     []*verifyFuture
	notifyLock sync.Mutex

	// stepDown is used to indicate to the leader that we
	// should step down based on information from a follower.
	stepDown chan struct{}

	// allowPipeline indicates whether it seems like pipeline
	// replication should be enabled (set after a successful
	// AppendEntries in replicateTo).
	allowPipeline bool
}
|
||||||
|
|
||||||
|
// notifyAll is used to notify all the waiting verify futures
|
||||||
|
// if the follower believes we are still the leader.
|
||||||
|
func (s *followerReplication) notifyAll(leader bool) {
|
||||||
|
// Clear the waiting notifies minimizing lock time
|
||||||
|
s.notifyLock.Lock()
|
||||||
|
n := s.notify
|
||||||
|
s.notify = nil
|
||||||
|
s.notifyLock.Unlock()
|
||||||
|
|
||||||
|
// Submit our votes
|
||||||
|
for _, v := range n {
|
||||||
|
v.vote(leader)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// LastContact returns the time of last contact.
|
||||||
|
func (s *followerReplication) LastContact() time.Time {
|
||||||
|
s.lastContactLock.RLock()
|
||||||
|
last := s.lastContact
|
||||||
|
s.lastContactLock.RUnlock()
|
||||||
|
return last
|
||||||
|
}
|
||||||
|
|
||||||
|
// setLastContact sets the last contact to the current time.
|
||||||
|
func (s *followerReplication) setLastContact() {
|
||||||
|
s.lastContactLock.Lock()
|
||||||
|
s.lastContact = time.Now()
|
||||||
|
s.lastContactLock.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
// replicate is a long running routine that is used to manage
// the process of replicating logs to our followers. It alternates between
// standard RPC replication and (when healthy) pipeline replication, and
// exits when stopCh fires.
func (r *Raft) replicate(s *followerReplication) {
	// Start an async heartbeating routing
	stopHeartbeat := make(chan struct{})
	defer close(stopHeartbeat)
	r.goFunc(func() { r.heartbeat(s, stopHeartbeat) })

RPC:
	shouldStop := false
	for !shouldStop {
		select {
		case maxIndex := <-s.stopCh:
			// Make a best effort to replicate up to this index
			if maxIndex > 0 {
				r.replicateTo(s, maxIndex)
			}
			return
		case <-s.triggerCh:
			lastLogIdx, _ := r.getLastLog()
			shouldStop = r.replicateTo(s, lastLogIdx)
		case <-randomTimeout(r.conf.CommitTimeout):
			// Periodic flush even without a trigger.
			lastLogIdx, _ := r.getLastLog()
			shouldStop = r.replicateTo(s, lastLogIdx)
		}

		// If things looks healthy, switch to pipeline mode
		if !shouldStop && s.allowPipeline {
			goto PIPELINE
		}
	}
	return

PIPELINE:
	// Disable until re-enabled
	s.allowPipeline = false

	// Replicates using a pipeline for high performance. This method
	// is not able to gracefully recover from errors, and so we fall back
	// to standard mode on failure.
	if err := r.pipelineReplicate(s); err != nil {
		if err != ErrPipelineReplicationNotSupported {
			r.logger.Printf("[ERR] raft: Failed to start pipeline replication to %s: %s", s.peer, err)
		}
	}
	goto RPC
}
|
||||||
|
|
||||||
|
// replicateTo is used to replicate the logs up to a given last index.
// If the follower log is behind, we take care to bring them up to date.
// It reports true when replication should stop (stale term observed).
func (r *Raft) replicateTo(s *followerReplication, lastIndex uint64) (shouldStop bool) {
	// Create the base request
	var req AppendEntriesRequest
	var resp AppendEntriesResponse
	var start time.Time
START:
	// Prevent an excessive retry rate on errors
	if s.failures > 0 {
		select {
		case <-time.After(backoff(failureWait, s.failures, maxFailureScale)):
		case <-r.shutdownCh:
		}
	}

	// Setup the request; ErrLogNotFound means the follower is too far
	// behind and needs a snapshot instead.
	if err := r.setupAppendEntries(s, &req, s.nextIndex, lastIndex); err == ErrLogNotFound {
		goto SEND_SNAP
	} else if err != nil {
		return
	}

	// Make the RPC call
	start = time.Now()
	if err := r.trans.AppendEntries(s.peer, &req, &resp); err != nil {
		r.logger.Printf("[ERR] raft: Failed to AppendEntries to %v: %v", s.peer, err)
		s.failures++
		return
	}
	appendStats(s.peer, start, float32(len(req.Entries)))

	// Check for a newer term, stop running
	if resp.Term > req.Term {
		r.handleStaleTerm(s)
		return true
	}

	// Update the last contact
	s.setLastContact()

	// Update s based on success
	if resp.Success {
		// Update our replication state
		updateLastAppended(s, &req)

		// Clear any failures, allow pipelining
		s.failures = 0
		s.allowPipeline = true
	} else {
		// Rejected: back nextIndex down, no further than the follower's
		// reported last log + 1, and never below 1.
		s.nextIndex = max(min(s.nextIndex-1, resp.LastLog+1), 1)
		s.matchIndex = s.nextIndex - 1
		if resp.NoRetryBackoff {
			s.failures = 0
		} else {
			s.failures++
		}
		r.logger.Printf("[WARN] raft: AppendEntries to %v rejected, sending older logs (next: %d)", s.peer, s.nextIndex)
	}

CHECK_MORE:
	// Check if there are more logs to replicate
	if s.nextIndex <= lastIndex {
		goto START
	}
	return

	// SEND_SNAP is used when we fail to get a log, usually because the follower
	// is too far behind, and we must ship a snapshot down instead
SEND_SNAP:
	if stop, err := r.sendLatestSnapshot(s); stop {
		return true
	} else if err != nil {
		r.logger.Printf("[ERR] raft: Failed to send snapshot to %v: %v", s.peer, err)
		return
	}

	// Check if there is more to replicate
	goto CHECK_MORE
}
|
||||||
|
|
||||||
|
// sendLatestSnapshot is used to send the latest snapshot we have
// down to our follower. The returned bool indicates that replication
// to this peer should stop (the follower reported a newer term).
func (r *Raft) sendLatestSnapshot(s *followerReplication) (bool, error) {
	// Get the snapshots
	snapshots, err := r.snapshots.List()
	if err != nil {
		r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err)
		return false, err
	}

	// Check we have at least a single snapshot
	if len(snapshots) == 0 {
		return false, fmt.Errorf("no snapshots found")
	}

	// Open the most recent snapshot (List returns highest index first)
	snapID := snapshots[0].ID
	meta, snapshot, err := r.snapshots.Open(snapID)
	if err != nil {
		r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapID, err)
		return false, err
	}
	defer snapshot.Close()

	// Setup the request
	req := InstallSnapshotRequest{
		Term:         s.currentTerm,
		Leader:       r.trans.EncodePeer(r.localAddr),
		LastLogIndex: meta.Index,
		LastLogTerm:  meta.Term,
		Peers:        meta.Peers,
		Size:         meta.Size,
	}

	// Make the call
	start := time.Now()
	var resp InstallSnapshotResponse
	if err := r.trans.InstallSnapshot(s.peer, &req, &resp, snapshot); err != nil {
		r.logger.Printf("[ERR] raft: Failed to install snapshot %v: %v", snapID, err)
		s.failures++
		return false, err
	}
	metrics.MeasureSince([]string{"raft", "replication", "installSnapshot", s.peer}, start)

	// Check for a newer term, stop running
	if resp.Term > req.Term {
		r.handleStaleTerm(s)
		return true, nil
	}

	// Update the last contact
	s.setLastContact()

	// Check for success
	if resp.Success {
		// Mark any inflight logs as committed
		s.inflight.CommitRange(s.matchIndex+1, meta.Index)

		// Update the indexes
		s.matchIndex = meta.Index
		s.nextIndex = s.matchIndex + 1

		// Clear any failures
		s.failures = 0

		// Notify we are still leader
		s.notifyAll(true)
	} else {
		s.failures++
		r.logger.Printf("[WARN] raft: InstallSnapshot to %v rejected", s.peer)
	}
	return false, nil
}
|
||||||
|
|
||||||
|
// heartbeat is used to periodically invoke AppendEntries on a peer
// to ensure they don't time out. This is done async of replicate(),
// since that routine could potentially be blocked on disk IO.
func (r *Raft) heartbeat(s *followerReplication, stopCh chan struct{}) {
	var failures uint64
	// An empty AppendEntries (no entries) serves as the heartbeat.
	req := AppendEntriesRequest{
		Term:   s.currentTerm,
		Leader: r.trans.EncodePeer(r.localAddr),
	}
	var resp AppendEntriesResponse
	for {
		// Wait for the next heartbeat interval or forced notify
		select {
		case <-s.notifyCh:
		case <-randomTimeout(r.conf.HeartbeatTimeout / 10):
		case <-stopCh:
			return
		}

		start := time.Now()
		if err := r.trans.AppendEntries(s.peer, &req, &resp); err != nil {
			r.logger.Printf("[ERR] raft: Failed to heartbeat to %v: %v", s.peer, err)
			failures++
			// Back off before retrying; abort early if stopped.
			select {
			case <-time.After(backoff(failureWait, failures, maxFailureScale)):
			case <-stopCh:
			}
		} else {
			s.setLastContact()
			failures = 0
			metrics.MeasureSince([]string{"raft", "replication", "heartbeat", s.peer}, start)
			// A successful response also answers pending verify futures.
			s.notifyAll(resp.Success)
		}
	}
}
|
||||||
|
|
||||||
|
// pipelineReplicate is used when we have synchronized our state with the follower,
// and want to switch to a higher performance pipeline mode of replication.
// We only pipeline AppendEntries commands, and if we ever hit an error, we fall
// back to the standard replication which can handle more complex situations.
func (r *Raft) pipelineReplicate(s *followerReplication) error {
	// Create a new pipeline
	pipeline, err := r.trans.AppendEntriesPipeline(s.peer)
	if err != nil {
		return err
	}
	defer pipeline.Close()

	// Log start and stop of pipeline
	r.logger.Printf("[INFO] raft: pipelining replication to peer %v", s.peer)
	defer r.logger.Printf("[INFO] raft: aborting pipeline replication to peer %v", s.peer)

	// Create a shutdown and finish channel
	stopCh := make(chan struct{})
	finishCh := make(chan struct{})

	// Start a dedicated decoder; it closes finishCh when it exits.
	r.goFunc(func() { r.pipelineDecode(s, pipeline, stopCh, finishCh) })

	// Start pipeline sends at the last good nextIndex
	nextIndex := s.nextIndex

	shouldStop := false
SEND:
	for !shouldStop {
		select {
		case <-finishCh:
			// Decoder bailed (stale term or rejection): stop sending.
			break SEND
		case maxIndex := <-s.stopCh:
			// Best-effort final send up to maxIndex, then stop.
			if maxIndex > 0 {
				r.pipelineSend(s, pipeline, &nextIndex, maxIndex)
			}
			break SEND
		case <-s.triggerCh:
			lastLogIdx, _ := r.getLastLog()
			shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx)
		case <-randomTimeout(r.conf.CommitTimeout):
			lastLogIdx, _ := r.getLastLog()
			shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx)
		}
	}

	// Stop our decoder, and wait for it to finish
	close(stopCh)
	select {
	case <-finishCh:
	case <-r.shutdownCh:
	}
	return nil
}
|
||||||
|
|
||||||
|
// pipelineSend is used to send data over a pipeline.
|
||||||
|
func (r *Raft) pipelineSend(s *followerReplication, p AppendPipeline, nextIdx *uint64, lastIndex uint64) (shouldStop bool) {
|
||||||
|
// Create a new append request
|
||||||
|
req := new(AppendEntriesRequest)
|
||||||
|
if err := r.setupAppendEntries(s, req, *nextIdx, lastIndex); err != nil {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pipeline the append entries
|
||||||
|
if _, err := p.AppendEntries(req, new(AppendEntriesResponse)); err != nil {
|
||||||
|
r.logger.Printf("[ERR] raft: Failed to pipeline AppendEntries to %v: %v", s.peer, err)
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// Increase the next send log to avoid re-sending old logs
|
||||||
|
if n := len(req.Entries); n > 0 {
|
||||||
|
last := req.Entries[n-1]
|
||||||
|
*nextIdx = last.Index + 1
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// pipelineDecode is used to decode the responses of pipelined requests.
// It runs until stopCh is closed or a response aborts the pipeline, and
// closes finishCh on exit so the sender knows to stop.
func (r *Raft) pipelineDecode(s *followerReplication, p AppendPipeline, stopCh, finishCh chan struct{}) {
	defer close(finishCh)
	respCh := p.Consumer()
	for {
		select {
		case ready := <-respCh:
			req, resp := ready.Request(), ready.Response()
			appendStats(s.peer, ready.Start(), float32(len(req.Entries)))

			// Check for a newer term, stop running
			if resp.Term > req.Term {
				r.handleStaleTerm(s)
				return
			}

			// Update the last contact
			s.setLastContact()

			// Abort pipeline if not successful
			if !resp.Success {
				return
			}

			// Update our replication state
			updateLastAppended(s, req)
		case <-stopCh:
			return
		}
	}
}
|
||||||
|
|
||||||
|
// setupAppendEntries is used to setup an append entries request.
|
||||||
|
func (r *Raft) setupAppendEntries(s *followerReplication, req *AppendEntriesRequest, nextIndex, lastIndex uint64) error {
|
||||||
|
req.Term = s.currentTerm
|
||||||
|
req.Leader = r.trans.EncodePeer(r.localAddr)
|
||||||
|
req.LeaderCommitIndex = r.getCommitIndex()
|
||||||
|
if err := r.setPreviousLog(req, nextIndex); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := r.setNewLogs(req, nextIndex, lastIndex); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// setPreviousLog is used to setup the PrevLogEntry and PrevLogTerm for an
|
||||||
|
// AppendEntriesRequest given the next index to replicate.
|
||||||
|
func (r *Raft) setPreviousLog(req *AppendEntriesRequest, nextIndex uint64) error {
|
||||||
|
// Guard for the first index, since there is no 0 log entry
|
||||||
|
// Guard against the previous index being a snapshot as well
|
||||||
|
lastSnapIdx, lastSnapTerm := r.getLastSnapshot()
|
||||||
|
if nextIndex == 1 {
|
||||||
|
req.PrevLogEntry = 0
|
||||||
|
req.PrevLogTerm = 0
|
||||||
|
|
||||||
|
} else if (nextIndex - 1) == lastSnapIdx {
|
||||||
|
req.PrevLogEntry = lastSnapIdx
|
||||||
|
req.PrevLogTerm = lastSnapTerm
|
||||||
|
|
||||||
|
} else {
|
||||||
|
var l Log
|
||||||
|
if err := r.logs.GetLog(nextIndex-1, &l); err != nil {
|
||||||
|
r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v",
|
||||||
|
nextIndex-1, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set the previous index and term (0 if nextIndex is 1)
|
||||||
|
req.PrevLogEntry = l.Index
|
||||||
|
req.PrevLogTerm = l.Term
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// setNewLogs is used to setup the logs which should be appended for a request.
|
||||||
|
func (r *Raft) setNewLogs(req *AppendEntriesRequest, nextIndex, lastIndex uint64) error {
|
||||||
|
// Append up to MaxAppendEntries or up to the lastIndex
|
||||||
|
req.Entries = make([]*Log, 0, r.conf.MaxAppendEntries)
|
||||||
|
maxIndex := min(nextIndex+uint64(r.conf.MaxAppendEntries)-1, lastIndex)
|
||||||
|
for i := nextIndex; i <= maxIndex; i++ {
|
||||||
|
oldLog := new(Log)
|
||||||
|
if err := r.logs.GetLog(i, oldLog); err != nil {
|
||||||
|
r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", i, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
req.Entries = append(req.Entries, oldLog)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// appendStats is used to emit stats about an AppendEntries invocation.
|
||||||
|
func appendStats(peer string, start time.Time, logs float32) {
|
||||||
|
metrics.MeasureSince([]string{"raft", "replication", "appendEntries", "rpc", peer}, start)
|
||||||
|
metrics.IncrCounter([]string{"raft", "replication", "appendEntries", "logs", peer}, logs)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleStaleTerm is used when a follower indicates that we have a stale term.
|
||||||
|
func (r *Raft) handleStaleTerm(s *followerReplication) {
|
||||||
|
r.logger.Printf("[ERR] raft: peer %v has newer term, stopping replication", s.peer)
|
||||||
|
s.notifyAll(false) // No longer leader
|
||||||
|
asyncNotifyCh(s.stepDown)
|
||||||
|
}
|
||||||
|
|
||||||
|
// updateLastAppended is used to update follower replication state after a successful
|
||||||
|
// AppendEntries RPC.
|
||||||
|
func updateLastAppended(s *followerReplication, req *AppendEntriesRequest) {
|
||||||
|
// Mark any inflight logs as committed
|
||||||
|
if logs := req.Entries; len(logs) > 0 {
|
||||||
|
first := logs[0]
|
||||||
|
last := logs[len(logs)-1]
|
||||||
|
s.inflight.CommitRange(first.Index, last.Index)
|
||||||
|
|
||||||
|
// Update the indexes
|
||||||
|
s.matchIndex = last.Index
|
||||||
|
s.nextIndex = last.Index + 1
|
||||||
|
}
|
||||||
|
|
||||||
|
// Notify still leader
|
||||||
|
s.notifyAll(true)
|
||||||
|
}
|
@ -0,0 +1,40 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"io"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SnapshotMeta is for metadata of a snapshot.
type SnapshotMeta struct {
	// ID is opaque to the store, and is used for opening.
	ID string

	// Index and Term describe the log position the snapshot covers.
	Index uint64
	Term  uint64

	// Peers is the encoded peer set at the time the snapshot was taken.
	Peers []byte

	// Size is the size of the snapshot data.
	Size int64
}
|
||||||
|
|
||||||
|
// SnapshotStore interface is used to allow for flexible implementations
// of snapshot storage and retrieval. For example, a client could implement
// a shared state store such as S3, allowing new nodes to restore snapshots
// without streaming from the leader.
type SnapshotStore interface {
	// Create is used to begin a snapshot at a given index and term,
	// with the current peer set already encoded.
	Create(index, term uint64, peers []byte) (SnapshotSink, error)

	// List is used to list the available snapshots in the store.
	// It should return them in descending order, with the highest index first.
	List() ([]*SnapshotMeta, error)

	// Open takes a snapshot ID and provides a ReadCloser. Once close is
	// called it is assumed the snapshot is no longer needed.
	Open(id string) (*SnapshotMeta, io.ReadCloser, error)
}
|
||||||
|
|
||||||
|
// SnapshotSink is returned by StartSnapshot. The FSM will Write state
// to the sink and call Close on completion. On error, Cancel will be invoked.
type SnapshotSink interface {
	io.WriteCloser

	// ID returns the opaque snapshot ID assigned by the store.
	ID() string

	// Cancel aborts the in-progress snapshot.
	Cancel() error
}
|
@ -0,0 +1,15 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
// StableStore is used to provide stable storage
// of key configurations to ensure safety.
type StableStore interface {
	// Set stores the value for key.
	Set(key []byte, val []byte) error

	// Get returns the value for key, or an empty byte slice if key was not found.
	Get(key []byte) ([]byte, error)

	// SetUint64 stores the uint64 value for key.
	SetUint64(key []byte, val uint64) error

	// GetUint64 returns the uint64 value for key, or 0 if key was not found.
	GetUint64(key []byte) (uint64, error)
}
|
@ -0,0 +1,171 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
)
|
||||||
|
|
||||||
|
// RaftState captures the state of a Raft node: Follower, Candidate, Leader,
// or Shutdown.
type RaftState uint32

const (
	// Follower is the initial state of a Raft node.
	Follower RaftState = iota

	// Candidate is one of the valid states of a Raft node.
	Candidate

	// Leader is one of the valid states of a Raft node.
	Leader

	// Shutdown is the terminal state of a Raft node.
	Shutdown
)

// raftStateNames maps each defined RaftState to its display name; the
// ordering mirrors the constant declarations above.
var raftStateNames = [...]string{
	Follower:  "Follower",
	Candidate: "Candidate",
	Leader:    "Leader",
	Shutdown:  "Shutdown",
}

// String returns a human-readable name for the state, or "Unknown" for
// any value outside the defined constants.
func (s RaftState) String() string {
	if int(s) < len(raftStateNames) {
		return raftStateNames[s]
	}
	return "Unknown"
}
|
||||||
|
|
||||||
|
// raftState is used to maintain various state variables
// and provides an interface to set/get the variables in a
// thread safe manner.
type raftState struct {
	// currentTerm commitIndex, lastApplied, must be kept at the top of
	// the struct so they're 64 bit aligned which is a requirement for
	// atomic ops on 32 bit platforms.

	// The current term, cache of StableStore
	currentTerm uint64

	// Highest committed log entry
	commitIndex uint64

	// Last applied log to the FSM
	lastApplied uint64

	// protects 4 next fields
	lastLock sync.Mutex

	// Cache the latest snapshot index/term
	lastSnapshotIndex uint64
	lastSnapshotTerm  uint64

	// Cache the latest log from LogStore
	lastLogIndex uint64
	lastLogTerm  uint64

	// Tracks running goroutines
	routinesGroup sync.WaitGroup

	// The current state; accessed atomically via getState/setState.
	state RaftState
}
|
||||||
|
|
||||||
|
// getState atomically loads the node's current state.
func (r *raftState) getState() RaftState {
	// RaftState has underlying type uint32, so the field can be
	// accessed through the uint32 atomic primitives.
	stateAddr := (*uint32)(&r.state)
	return RaftState(atomic.LoadUint32(stateAddr))
}

// setState atomically replaces the node's current state.
func (r *raftState) setState(s RaftState) {
	stateAddr := (*uint32)(&r.state)
	atomic.StoreUint32(stateAddr, uint32(s))
}

// getCurrentTerm atomically loads the cached current term.
func (r *raftState) getCurrentTerm() uint64 {
	return atomic.LoadUint64(&r.currentTerm)
}

// setCurrentTerm atomically stores the cached current term.
func (r *raftState) setCurrentTerm(term uint64) {
	atomic.StoreUint64(&r.currentTerm, term)
}
|
||||||
|
|
||||||
|
// getLastLog returns the cached index and term of the last log entry,
// read together under lastLock so the pair is consistent.
func (r *raftState) getLastLog() (index, term uint64) {
	r.lastLock.Lock()
	index = r.lastLogIndex
	term = r.lastLogTerm
	r.lastLock.Unlock()
	return
}

// setLastLog updates the cached index and term of the last log entry
// as a single atomic pair under lastLock.
func (r *raftState) setLastLog(index, term uint64) {
	r.lastLock.Lock()
	r.lastLogIndex = index
	r.lastLogTerm = term
	r.lastLock.Unlock()
}

// getLastSnapshot returns the cached index and term of the latest
// snapshot, read together under lastLock.
func (r *raftState) getLastSnapshot() (index, term uint64) {
	r.lastLock.Lock()
	index = r.lastSnapshotIndex
	term = r.lastSnapshotTerm
	r.lastLock.Unlock()
	return
}

// setLastSnapshot updates the cached index and term of the latest
// snapshot as a single atomic pair under lastLock.
func (r *raftState) setLastSnapshot(index, term uint64) {
	r.lastLock.Lock()
	r.lastSnapshotIndex = index
	r.lastSnapshotTerm = term
	r.lastLock.Unlock()
}
|
||||||
|
|
||||||
|
// getCommitIndex atomically loads the highest committed log index.
func (r *raftState) getCommitIndex() uint64 {
	return atomic.LoadUint64(&r.commitIndex)
}

// setCommitIndex atomically stores the highest committed log index.
func (r *raftState) setCommitIndex(index uint64) {
	atomic.StoreUint64(&r.commitIndex, index)
}

// getLastApplied atomically loads the index of the last log applied
// to the FSM.
func (r *raftState) getLastApplied() uint64 {
	return atomic.LoadUint64(&r.lastApplied)
}

// setLastApplied atomically stores the index of the last log applied
// to the FSM.
func (r *raftState) setLastApplied(index uint64) {
	atomic.StoreUint64(&r.lastApplied, index)
}
|
||||||
|
|
||||||
|
// Start a goroutine and properly handle the race between a routine
// starting and incrementing, and exiting and decrementing.
// Add is called before the goroutine is spawned so waitShutdown can
// never observe a zero count while a routine is still launching.
func (r *raftState) goFunc(f func()) {
	r.routinesGroup.Add(1)
	go func() {
		defer r.routinesGroup.Done()
		f()
	}()
}

// waitShutdown blocks until every goroutine started via goFunc
// has returned.
func (r *raftState) waitShutdown() {
	r.routinesGroup.Wait()
}
|
||||||
|
|
||||||
|
// getLastIndex returns the last index in stable storage.
// Either from the last log or from the last snapshot.
func (r *raftState) getLastIndex() uint64 {
	r.lastLock.Lock()
	defer r.lastLock.Unlock()
	return max(r.lastLogIndex, r.lastSnapshotIndex)
}

// getLastEntry returns the last index and term in stable storage.
// Either from the last log or from the last snapshot.
// On an index tie the log's term wins (>=), since the log is the
// more recent source.
func (r *raftState) getLastEntry() (uint64, uint64) {
	r.lastLock.Lock()
	defer r.lastLock.Unlock()
	if r.lastLogIndex >= r.lastSnapshotIndex {
		return r.lastLogIndex, r.lastLogTerm
	}
	return r.lastSnapshotIndex, r.lastSnapshotTerm
}
|
@ -0,0 +1,16 @@
|
|||||||
|
#!/usr/bin/env bash
set -e

# The version must be supplied from the environment. Do not include the
# leading "v".
# NOTE: $VERSION is quoted; the unquoted form breaks word-splitting in
# `test` when the value contains whitespace (or is unset with `set -u`).
if [ -z "$VERSION" ]; then
    echo "Please specify a version."
    exit 1
fi

# Generate the tag.
echo "==> Tagging version $VERSION..."
git commit --allow-empty -a --gpg-sign=348FFC4C -m "Release v$VERSION"
git tag -a -m "Version $VERSION" -s -u 348FFC4C "v${VERSION}" master

exit 0
|
@ -0,0 +1,105 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"io"
|
||||||
|
"log"
|
||||||
|
"net"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
	// errNotAdvertisable is returned when the advertised address
	// resolves to an unspecified IP (e.g. 0.0.0.0) that peers cannot dial.
	errNotAdvertisable = errors.New("local bind address is not advertisable")
	// errNotTCP is returned when the resolved address is not a *net.TCPAddr.
	errNotTCP = errors.New("local address is not a TCP address")
)

// TCPStreamLayer implements StreamLayer interface for plain TCP.
type TCPStreamLayer struct {
	advertise net.Addr         // optional address advertised to peers instead of the bind address
	listener  *net.TCPListener // accepting side of the stream layer
}
|
||||||
|
|
||||||
|
// NewTCPTransport returns a NetworkTransport that is built on top of
|
||||||
|
// a TCP streaming transport layer.
|
||||||
|
func NewTCPTransport(
|
||||||
|
bindAddr string,
|
||||||
|
advertise net.Addr,
|
||||||
|
maxPool int,
|
||||||
|
timeout time.Duration,
|
||||||
|
logOutput io.Writer,
|
||||||
|
) (*NetworkTransport, error) {
|
||||||
|
return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport {
|
||||||
|
return NewNetworkTransport(stream, maxPool, timeout, logOutput)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewTCPTransportWithLogger returns a NetworkTransport that is built on top of
|
||||||
|
// a TCP streaming transport layer, with log output going to the supplied Logger
|
||||||
|
func NewTCPTransportWithLogger(
|
||||||
|
bindAddr string,
|
||||||
|
advertise net.Addr,
|
||||||
|
maxPool int,
|
||||||
|
timeout time.Duration,
|
||||||
|
logger *log.Logger,
|
||||||
|
) (*NetworkTransport, error) {
|
||||||
|
return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport {
|
||||||
|
return NewNetworkTransportWithLogger(stream, maxPool, timeout, logger)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func newTCPTransport(bindAddr string,
|
||||||
|
advertise net.Addr,
|
||||||
|
maxPool int,
|
||||||
|
timeout time.Duration,
|
||||||
|
transportCreator func(stream StreamLayer) *NetworkTransport) (*NetworkTransport, error) {
|
||||||
|
// Try to bind
|
||||||
|
list, err := net.Listen("tcp", bindAddr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create stream
|
||||||
|
stream := &TCPStreamLayer{
|
||||||
|
advertise: advertise,
|
||||||
|
listener: list.(*net.TCPListener),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that we have a usable advertise address
|
||||||
|
addr, ok := stream.Addr().(*net.TCPAddr)
|
||||||
|
if !ok {
|
||||||
|
list.Close()
|
||||||
|
return nil, errNotTCP
|
||||||
|
}
|
||||||
|
if addr.IP.IsUnspecified() {
|
||||||
|
list.Close()
|
||||||
|
return nil, errNotAdvertisable
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the network transport
|
||||||
|
trans := transportCreator(stream)
|
||||||
|
return trans, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dial implements the StreamLayer interface. It opens an outbound TCP
// connection to address, failing if it takes longer than timeout.
func (t *TCPStreamLayer) Dial(address string, timeout time.Duration) (net.Conn, error) {
	return net.DialTimeout("tcp", address, timeout)
}

// Accept implements the net.Listener interface by delegating to the
// underlying TCP listener.
func (t *TCPStreamLayer) Accept() (c net.Conn, err error) {
	return t.listener.Accept()
}

// Close implements the net.Listener interface, closing the underlying
// TCP listener.
func (t *TCPStreamLayer) Close() (err error) {
	return t.listener.Close()
}
|
||||||
|
|
||||||
|
// Addr implements the net.Listener interface.
|
||||||
|
func (t *TCPStreamLayer) Addr() net.Addr {
|
||||||
|
// Use an advertise addr if provided
|
||||||
|
if t.advertise != nil {
|
||||||
|
return t.advertise
|
||||||
|
}
|
||||||
|
return t.listener.Addr()
|
||||||
|
}
|
@ -0,0 +1,24 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestTCPTransport_BadAddr verifies that binding to the unspecified
// address 0.0.0.0 without an advertise address is rejected with
// errNotAdvertisable.
func TestTCPTransport_BadAddr(t *testing.T) {
	_, err := NewTCPTransportWithLogger("0.0.0.0:0", nil, 1, 0, newTestLogger(t))
	if err != errNotAdvertisable {
		t.Fatalf("err: %v", err)
	}
}

// TestTCPTransport_WithAdvertise verifies that supplying an explicit
// advertise address allows binding to 0.0.0.0 and that the transport
// reports the advertise address as its local address.
func TestTCPTransport_WithAdvertise(t *testing.T) {
	addr := &net.TCPAddr{IP: []byte{127, 0, 0, 1}, Port: 12345}
	trans, err := NewTCPTransportWithLogger("0.0.0.0:0", addr, 1, 0, newTestLogger(t))
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if trans.LocalAddr() != "127.0.0.1:12345" {
		t.Fatalf("bad: %v", trans.LocalAddr())
	}
}
|
@ -0,0 +1,124 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"io"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// RPCResponse captures both a response and a potential error.
type RPCResponse struct {
	Response interface{} // RPC-specific response payload
	Error    error       // non-nil if the handler failed
}

// RPC has a command, and provides a response mechanism.
type RPC struct {
	Command  interface{}        // one of the *Request types
	Reader   io.Reader          // Set only for InstallSnapshot
	RespChan chan<- RPCResponse // where Respond delivers the result
}

// Respond is used to respond with a response, error or both.
// The send blocks until the channel accepts the response.
func (r *RPC) Respond(resp interface{}, err error) {
	r.RespChan <- RPCResponse{resp, err}
}
|
||||||
|
|
||||||
|
// Transport provides an interface for network transports
// to allow Raft to communicate with other nodes.
type Transport interface {
	// Consumer returns a channel that can be used to
	// consume and respond to RPC requests.
	Consumer() <-chan RPC

	// LocalAddr is used to return our local address to distinguish from our peers.
	LocalAddr() string

	// AppendEntriesPipeline returns an interface that can be used to pipeline
	// AppendEntries requests.
	AppendEntriesPipeline(target string) (AppendPipeline, error)

	// AppendEntries sends the appropriate RPC to the target node.
	AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error

	// RequestVote sends the appropriate RPC to the target node.
	RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error

	// InstallSnapshot is used to push a snapshot down to a follower. The data is read from
	// the ReadCloser and streamed to the client.
	InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error

	// EncodePeer is used to serialize a peer name.
	EncodePeer(string) []byte

	// DecodePeer is used to deserialize a peer name.
	DecodePeer([]byte) string

	// SetHeartbeatHandler is used to setup a heartbeat handler
	// as a fast-pass. This is to avoid head-of-line blocking from
	// disk IO. If a Transport does not support this, it can simply
	// ignore the call, and push the heartbeat onto the Consumer channel.
	SetHeartbeatHandler(cb func(rpc RPC))
}
|
||||||
|
|
||||||
|
// WithClose is an interface that a transport may provide which
// allows a transport to be shut down cleanly when a Raft instance
// shuts down.
//
// It is defined separately from Transport as unfortunately it wasn't in the
// original interface specification.
type WithClose interface {
	// Close permanently closes a transport, stopping
	// any associated goroutines and freeing other resources.
	Close() error
}

// LoopbackTransport is an interface that provides a loopback transport suitable for testing
// e.g. InmemTransport. It's there so we don't have to rewrite tests.
type LoopbackTransport interface {
	Transport // Embedded transport reference
	WithPeers // Embedded peer management
	WithClose // with a close routine
}

// WithPeers is an interface that a transport may provide which allows for connection and
// disconnection. Unless the transport is a loopback transport, the transport specified to
// "Connect" is likely to be nil.
type WithPeers interface {
	Connect(peer string, t Transport) // Connect a peer
	Disconnect(peer string)           // Disconnect a given peer
	DisconnectAll()                   // Disconnect all peers, possibly to reconnect them later
}
|
||||||
|
|
||||||
|
// AppendPipeline is used for pipelining AppendEntries requests. It is used
// to increase the replication throughput by masking latency and better
// utilizing bandwidth.
type AppendPipeline interface {
	// AppendEntries is used to add another request to the pipeline.
	// The send may block which is an effective form of back-pressure.
	AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error)

	// Consumer returns a channel that can be used to consume
	// response futures when they are ready.
	Consumer() <-chan AppendFuture

	// Close closes the pipeline and cancels all inflight RPCs
	Close() error
}

// AppendFuture is used to return information about a pipelined AppendEntries request.
type AppendFuture interface {
	Future

	// Start returns the time that the append request was started.
	// It is always OK to call this method.
	Start() time.Time

	// Request holds the parameters of the AppendEntries call.
	// It is always OK to call this method.
	Request() *AppendEntriesRequest

	// Response holds the results of the AppendEntries call.
	// This method must only be called after the Error
	// method returns, and will only be valid on success.
	Response() *AppendEntriesResponse
}
|
@ -0,0 +1,313 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Enumeration of the transport implementations exercised by the
// table-driven transport tests below.
const (
	// TT_Inmem selects the in-memory loopback transport.
	TT_Inmem = iota

	// NOTE: must be last — the tests iterate ttype in [0, numTestTransports).
	numTestTransports
)
|
||||||
|
|
||||||
|
func NewTestTransport(ttype int, addr string) (string, LoopbackTransport) {
|
||||||
|
switch ttype {
|
||||||
|
case TT_Inmem:
|
||||||
|
addr, lt := NewInmemTransport(addr)
|
||||||
|
return addr, lt
|
||||||
|
default:
|
||||||
|
panic("Unknown transport type")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestTransport_StartStop verifies every test transport can be created
// and closed cleanly.
func TestTransport_StartStop(t *testing.T) {
	for ttype := 0; ttype < numTestTransports; ttype++ {
		_, trans := NewTestTransport(ttype, "")
		if err := trans.Close(); err != nil {
			t.Fatalf("err: %v", err)
		}
	}
}
|
||||||
|
|
||||||
|
func TestTransport_AppendEntries(t *testing.T) {
|
||||||
|
for ttype := 0; ttype < numTestTransports; ttype++ {
|
||||||
|
addr1, trans1 := NewTestTransport(ttype, "")
|
||||||
|
defer trans1.Close()
|
||||||
|
rpcCh := trans1.Consumer()
|
||||||
|
|
||||||
|
// Make the RPC request
|
||||||
|
args := AppendEntriesRequest{
|
||||||
|
Term: 10,
|
||||||
|
Leader: []byte("cartman"),
|
||||||
|
PrevLogEntry: 100,
|
||||||
|
PrevLogTerm: 4,
|
||||||
|
Entries: []*Log{
|
||||||
|
&Log{
|
||||||
|
Index: 101,
|
||||||
|
Term: 4,
|
||||||
|
Type: LogNoop,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
LeaderCommitIndex: 90,
|
||||||
|
}
|
||||||
|
resp := AppendEntriesResponse{
|
||||||
|
Term: 4,
|
||||||
|
LastLog: 90,
|
||||||
|
Success: true,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Listen for a request
|
||||||
|
go func() {
|
||||||
|
select {
|
||||||
|
case rpc := <-rpcCh:
|
||||||
|
// Verify the command
|
||||||
|
req := rpc.Command.(*AppendEntriesRequest)
|
||||||
|
if !reflect.DeepEqual(req, &args) {
|
||||||
|
t.Fatalf("command mismatch: %#v %#v", *req, args)
|
||||||
|
}
|
||||||
|
rpc.Respond(&resp, nil)
|
||||||
|
|
||||||
|
case <-time.After(200 * time.Millisecond):
|
||||||
|
t.Fatalf("timeout")
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Transport 2 makes outbound request
|
||||||
|
addr2, trans2 := NewTestTransport(ttype, "")
|
||||||
|
defer trans2.Close()
|
||||||
|
|
||||||
|
trans1.Connect(addr2, trans2)
|
||||||
|
trans2.Connect(addr1, trans1)
|
||||||
|
|
||||||
|
var out AppendEntriesResponse
|
||||||
|
if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify the response
|
||||||
|
if !reflect.DeepEqual(resp, out) {
|
||||||
|
t.Fatalf("command mismatch: %#v %#v", resp, out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTransport_AppendEntriesPipeline(t *testing.T) {
|
||||||
|
for ttype := 0; ttype < numTestTransports; ttype++ {
|
||||||
|
addr1, trans1 := NewTestTransport(ttype, "")
|
||||||
|
defer trans1.Close()
|
||||||
|
rpcCh := trans1.Consumer()
|
||||||
|
|
||||||
|
// Make the RPC request
|
||||||
|
args := AppendEntriesRequest{
|
||||||
|
Term: 10,
|
||||||
|
Leader: []byte("cartman"),
|
||||||
|
PrevLogEntry: 100,
|
||||||
|
PrevLogTerm: 4,
|
||||||
|
Entries: []*Log{
|
||||||
|
&Log{
|
||||||
|
Index: 101,
|
||||||
|
Term: 4,
|
||||||
|
Type: LogNoop,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
LeaderCommitIndex: 90,
|
||||||
|
}
|
||||||
|
resp := AppendEntriesResponse{
|
||||||
|
Term: 4,
|
||||||
|
LastLog: 90,
|
||||||
|
Success: true,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Listen for a request
|
||||||
|
go func() {
|
||||||
|
for i := 0; i < 10; i++ {
|
||||||
|
select {
|
||||||
|
case rpc := <-rpcCh:
|
||||||
|
// Verify the command
|
||||||
|
req := rpc.Command.(*AppendEntriesRequest)
|
||||||
|
if !reflect.DeepEqual(req, &args) {
|
||||||
|
t.Fatalf("command mismatch: %#v %#v", *req, args)
|
||||||
|
}
|
||||||
|
rpc.Respond(&resp, nil)
|
||||||
|
|
||||||
|
case <-time.After(200 * time.Millisecond):
|
||||||
|
t.Fatalf("timeout")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Transport 2 makes outbound request
|
||||||
|
addr2, trans2 := NewTestTransport(ttype, "")
|
||||||
|
defer trans2.Close()
|
||||||
|
|
||||||
|
trans1.Connect(addr2, trans2)
|
||||||
|
trans2.Connect(addr1, trans1)
|
||||||
|
|
||||||
|
pipeline, err := trans2.AppendEntriesPipeline(trans1.LocalAddr())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
defer pipeline.Close()
|
||||||
|
for i := 0; i < 10; i++ {
|
||||||
|
out := new(AppendEntriesResponse)
|
||||||
|
if _, err := pipeline.AppendEntries(&args, out); err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
respCh := pipeline.Consumer()
|
||||||
|
for i := 0; i < 10; i++ {
|
||||||
|
select {
|
||||||
|
case ready := <-respCh:
|
||||||
|
// Verify the response
|
||||||
|
if !reflect.DeepEqual(&resp, ready.Response()) {
|
||||||
|
t.Fatalf("command mismatch: %#v %#v", &resp, ready.Response())
|
||||||
|
}
|
||||||
|
case <-time.After(200 * time.Millisecond):
|
||||||
|
t.Fatalf("timeout")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTransport_RequestVote(t *testing.T) {
|
||||||
|
for ttype := 0; ttype < numTestTransports; ttype++ {
|
||||||
|
addr1, trans1 := NewTestTransport(ttype, "")
|
||||||
|
defer trans1.Close()
|
||||||
|
rpcCh := trans1.Consumer()
|
||||||
|
|
||||||
|
// Make the RPC request
|
||||||
|
args := RequestVoteRequest{
|
||||||
|
Term: 20,
|
||||||
|
Candidate: []byte("butters"),
|
||||||
|
LastLogIndex: 100,
|
||||||
|
LastLogTerm: 19,
|
||||||
|
}
|
||||||
|
resp := RequestVoteResponse{
|
||||||
|
Term: 100,
|
||||||
|
Peers: []byte("blah"),
|
||||||
|
Granted: false,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Listen for a request
|
||||||
|
go func() {
|
||||||
|
select {
|
||||||
|
case rpc := <-rpcCh:
|
||||||
|
// Verify the command
|
||||||
|
req := rpc.Command.(*RequestVoteRequest)
|
||||||
|
if !reflect.DeepEqual(req, &args) {
|
||||||
|
t.Fatalf("command mismatch: %#v %#v", *req, args)
|
||||||
|
}
|
||||||
|
|
||||||
|
rpc.Respond(&resp, nil)
|
||||||
|
|
||||||
|
case <-time.After(200 * time.Millisecond):
|
||||||
|
t.Fatalf("timeout")
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Transport 2 makes outbound request
|
||||||
|
addr2, trans2 := NewTestTransport(ttype, "")
|
||||||
|
defer trans2.Close()
|
||||||
|
|
||||||
|
trans1.Connect(addr2, trans2)
|
||||||
|
trans2.Connect(addr1, trans1)
|
||||||
|
|
||||||
|
var out RequestVoteResponse
|
||||||
|
if err := trans2.RequestVote(trans1.LocalAddr(), &args, &out); err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify the response
|
||||||
|
if !reflect.DeepEqual(resp, out) {
|
||||||
|
t.Fatalf("command mismatch: %#v %#v", resp, out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTransport_InstallSnapshot(t *testing.T) {
|
||||||
|
for ttype := 0; ttype < numTestTransports; ttype++ {
|
||||||
|
addr1, trans1 := NewTestTransport(ttype, "")
|
||||||
|
defer trans1.Close()
|
||||||
|
rpcCh := trans1.Consumer()
|
||||||
|
|
||||||
|
// Make the RPC request
|
||||||
|
args := InstallSnapshotRequest{
|
||||||
|
Term: 10,
|
||||||
|
Leader: []byte("kyle"),
|
||||||
|
LastLogIndex: 100,
|
||||||
|
LastLogTerm: 9,
|
||||||
|
Peers: []byte("blah blah"),
|
||||||
|
Size: 10,
|
||||||
|
}
|
||||||
|
resp := InstallSnapshotResponse{
|
||||||
|
Term: 10,
|
||||||
|
Success: true,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Listen for a request
|
||||||
|
go func() {
|
||||||
|
select {
|
||||||
|
case rpc := <-rpcCh:
|
||||||
|
// Verify the command
|
||||||
|
req := rpc.Command.(*InstallSnapshotRequest)
|
||||||
|
if !reflect.DeepEqual(req, &args) {
|
||||||
|
t.Fatalf("command mismatch: %#v %#v", *req, args)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to read the bytes
|
||||||
|
buf := make([]byte, 10)
|
||||||
|
rpc.Reader.Read(buf)
|
||||||
|
|
||||||
|
// Compare
|
||||||
|
if bytes.Compare(buf, []byte("0123456789")) != 0 {
|
||||||
|
t.Fatalf("bad buf %v", buf)
|
||||||
|
}
|
||||||
|
|
||||||
|
rpc.Respond(&resp, nil)
|
||||||
|
|
||||||
|
case <-time.After(200 * time.Millisecond):
|
||||||
|
t.Fatalf("timeout")
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Transport 2 makes outbound request
|
||||||
|
addr2, trans2 := NewTestTransport(ttype, "")
|
||||||
|
defer trans2.Close()
|
||||||
|
|
||||||
|
trans1.Connect(addr2, trans2)
|
||||||
|
trans2.Connect(addr1, trans1)
|
||||||
|
|
||||||
|
// Create a buffer
|
||||||
|
buf := bytes.NewBuffer([]byte("0123456789"))
|
||||||
|
|
||||||
|
var out InstallSnapshotResponse
|
||||||
|
if err := trans2.InstallSnapshot(trans1.LocalAddr(), &args, &out, buf); err != nil {
|
||||||
|
t.Fatalf("err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify the response
|
||||||
|
if !reflect.DeepEqual(resp, out) {
|
||||||
|
t.Fatalf("command mismatch: %#v %#v", resp, out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTransport_EncodeDecode(t *testing.T) {
|
||||||
|
for ttype := 0; ttype < numTestTransports; ttype++ {
|
||||||
|
_, trans1 := NewTestTransport(ttype, "")
|
||||||
|
defer trans1.Close()
|
||||||
|
|
||||||
|
local := trans1.LocalAddr()
|
||||||
|
enc := trans1.EncodePeer(local)
|
||||||
|
dec := trans1.DecodePeer(enc)
|
||||||
|
|
||||||
|
if dec != local {
|
||||||
|
t.Fatalf("enc/dec fail: %v %v", dec, local)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,179 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
crand "crypto/rand"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"math/big"
|
||||||
|
"math/rand"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/hashicorp/go-msgpack/codec"
|
||||||
|
)
|
||||||
|
|
||||||
|
// init seeds math/rand once at package load time.
func init() {
	// Ensure we use a high-entropy seed for the psuedo-random generator
	// (the default seed is deterministic across runs).
	rand.Seed(newSeed())
}
|
||||||
|
|
||||||
|
// newSeed returns an int64 drawn from a crypto random source; it can be
// used to seed a source for math/rand. It panics if the system's secure
// random source cannot be read.
func newSeed() int64 {
	v, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64))
	if err != nil {
		panic(fmt.Errorf("failed to read random bytes: %v", err))
	}
	return v.Int64()
}
|
||||||
|
|
||||||
|
// randomTimeout returns a channel that fires after a random duration
// between minVal and 2x minVal. A zero minVal yields a nil channel,
// which blocks forever in a select.
func randomTimeout(minVal time.Duration) <-chan time.Time {
	if minVal == 0 {
		return nil
	}
	jitter := time.Duration(rand.Int63()) % minVal
	return time.After(minVal + jitter)
}
|
||||||
|
|
||||||
|
// min returns the minimum of a and b.
func min(a, b uint64) uint64 {
	if b < a {
		return b
	}
	return a
}
|
||||||
|
|
||||||
|
// max returns the maximum of a and b.
func max(a, b uint64) uint64 {
	if b > a {
		return b
	}
	return a
}
|
||||||
|
|
||||||
|
// generateUUID is used to generate a random UUID string in the standard
// 8-4-4-4-12 hex layout. It panics if the secure random source fails.
func generateUUID() string {
	var buf [16]byte
	if _, err := crand.Read(buf[:]); err != nil {
		panic(fmt.Errorf("failed to read random bytes: %v", err))
	}

	return fmt.Sprintf("%08x-%04x-%04x-%04x-%12x",
		buf[0:4], buf[4:6], buf[6:8], buf[8:10], buf[10:16])
}
|
||||||
|
|
||||||
|
// asyncNotifyCh is used to do an async channel send
// to a single channel without blocking; the notification is silently
// dropped if the channel is full or has no ready receiver.
func asyncNotifyCh(ch chan struct{}) {
	select {
	case ch <- struct{}{}:
	default:
	}
}

// asyncNotifyBool is used to do an async notification
// on a bool channel; the value is silently dropped if the send
// would block.
func asyncNotifyBool(ch chan bool, v bool) {
	select {
	case ch <- v:
	default:
	}
}
|
||||||
|
|
||||||
|
// ExcludePeer is used to exclude a single peer from a list of peers.
// The input slice is not modified; a new slice is returned.
func ExcludePeer(peers []string, peer string) []string {
	filtered := make([]string, 0, len(peers))
	for _, candidate := range peers {
		if candidate == peer {
			continue
		}
		filtered = append(filtered, candidate)
	}
	return filtered
}
|
||||||
|
|
||||||
|
// PeerContained checks if a given peer is contained in a list.
func PeerContained(peers []string, peer string) bool {
	for i := range peers {
		if peers[i] == peer {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// AddUniquePeer is used to add a peer to a list of existing
// peers only if it is not already contained.
func AddUniquePeer(peers []string, peer string) []string {
	for _, existing := range peers {
		if existing == peer {
			return peers
		}
	}
	return append(peers, peer)
}
|
||||||
|
|
||||||
|
// encodePeers is used to serialize a list of peers into a msgpack byte
// slice using the transport's peer encoding. It panics on encoding
// failure (treated as a programmer error).
func encodePeers(peers []string, trans Transport) []byte {
	// Encode each peer
	var encPeers [][]byte
	for _, p := range peers {
		encPeers = append(encPeers, trans.EncodePeer(p))
	}

	// Encode the entire array
	buf, err := encodeMsgPack(encPeers)
	if err != nil {
		panic(fmt.Errorf("failed to encode peers: %v", err))
	}

	return buf.Bytes()
}
|
||||||
|
|
||||||
|
// decodePeers is used to deserialize a list of peers previously produced
// by encodePeers. It panics on decoding failure (treated as corrupt
// input / programmer error).
func decodePeers(buf []byte, trans Transport) []string {
	// Decode the buffer first
	var encPeers [][]byte
	if err := decodeMsgPack(buf, &encPeers); err != nil {
		panic(fmt.Errorf("failed to decode peers: %v", err))
	}

	// Deserialize each peer
	var peers []string
	for _, enc := range encPeers {
		peers = append(peers, trans.DecodePeer(enc))
	}

	return peers
}
|
||||||
|
|
||||||
|
// Decode reverses the encode operation on a byte slice input,
// unmarshaling msgpack data from buf into out.
func decodeMsgPack(buf []byte, out interface{}) error {
	r := bytes.NewBuffer(buf)
	hd := codec.MsgpackHandle{}
	dec := codec.NewDecoder(r, &hd)
	return dec.Decode(out)
}

// Encode writes an encoded object to a new bytes buffer using msgpack.
// The returned buffer is valid even when err is non-nil, but should
// only be used on success.
func encodeMsgPack(in interface{}) (*bytes.Buffer, error) {
	buf := bytes.NewBuffer(nil)
	hd := codec.MsgpackHandle{}
	enc := codec.NewEncoder(buf, &hd)
	err := enc.Encode(in)
	return buf, err
}
|
||||||
|
|
||||||
|
// backoff is used to compute an exponential backoff
// duration. Base time is scaled by the current round,
// up to some maximum scale factor: the result is base * 2^(min(round, limit)-2)
// for rounds above 2, and base itself otherwise.
func backoff(base time.Duration, round, limit uint64) time.Duration {
	power := round
	if limit < power {
		power = limit
	}
	for ; power > 2; power-- {
		base *= 2
	}
	return base
}
|
@ -0,0 +1,152 @@
|
|||||||
|
package raft
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"regexp"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestRandomTimeout checks that randomTimeout fires no earlier than the
// requested minimum and within the 2x upper bound (with slack).
func TestRandomTimeout(t *testing.T) {
	start := time.Now()
	timeout := randomTimeout(time.Millisecond)

	select {
	case <-timeout:
		diff := time.Now().Sub(start)
		if diff < time.Millisecond {
			t.Fatalf("fired early")
		}
	case <-time.After(3 * time.Millisecond):
		t.Fatalf("timeout")
	}
}

// TestNewSeed checks that repeated calls to newSeed do not repeat —
// 1000 draws from a 63-bit space colliding would indicate a broken
// entropy source.
func TestNewSeed(t *testing.T) {
	vals := make(map[int64]bool)
	for i := 0; i < 1000; i++ {
		seed := newSeed()
		if _, exists := vals[seed]; exists {
			t.Fatal("newSeed() return a value it'd previously returned")
		}
		vals[seed] = true
	}
}

// TestRandomTimeout_NoTime checks the zero-duration special case:
// a nil channel, which blocks forever in a select.
func TestRandomTimeout_NoTime(t *testing.T) {
	timeout := randomTimeout(0)
	if timeout != nil {
		t.Fatalf("expected nil channel")
	}
}
|
||||||
|
|
||||||
|
// TestMin covers equal, greater-first, and greater-second operand orders.
func TestMin(t *testing.T) {
	if min(1, 1) != 1 {
		t.Fatalf("bad min")
	}
	if min(2, 1) != 1 {
		t.Fatalf("bad min")
	}
	if min(1, 2) != 1 {
		t.Fatalf("bad min")
	}
}

// TestMax covers equal, greater-first, and greater-second operand orders.
func TestMax(t *testing.T) {
	if max(1, 1) != 1 {
		t.Fatalf("bad max")
	}
	if max(2, 1) != 2 {
		t.Fatalf("bad max")
	}
	if max(1, 2) != 2 {
		t.Fatalf("bad max")
	}
}
|
||||||
|
|
||||||
|
func TestGenerateUUID(t *testing.T) {
|
||||||
|
prev := generateUUID()
|
||||||
|
for i := 0; i < 100; i++ {
|
||||||
|
id := generateUUID()
|
||||||
|
if prev == id {
|
||||||
|
t.Fatalf("Should get a new ID!")
|
||||||
|
}
|
||||||
|
|
||||||
|
matched, err := regexp.MatchString(
|
||||||
|
`[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}`, id)
|
||||||
|
if !matched || err != nil {
|
||||||
|
t.Fatalf("expected match %s %v %s", id, matched, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExcludePeer(t *testing.T) {
|
||||||
|
peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()}
|
||||||
|
peer := peers[2]
|
||||||
|
|
||||||
|
after := ExcludePeer(peers, peer)
|
||||||
|
if len(after) != 2 {
|
||||||
|
t.Fatalf("Bad length")
|
||||||
|
}
|
||||||
|
if after[0] == peer || after[1] == peer {
|
||||||
|
t.Fatalf("should not contain peer")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPeerContained(t *testing.T) {
|
||||||
|
peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()}
|
||||||
|
|
||||||
|
if !PeerContained(peers, peers[2]) {
|
||||||
|
t.Fatalf("Expect contained")
|
||||||
|
}
|
||||||
|
if PeerContained(peers, NewInmemAddr()) {
|
||||||
|
t.Fatalf("unexpected contained")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAddUniquePeer(t *testing.T) {
|
||||||
|
peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()}
|
||||||
|
after := AddUniquePeer(peers, peers[2])
|
||||||
|
if !reflect.DeepEqual(after, peers) {
|
||||||
|
t.Fatalf("unexpected append")
|
||||||
|
}
|
||||||
|
after = AddUniquePeer(peers, NewInmemAddr())
|
||||||
|
if len(after) != 4 {
|
||||||
|
t.Fatalf("expected append")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEncodeDecodePeers(t *testing.T) {
|
||||||
|
peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()}
|
||||||
|
_, trans := NewInmemTransport("")
|
||||||
|
|
||||||
|
// Try to encode/decode
|
||||||
|
buf := encodePeers(peers, trans)
|
||||||
|
decoded := decodePeers(buf, trans)
|
||||||
|
|
||||||
|
if !reflect.DeepEqual(peers, decoded) {
|
||||||
|
t.Fatalf("mismatch %v %v", peers, decoded)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBackoff(t *testing.T) {
|
||||||
|
b := backoff(10*time.Millisecond, 1, 8)
|
||||||
|
if b != 10*time.Millisecond {
|
||||||
|
t.Fatalf("bad: %v", b)
|
||||||
|
}
|
||||||
|
|
||||||
|
b = backoff(20*time.Millisecond, 2, 8)
|
||||||
|
if b != 20*time.Millisecond {
|
||||||
|
t.Fatalf("bad: %v", b)
|
||||||
|
}
|
||||||
|
|
||||||
|
b = backoff(10*time.Millisecond, 8, 8)
|
||||||
|
if b != 640*time.Millisecond {
|
||||||
|
t.Fatalf("bad: %v", b)
|
||||||
|
}
|
||||||
|
|
||||||
|
b = backoff(10*time.Millisecond, 9, 8)
|
||||||
|
if b != 640*time.Millisecond {
|
||||||
|
t.Fatalf("bad: %v", b)
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue