
Merge pull request #600 from rqlite/hashicorp_v1_go_mod

Move to Hashicorp Raft v1
master
Philip O'Toole 5 years ago committed by GitHub
commit 6c5c7d60b5

@@ -1,10 +1,13 @@
## 4.7.0 (Unreleased)
## 5.0.0 (Unreleased)
This release moves to Hashicorp Raft v1, a new version of the underlying Raft consensus implementation. As a result **the Raft system in 5.0 is not compatible with the 4.0 series**. To upgrade from an earlier version, back up your 4.0 leader node and restore the database dump into a new 5.0 cluster, as sketched below. The HTTP API, however, remains unchanged.
### New features
- [PR #595](https://github.com/rqlite/rqlite/pull/595): rqlite CLI prints Welcome message on startup.
### Implementation changes and bug fixes
- [PR #597](https://github.com/rqlite/rqlite/pull/597): Don't ignore any Join error, instead return it.
- [PR #598](https://github.com/rqlite/rqlite/pull/598): Ensure backup is correctly closed.
- [PR #600](https://github.com/rqlite/rqlite/pull/600): Move to Hashicorp Raft v1.
## 4.6.0 (November 29th 2019)
_This release adds significant new functionality to the command-line tool, including much more control over backup and restore of the database. [Visit the Releases page](https://github.com/rqlite/rqlite/releases/tag/v4.6.0) to download this release._
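The upgrade path in the 5.0.0 note above can be scripted against the HTTP API. A minimal Go sketch, assuming the `/db/backup` and `/db/load` endpoints (the latter appears in the end-to-end tests further down); host names are placeholders and the content type on the load is illustrative:
```go
package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	// Fetch a dump of the database from the 4.0 leader (addresses here
	// are placeholders).
	resp, err := http.Get("http://old-leader:4001/db/backup")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	dump, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}

	// Load the dump into a node of the new 5.0 cluster. The HTTP API is
	// unchanged between the series, so the same endpoints exist on both.
	loadResp, err := http.Post("http://new-leader:4001/db/load",
		"application/octet-stream", bytes.NewReader(dump))
	if err != nil {
		panic(err)
	}
	defer loadResp.Body.Close()
	fmt.Println("load request returned", loadResp.Status)
}
```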

@@ -16,24 +16,27 @@ Let's say you have 3 host machines, _host1_, _host2_, and _host3_, and that each
To create a cluster you must first launch a node that can act as the initial leader. Do this as follows on _host1_:
```bash
host1:$ rqlited -http-addr host1:4001 -raft-addr host1:4002 ~/node
host1:$ rqlited -node-id 1 -http-addr host1:4001 -raft-addr host1:4002 ~/node
```
With this command a single node is started, listening for API requests on port 4001, and for intra-cluster communication and join requests from other nodes on port 4002. This node stores its state at `~/node`.
To join a second node to this leader, execute the following command on _host2_:
```bash
host2:$ rqlited -http-addr host2:4001 -raft-addr host2:4002 -join http://host1:4001 ~/node
host2:$ rqlited -node-id 2 -http-addr host2:4001 -raft-addr host2:4002 -join http://host1:4001 ~/node
```
_If a node receives a join request, but is not actually the leader of the cluster, it will automatically redirect the requesting node to the leader. As a result a node can join a cluster by contacting any node in the cluster. You can also specify multiple join addresses, and the node will try each address until it joins successfully._
Once executed you now have a cluster of two nodes. Of course, for fault-tolerance you need a 3-node cluster, so launch a third node like so on _host3_:
```bash
host3:$ rqlited -http-addr host3:4001 -raft-addr host3:4002 -join http://host1:4001 ~/node
host3:$ rqlited -node-id 3 -http-addr host3:4001 -raft-addr host3:4002 -join http://host1:4001 ~/node
```
_When restarting a node, there is no further need to pass `-join`. It will be ignored if a node is already a member of a cluster._
You've now got a fault-tolerant, distributed, relational database. It can tolerate the failure of any node, even the leader, and remain operational.
## Node IDs
You can set the Node ID (`-node-id`) to anything you wish, as long as it's unique for each node.
## Listening on all interfaces
You can pass `0.0.0.0` to both `-http-addr` and `-raft-addr` if you wish a node to listen on all interfaces. You must still pass an explicit network address to `-join`, however. In this case you'll also want to set `-http-adv-addr` to the actual interface address, so other nodes learn the correct network address to use to reach the node listening on `0.0.0.0`.
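On the wire, `-join` amounts to an HTTP POST against the target node's join endpoint, and as of 5.0 the body carries the node's unique ID as well as its Raft address (see the `cluster` and HTTP service changes below). A minimal hand-rolled equivalent in Go, assuming the `/join` path and placeholder addresses:
```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// The same JSON body rqlited now sends when joining a cluster.
	b, err := json.Marshal(map[string]string{
		"id":   "2",          // the joining node's unique ID
		"addr": "host2:4002", // its Raft network address
	})
	if err != nil {
		panic(err)
	}
	resp, err := http.Post("http://host1:4001/join", "application/json",
		bytes.NewReader(b))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("join response:", resp.Status)
}
```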

@@ -1,4 +1,4 @@
version: 4.{build}
version: 5.{build}
platform: x64

@@ -19,10 +19,10 @@ import (
const numAttempts int = 3
const attemptInterval time.Duration = 5 * time.Second
// Join attempts to join the cluster at one of the addresses given in joinAddr.
// It walks through joinAddr in order, and sets the Raft address of the joining
// node as advAddr. It returns the endpoint successfully used to join the cluster.
func Join(joinAddr []string, advAddr string, tlsConfig *tls.Config) (string, error) {
// It walks through joinAddr in order, and sets the node ID and Raft address
// of the joining node to nodeID and advAddr respectively. It returns the
// endpoint successfully used to join the cluster.
func Join(joinAddr []string, nodeID, advAddr string, tlsConfig *tls.Config) (string, error) {
var err error
var j string
logger := log.New(os.Stderr, "[cluster-join] ", log.LstdFlags)
@@ -32,7 +32,7 @@ func Join(joinAddr []string, advAddr string, tlsConfig *tls.Config) (string, err
for i := 0; i < numAttempts; i++ {
for _, a := range joinAddr {
j, err = join(a, advAddr, tlsConfig, logger)
j, err = join(a, nodeID, advAddr, tlsConfig, logger)
if err == nil {
// Success!
return j, nil
@@ -45,7 +45,11 @@ func Join(joinAddr []string, advAddr string, tlsConfig *tls.Config) (string, err
return "", err
}
func join(joinAddr string, advAddr string, tlsConfig *tls.Config, logger *log.Logger) (string, error) {
func join(joinAddr, nodeID, advAddr string, tlsConfig *tls.Config, logger *log.Logger) (string, error) {
if nodeID == "" {
return "", fmt.Errorf("node ID not set")
}
// Join using the IP address, as that is what Hashicorp Raft works with.
resv, err := net.ResolveTCPAddr("tcp", advAddr)
if err != nil {
@@ -65,10 +69,10 @@ func join(joinAddr string, advAddr string, tlsConfig *tls.Config, logger *log.Lo
}
for {
b, err := json.Marshal(map[string]string{"addr": resv.String()})
if err != nil {
return "", err
}
b, err := json.Marshal(map[string]string{
"id": nodeID,
"addr": resv.String(),
})
if err != nil {
return "", err
}
// Attempt to join.
resp, err := client.Post(fullAddr, "application/json", bytes.NewReader(b))
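From the caller's side, the updated `Join` signature now threads the node ID through. A minimal sketch, with hypothetical join addresses and node ID; a nil `*tls.Config` means plain HTTP here:
```go
package main

import (
	"log"

	"github.com/rqlite/rqlite/cluster"
)

func main() {
	// Each address is tried in turn, up to numAttempts times.
	addrs := []string{"http://host1:4001", "http://host2:4001"}
	j, err := cluster.Join(addrs, "node3", "host3:4002", nil)
	if err != nil {
		log.Fatalf("failed to join cluster: %s", err.Error())
	}
	log.Println("successfully joined cluster via", j)
}
```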

@@ -16,7 +16,7 @@ func Test_SingleJoinOK(t *testing.T) {
}))
defer ts.Close()
j, err := Join([]string{ts.URL}, "127.0.0.1:9090", nil)
j, err := Join([]string{ts.URL}, "id0", "127.0.0.1:9090", nil)
if err != nil {
t.Fatalf("failed to join a single node: %s", err.Error())
}
@@ -31,7 +31,7 @@ func Test_SingleJoinFail(t *testing.T) {
}))
defer ts.Close()
_, err := Join([]string{ts.URL}, "127.0.0.1:9090", nil)
_, err := Join([]string{ts.URL}, "id0", "127.0.0.1:9090", nil)
if err == nil {
t.Fatalf("expected error when joining bad node")
}
@@ -45,7 +45,7 @@ func Test_DoubleJoinOK(t *testing.T) {
}))
defer ts2.Close()
j, err := Join([]string{ts1.URL, ts2.URL}, "127.0.0.1:9090", nil)
j, err := Join([]string{ts1.URL, ts2.URL}, "id0", "127.0.0.1:9090", nil)
if err != nil {
t.Fatalf("failed to join a single node: %s", err.Error())
}
@@ -63,7 +63,7 @@ func Test_DoubleJoinOKSecondNode(t *testing.T) {
}))
defer ts2.Close()
j, err := Join([]string{ts1.URL, ts2.URL}, "127.0.0.1:9090", nil)
j, err := Join([]string{ts1.URL, ts2.URL}, "id0", "127.0.0.1:9090", nil)
if err != nil {
t.Fatalf("failed to join a single node: %s", err.Error())
}
@@ -83,7 +83,7 @@ func Test_DoubleJoinOKSecondNodeRedirect(t *testing.T) {
}))
defer ts2.Close()
j, err := Join([]string{ts2.URL}, "127.0.0.1:9090", nil)
j, err := Join([]string{ts2.URL}, "id0", "127.0.0.1:9090", nil)
if err != nil {
t.Fatalf("failed to join a single node: %s", err.Error())
}

@@ -40,7 +40,7 @@ const logo = `
// These variables are populated via the Go linker.
var (
version = "4"
version = "5"
commit = "unknown"
branch = "unknown"
buildtime = "unknown"
@@ -66,6 +66,7 @@ var nodeEncrypt bool
var nodeX509CACert string
var nodeX509Cert string
var nodeX509Key string
var nodeID string
var raftAddr string
var raftAdv string
var joinAddr string
@@ -92,6 +93,7 @@ const desc = `rqlite is a lightweight, distributed relational database, which us
storage engine. It provides an easy-to-use, fault-tolerant store for relational data.`
func init() {
flag.StringVar(&nodeID, "node-id", "", "Unique name for node. If not set, set to hostname")
flag.StringVar(&httpAddr, "http-addr", "localhost:4001", "HTTP server bind address. For HTTPS, set X.509 cert and key")
flag.StringVar(&httpAdv, "http-adv-addr", "", "Advertised HTTP address. If not set, same as HTTP server")
flag.StringVar(&x509CACert, "http-ca-cert", "", "Path to root X.509 certificate for HTTP endpoint")
@@ -196,10 +198,15 @@ func main() {
}
dbConf := store.NewDBConfig(dsn, !onDisk)
nid, err := idOrHostname()
if err != nil {
log.Fatalf("failed to determine node ID: %s", err.Error())
}
str := store.New(&store.StoreConfig{
DBConf: dbConf,
Dir: dataPath,
Tn: raftTn,
ID: nid,
})
// Set optional parameters on store.
@@ -271,7 +278,7 @@ func main() {
}
}
if j, err := cluster.Join(joins, advAddr, &tlsConfig); err != nil {
if j, err := cluster.Join(joins, nid, advAddr, &tlsConfig); err != nil {
log.Fatalf("failed to join cluster at %s: %s", joins, err.Error())
} else {
log.Println("successfully joined cluster at", j)
@@ -408,6 +415,13 @@ func credentialStore() (*auth.CredentialsStore, error) {
return cs, nil
}
func idOrHostname() (string, error) {
if nodeID != "" {
return nodeID, nil
}
return os.Hostname()
}
// prof stores the file locations of active profiles.
var prof struct {
cpu *os.File

@@ -0,0 +1,16 @@
module github.com/rqlite/rqlite
go 1.13
require (
github.com/Bowery/prompt v0.0.0-20190916142128-fa8279994f75
github.com/hashicorp/raft v1.1.1
github.com/hashicorp/raft-boltdb v0.0.0-20191021154308-4207f1bf0617
github.com/labstack/gommon v0.3.0 // indirect
github.com/mattn/go-colorable v0.1.4 // indirect
github.com/mattn/go-isatty v0.0.11 // indirect
github.com/mattn/go-sqlite3 v2.0.2+incompatible
github.com/mkideal/cli v0.0.3
github.com/mkideal/pkg v0.0.0-20170503154153-3e188c9e7ecc
golang.org/x/crypto v0.0.0-20191219195013-becbf705a915
)

@@ -0,0 +1,78 @@
github.com/Bowery/prompt v0.0.0-20190916142128-fa8279994f75 h1:xGHheKK44eC6K0u5X+DZW/fRaR1LnDdqPHMZMWx5fv8=
github.com/Bowery/prompt v0.0.0-20190916142128-fa8279994f75/go.mod h1:4/6eNcqZ09BZ9wLK3tZOjBA1nDj+B0728nlX5YRlSmQ=
github.com/DataDog/datadog-go v2.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
github.com/armon/go-metrics v0.0.0-20190430140413-ec5e00d3c878 h1:EFSB7Zo9Eg91v7MJPVsifUysc/wPdN+NOnVe6bWbdBM=
github.com/armon/go-metrics v0.0.0-20190430140413-ec5e00d3c878/go.mod h1:3AMJUQhVx52RsWOnlkpikZr01T/yAVN2gn0861vByNg=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/boltdb/bolt v1.3.1 h1:JQmyP4ZBrce+ZQu0dY660FMfatumYDLun9hBCUVIkF4=
github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag=
github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80=
github.com/hashicorp/go-hclog v0.9.1 h1:9PZfAcVEvez4yhLH2TBU64/h/z4xlFI80cWXRrxuKuM=
github.com/hashicorp/go-hclog v0.9.1/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ=
github.com/hashicorp/go-immutable-radix v1.0.0 h1:AKDB1HM5PWEA7i4nhcpwOrO2byshxBjXVn/J/3+z5/0=
github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60=
github.com/hashicorp/go-msgpack v0.5.5 h1:i9R9JSrqIz0QVLz3sz+i3YJdT7TTSLcfLLzJi9aZTuI=
github.com/hashicorp/go-msgpack v0.5.5/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM=
github.com/hashicorp/go-retryablehttp v0.5.3/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs=
github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/golang-lru v0.5.0 h1:CL2msUPvZTLb5O648aiLNJw3hnBxN2+1Jq8rCOH9wdo=
github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/raft v1.1.0/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM=
github.com/hashicorp/raft v1.1.1 h1:HJr7UE1x/JrJSc9Oy6aDBHtNHUUBHjcQjTgvUVihoZs=
github.com/hashicorp/raft v1.1.1/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
github.com/hashicorp/raft-boltdb v0.0.0-20191021154308-4207f1bf0617 h1:CJDRE/2tBNFOrcoexD2nvTRbQEox3FDxl4NxIezp1b8=
github.com/hashicorp/raft-boltdb v0.0.0-20191021154308-4207f1bf0617/go.mod h1:aUF6HQr8+t3FC/ZHAC+pZreUBhTaxumuu3L+d37uRxk=
github.com/labstack/gommon v0.3.0 h1:JEeO0bvc78PKdyHxloTKiF8BD5iGrH8T6MSeGvSgob0=
github.com/labstack/gommon v0.3.0/go.mod h1:MULnywXg0yavhxWKc+lOruYdAhDwPK9wf0OL7NoOu+k=
github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE=
github.com/mattn/go-colorable v0.1.4 h1:snbPLB8fVfU9iwbbo30TPtbLRzwWu6aJS6Xh4eaaviA=
github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE=
github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
github.com/mattn/go-isatty v0.0.9/go.mod h1:YNRxwqDuOph6SZLI9vUUz6OYw3QyUt7WiY2yME+cCiQ=
github.com/mattn/go-isatty v0.0.11 h1:FxPOTFNqGkuDUGi3H/qkUbQO4ZiBa2brKq5r0l8TGeM=
github.com/mattn/go-isatty v0.0.11/go.mod h1:PhnuNfih5lzO57/f3n+odYbM4JtupLOxQOAqxQCu2WE=
github.com/mattn/go-sqlite3 v2.0.2+incompatible h1:qzw9c2GNT8UFrgWNDhCTqRqYUSmu/Dav/9Z58LGpk7U=
github.com/mattn/go-sqlite3 v2.0.2+incompatible/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/mkideal/cli v0.0.3 h1:Y1OXyfTVI9eQ9RTiXq12h7q88y22Q9ZU4VI09ifz6lE=
github.com/mkideal/cli v0.0.3/go.mod h1:HLuSls75T7LFlTgByGeuLwcvdUmmx/aUQxnnEKxoZzY=
github.com/mkideal/pkg v0.0.0-20170503154153-3e188c9e7ecc h1:eyN9UWVX+CeeCQZPudCUAPc84xQYTjEu9MWNa2HuJrs=
github.com/mkideal/pkg v0.0.0-20170503154153-3e188c9e7ecc/go.mod h1:DECgB56amjU/mmmsKuooNPQ1856HASOMC3D4ntSVU70=
github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v0.9.2/go.mod h1:OsXs2jCmiKlQ1lTBmv21f2mNfw4xf/QclQDMrYNZzcM=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/common v0.0.0-20181126121408-4724e9255275/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
github.com/prometheus/procfs v0.0.0-20181204211112-1dc9a6cbc91a/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasttemplate v1.0.1/go.mod h1:UQGH1tvbgY+Nz5t2n7tXsz52dQxojPUpymEIMZ47gx8=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191219195013-becbf705a915 h1:aJ0ex187qoXrJHPo8ZasVTASQB7llQP6YeNzgDALPRk=
golang.org/x/crypto v0.0.0-20191219195013-becbf705a915/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/net v0.0.0-20181201002055-351d144fa1fc/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190523142557-0e01d883c5c5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190602015325-4c4f7f33c9ed h1:uPxWBzB3+mlnjy9W58qY1j/cjyFjutgw/Vhan2zLy/A=
golang.org/x/sys v0.0.0-20190602015325-4c4f7f33c9ed/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037 h1:YyJpGZS1sBuBCzLAR1VEpK193GlqGZbnPFnPV/5Rsb4=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

@@ -43,8 +43,8 @@ type Store interface {
// is held on the database.
Query(qr *store.QueryRequest) ([]*sql.Rows, error)
// Join joins the node, reachable at addr, to this node.
Join(addr string) error
// Join joins the node with the given ID, reachable at addr, to this node.
Join(id, addr string) error
// Remove removes the node, specified by addr, from the cluster.
Remove(addr string) error
@@ -295,7 +295,8 @@ func (s *Service) handleJoin(w http.ResponseWriter, r *http.Request) {
return
}
if len(m) != 1 {
remoteID, ok := m["id"]
if !ok {
w.WriteHeader(http.StatusBadRequest)
return
}
@@ -306,7 +307,7 @@ func (s *Service) handleJoin(w http.ResponseWriter, r *http.Request) {
return
}
if err := s.store.Join(remoteAddr); err != nil {
if err := s.store.Join(remoteID, remoteAddr); err != nil {
if err == store.ErrNotLeader {
leader := s.store.Peer(s.store.Leader())
if leader == "" {
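Taken together, the handler fragments above amount to this flow: decode the body, require both `id` and `addr`, then delegate to the store. A simplified, self-contained sketch (leader-redirect handling elided; the package name and the `joiner` stand-in for the Store interface are choices made for the sketch):
```go
package service

import (
	"encoding/json"
	"net/http"
)

// joiner is the slice of the Store interface this sketch needs.
type joiner interface {
	Join(id, addr string) error
}

// handleJoin requires both "id" and "addr" in the request body before
// asking the store to add the node.
func handleJoin(str joiner, w http.ResponseWriter, r *http.Request) {
	m := map[string]string{}
	if err := json.NewDecoder(r.Body).Decode(&m); err != nil {
		w.WriteHeader(http.StatusBadRequest)
		return
	}
	remoteID, ok := m["id"]
	if !ok {
		w.WriteHeader(http.StatusBadRequest)
		return
	}
	remoteAddr, ok := m["addr"]
	if !ok {
		w.WriteHeader(http.StatusBadRequest)
		return
	}
	if err := str.Join(remoteID, remoteAddr); err != nil {
		// The real handler redirects to the leader on store.ErrNotLeader.
		w.WriteHeader(http.StatusServiceUnavailable)
	}
}
```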

@@ -491,7 +491,7 @@ func (m *MockStore) Query(qr *store.QueryRequest) ([]*sql.Rows, error) {
return nil, nil
}
func (m *MockStore) Join(addr string) error {
func (m *MockStore) Join(id, addr string) error {
return nil
}

@@ -0,0 +1,41 @@
package store
import (
"encoding/json"
)
// commandType defines commands that affect the state of the cluster, and must go through Raft.
type commandType int
const (
execute commandType = iota // Commands which modify the database.
query // Commands which query the database.
peer // Commands that modify peers map.
)
type command struct {
Typ commandType `json:"typ,omitempty"`
Sub json.RawMessage `json:"sub,omitempty"`
}
func newCommand(t commandType, d interface{}) (*command, error) {
b, err := json.Marshal(d)
if err != nil {
return nil, err
}
return &command{
Typ: t,
Sub: b,
}, nil
}
// databaseSub is a command sub which involves interaction with the database.
type databaseSub struct {
Tx bool `json:"tx,omitempty"`
Queries []string `json:"queries,omitempty"`
Timings bool `json:"timings,omitempty"`
}
// peersSub is a command which sets the API address for a Raft address.
type peersSub map[string]string
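To make the relationship between these types concrete, a self-contained sketch that re-declares them and marshals a `databaseSub` inside a `command`, producing the JSON payload the store applies via Raft; the example query is illustrative:
```go
package main

import (
	"encoding/json"
	"fmt"
)

// Copies of the store types above, so this sketch runs on its own.
type commandType int

const (
	execute commandType = iota
	query
	peer
)

type command struct {
	Typ commandType     `json:"typ,omitempty"`
	Sub json.RawMessage `json:"sub,omitempty"`
}

type databaseSub struct {
	Tx      bool     `json:"tx,omitempty"`
	Queries []string `json:"queries,omitempty"`
	Timings bool     `json:"timings,omitempty"`
}

func main() {
	sub := databaseSub{
		Tx:      true,
		Queries: []string{`INSERT INTO foo(name) VALUES("fiona")`},
	}
	b, err := json.Marshal(sub)
	if err != nil {
		panic(err)
	}
	c := command{Typ: execute, Sub: b}
	out, _ := json.Marshal(c)
	// This JSON document is the log entry payload applied via Raft.
	// (With omitempty set, Typ is dropped for execute, the zero value.)
	fmt.Println(string(out))
}
```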

@@ -1,99 +0,0 @@
package store
import (
"os"
"sort"
"testing"
"time"
)
// Test_NumPeersEnableSingle tests that a single node reports
// itself as capable of joining a cluster.
func Test_NumPeersEnableSingle(t *testing.T) {
s0 := mustNewStore(true)
defer os.RemoveAll(s0.Path())
if err := s0.Open(true); err != nil {
t.Fatalf("failed to open node for num peers test: %s", err.Error())
}
s0.WaitForLeader(5 * time.Second)
s0.Close(true)
j, err := JoinAllowed(s0.Path())
if err != nil {
t.Fatalf("failed to check join status of %s: %s", s0.Path(), err.Error())
}
if !j {
t.Fatalf("config files at %s indicate joining is not allowed", s0.Path())
}
}
// Test_NumPeersDisableSingle tests that a single node reports
// itself as capable of joining a cluster, when explicitly configured
// as not capable of self-electing.
func Test_NumPeersDisableSingle(t *testing.T) {
s0 := mustNewStore(true)
defer os.RemoveAll(s0.Path())
if err := s0.Open(false); err != nil {
t.Fatalf("failed to open node for num peers test: %s", err.Error())
}
s0.Close(true)
j, err := JoinAllowed(s0.Path())
if err != nil {
t.Fatalf("failed to check join status of %s: %s", s0.Path(), err.Error())
}
if !j {
t.Fatalf("config files at %s indicate joining is not allowed", s0.Path())
}
}
// Test_NumPeersJoin tests that the correct number of nodes are recorded by
// nodes in a cluster.
func Test_NumPeersJoin(t *testing.T) {
s0 := mustNewStore(true)
defer os.RemoveAll(s0.Path())
if err := s0.Open(true); err != nil {
t.Fatalf("failed to open node for num peers test: %s", err.Error())
}
s0.WaitForLeader(5 * time.Second)
s1 := mustNewStore(true)
defer os.RemoveAll(s1.Path())
if err := s1.Open(false); err != nil {
t.Fatalf("failed to open node for num peers test: %s", err.Error())
}
// Get sorted list of cluster nodes.
storeNodes := []string{s0.Addr().String(), s1.Addr().String()}
sort.StringSlice(storeNodes).Sort()
// Join the second node to the first.
if err := s0.Join(s1.Addr().String()); err != nil {
t.Fatalf("failed to join to node at %s: %s", s0.Addr().String(), err.Error())
}
s1.WaitForLeader(5 * time.Second)
s1.Close(true)
s0.Close(true)
// Check that peers are set as expected.
m, _ := NumPeers(s0.Path())
if m != 2 {
t.Fatalf("got wrong value for number of peers, exp %d, got %d", 2, m)
}
j, err := JoinAllowed(s0.Path())
if err != nil {
t.Fatalf("failed to check join status of %s: %s", s0.Path(), err.Error())
}
if j {
t.Fatalf("config files at %s indicate joining is allowed", s0.Path())
}
k, err := JoinAllowed(s1.Path())
if err != nil {
t.Fatalf("failed to check join status of %s: %s", s1.Path(), err.Error())
}
if k {
t.Fatalf("config files at %s indicate joining is allowed", s1.Path())
}
}

@@ -16,6 +16,7 @@ import (
"net"
"os"
"path/filepath"
"sort"
"sync"
"time"
@@ -91,50 +92,6 @@ type ExecuteRequest struct {
Tx bool
}
// Transport is the interface the network service must provide.
type Transport interface {
net.Listener
// Dial is used to create a new outgoing connection
Dial(address string, timeout time.Duration) (net.Conn, error)
}
// commandType defines commands that affect the state of the cluster, and must go through Raft.
type commandType int
const (
execute commandType = iota // Commands which modify the database.
query // Commands which query the database.
peer // Commands that modify peers map.
)
type command struct {
Typ commandType `json:"typ,omitempty"`
Sub json.RawMessage `json:"sub,omitempty"`
}
func newCommand(t commandType, d interface{}) (*command, error) {
b, err := json.Marshal(d)
if err != nil {
return nil, err
}
return &command{
Typ: t,
Sub: b,
}, nil
}
// databaseSub is a command sub which involves interaction with the database.
type databaseSub struct {
Tx bool `json:"tx,omitempty"`
Queries []string `json:"queries,omitempty"`
Timings bool `json:"timings,omitempty"`
}
// peersSub is a command which sets the API address for a Raft address.
type peersSub map[string]string
// ConsistencyLevel represents the available read consistency levels.
type ConsistencyLevel int
@@ -199,19 +156,30 @@ func NewDBConfig(dsn string, memory bool) *DBConfig {
return &DBConfig{DSN: dsn, Memory: memory}
}
// Server represents another node in the cluster.
type Server struct {
ID string `json:"id,omitempty"`
Addr string `json:"addr,omitempty"`
}
type Servers []*Server
func (s Servers) Less(i, j int) bool { return s[i].ID < s[j].ID }
func (s Servers) Len() int { return len(s) }
func (s Servers) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
// Store is a SQLite database, where all changes are made via Raft consensus.
type Store struct {
raftDir string
mu sync.RWMutex // Sync access between queries and snapshots.
raft *raft.Raft // The consensus mechanism.
raftTransport Transport
peerStore raft.PeerStore
dbConf *DBConfig // SQLite database config.
dbPath string // Path to underlying SQLite file, if not in-memory.
db *sql.DB // The underlying SQLite store.
joinRequired bool // Whether an explicit join is required.
raft *raft.Raft // The consensus mechanism.
raftTn *raftTransport
raftID string // Node ID.
dbConf *DBConfig // SQLite database config.
dbPath string // Path to underlying SQLite file, if not in-memory.
db *sql.DB // The underlying SQLite store.
metaMu sync.RWMutex
meta *clusterMeta
@@ -229,11 +197,11 @@ type Store struct {
// StoreConfig represents the configuration of the underlying Store.
type StoreConfig struct {
DBConf *DBConfig // The DBConfig object for this Store.
Dir string // The working directory for raft.
Tn Transport // The underlying Transport for raft.
Logger *log.Logger // The logger to use to log stuff.
PeerStore raft.PeerStore // The PeerStore to use for raft.
DBConf *DBConfig // The DBConfig object for this Store.
Dir string // The working directory for raft.
Tn Transport // The underlying Transport for raft.
ID string // Node ID.
Logger *log.Logger // The logger to use to log stuff.
}
// New returns a new Store.
@ -244,22 +212,24 @@ func New(c *StoreConfig) *Store {
}
return &Store{
raftDir: c.Dir,
raftTransport: c.Tn,
dbConf: c.DBConf,
dbPath: filepath.Join(c.Dir, sqliteFile),
meta: newClusterMeta(),
logger: logger,
peerStore: c.PeerStore,
ApplyTimeout: applyTimeout,
OpenTimeout: openTimeout,
raftDir: c.Dir,
raftTn: &raftTransport{c.Tn},
raftID: c.ID,
dbConf: c.DBConf,
dbPath: filepath.Join(c.Dir, sqliteFile),
meta: newClusterMeta(),
logger: logger,
ApplyTimeout: applyTimeout,
OpenTimeout: openTimeout,
}
}
// Open opens the store. If enableSingle is set, and there are no existing peers,
// then this node becomes the first node, and therefore leader, of the cluster.
func (s *Store) Open(enableSingle bool) error {
s.logger.Printf("ensuring %s exists", s.raftDir)
s.logger.Printf("opening store with node ID %s", s.raftID)
s.logger.Printf("ensuring directory at %s exists", s.raftDir)
if err := os.MkdirAll(s.raftDir, 0755); err != nil {
return err
}
@@ -270,31 +240,17 @@ func (s *Store) Open(enableSingle bool) error {
}
s.db = db
// Setup Raft communication.
transport := raft.NewNetworkTransport(s.raftTransport, 3, 10*time.Second, os.Stderr)
// Is this a brand new node?
newNode := !pathExists(filepath.Join(s.raftDir, "raft.db"))
// Create peer storage if necessary.
if s.peerStore == nil {
s.peerStore = raft.NewJSONPeers(s.raftDir, transport)
}
// Setup Raft communication.
transport := raft.NewNetworkTransport(s.raftTn, 3, 10*time.Second, os.Stderr)
// Get the Raft configuration for this store.
config := s.raftConfig()
// Check for any existing peers.
peers, err := s.peerStore.Peers()
if err != nil {
return err
}
s.joinRequired = len(peers) <= 1
// Allow the node to enter single-node mode, potentially electing itself, if
// explicitly enabled and there is only 1 node in the cluster already.
if enableSingle && len(peers) <= 1 {
s.logger.Println("enabling single-node mode")
config.EnableSingleNode = true
config.DisableBootstrapAfterElect = false
}
config.LocalID = raft.ServerID(s.raftID)
// XXXconfig.Logger = log.New(os.Stderr, "[raft] ", log.LstdFlags)
// Create the snapshot store. This allows Raft to truncate the log.
snapshots, err := raft.NewFileSnapshotStore(s.raftDir, retainSnapshotCount, os.Stderr)
@@ -309,10 +265,26 @@ func (s *Store) Open(enableSingle bool) error {
}
// Instantiate the Raft system.
ra, err := raft.NewRaft(config, s, logStore, logStore, snapshots, s.peerStore, transport)
ra, err := raft.NewRaft(config, s, logStore, logStore, snapshots, transport)
if err != nil {
return fmt.Errorf("new raft: %s", err)
}
if enableSingle && newNode {
s.logger.Printf("bootstrap needed")
configuration := raft.Configuration{
Servers: []raft.Server{
raft.Server{
ID: config.LocalID,
Address: transport.LocalAddr(),
},
},
}
ra.BootstrapCluster(configuration)
} else {
s.logger.Printf("no bootstrap needed")
}
s.raft = ra
if s.OpenTimeout != 0 {
@@ -364,11 +336,6 @@ func (s *Store) State() ClusterState {
}
}
// JoinRequired returns whether the node needs to join a cluster after being opened.
func (s *Store) JoinRequired() bool {
return s.joinRequired
}
// Path returns the path to the store's storage directory.
func (s *Store) Path() string {
return s.raftDir
@@ -376,13 +343,18 @@ func (s *Store) Path() string {
// Addr returns the address of the store.
func (s *Store) Addr() net.Addr {
return s.raftTransport.Addr()
return s.raftTn.Addr()
}
// ID returns the Raft ID of the store.
func (s *Store) ID() string {
return s.raftID
}
// Leader returns the current leader. Returns a blank string if there is
// no leader.
func (s *Store) Leader() string {
return s.raft.Leader()
return string(s.raft.Leader())
}
// Peer returns the API address for the given addr. If there is no peer
@@ -403,9 +375,24 @@ func (s *Store) APIPeers() (map[string]string, error) {
return peers, nil
}
// Nodes returns the list of current peers.
func (s *Store) Nodes() ([]string, error) {
return s.peerStore.Peers()
// Nodes returns the slice of nodes in the cluster, sorted by ID ascending.
func (s *Store) Nodes() ([]*Server, error) {
f := s.raft.GetConfiguration()
if f.Error() != nil {
return nil, f.Error()
}
rs := f.Configuration().Servers
servers := make([]*Server, len(rs))
for i := range rs {
servers[i] = &Server{
ID: string(rs[i].ID),
Addr: string(rs[i].Address),
}
}
sort.Sort(Servers(servers))
return servers, nil
}
// WaitForLeader blocks until a leader is detected, or the timeout expires.
@@ -471,13 +458,12 @@ func (s *Store) Stats() (map[string]interface{}, error) {
dbStatus["path"] = ":memory:"
}
s.metaMu.RLock()
defer s.metaMu.RUnlock()
peers, err := s.peerStore.Peers()
nodes, err := s.Nodes()
if err != nil {
return nil, err
}
status := map[string]interface{}{
"node_id": s.raftID,
"raft": s.raft.Stats(),
"addr": s.Addr().String(),
"leader": s.Leader(),
@@ -487,7 +473,7 @@ func (s *Store) Stats() (map[string]interface{}, error) {
"election_timeout": s.ElectionTimeout.String(),
"snapshot_threshold": s.SnapshotThreshold,
"meta": s.meta,
"peers": peers,
"peers": nodes,
"dir": s.raftDir,
"sqlite3": dbStatus,
"db_conf": s.dbConf,
@@ -647,15 +633,15 @@ func (s *Store) UpdateAPIPeers(peers map[string]string) error {
return f.Error()
}
// Join joins a node, located at addr, to this store. The node must be ready to
// respond to Raft communications at that address.
func (s *Store) Join(addr string) error {
// Join joins a node, identified by id and located at addr, to this store.
// The node must be ready to respond to Raft communications at that address.
func (s *Store) Join(id, addr string) error {
s.logger.Printf("received request to join node at %s", addr)
if s.raft.State() != raft.Leader {
return ErrNotLeader
}
f := s.raft.AddPeer(addr)
f := s.raft.AddVoter(raft.ServerID(id), raft.ServerAddress(addr), 0, 0)
if e := f.(raft.Future); e.Error() != nil {
if e.Error() == raft.ErrNotLeader {
return ErrNotLeader
@@ -666,21 +652,21 @@ func (s *Store) Join(addr string) error {
return nil
}
// Remove removes a node from the store, specified by addr.
func (s *Store) Remove(addr string) error {
s.logger.Printf("received request to remove node %s", addr)
// Remove removes a node from the store, specified by ID.
func (s *Store) Remove(id string) error {
s.logger.Printf("received request to remove node %s", id)
if s.raft.State() != raft.Leader {
return ErrNotLeader
}
f := s.raft.RemovePeer(addr)
f := s.raft.RemoveServer(raft.ServerID(id), 0, 0)
if f.Error() != nil {
if f.Error() == raft.ErrNotLeader {
return ErrNotLeader
}
return f.Error()
}
s.logger.Printf("node %s removed successfully", addr)
s.logger.Printf("node %s removed successfully", id)
return nil
}
@@ -965,3 +951,11 @@ func enabledFromBool(b bool) string {
}
return "disabled"
}
// pathExists returns true if the given path exists.
func pathExists(p string) bool {
if _, err := os.Lstat(p); err != nil && os.IsNotExist(err) {
return false
}
return true
}

@@ -117,74 +117,6 @@ func Test_SingleNodeInMemExecuteQueryFail(t *testing.T) {
}
}
func Test_StoreLogTruncationMultinode(t *testing.T) {
s0 := mustNewStore(true)
defer os.RemoveAll(s0.Path())
s0.SnapshotThreshold = 4
s0.SnapshotInterval = 100 * time.Millisecond
if err := s0.Open(true); err != nil {
t.Fatalf("failed to open single-node store: %s", err.Error())
}
defer s0.Close(true)
s0.WaitForLeader(10 * time.Second)
nSnaps := stats.Get(numSnaphots).String()
// Write more than s.SnapshotThreshold statements.
queries := []string{
`CREATE TABLE foo (id INTEGER NOT NULL PRIMARY KEY, name TEXT)`,
`INSERT INTO foo(id, name) VALUES(1, "fiona")`,
`INSERT INTO foo(id, name) VALUES(2, "fiona")`,
`INSERT INTO foo(id, name) VALUES(3, "fiona")`,
`INSERT INTO foo(id, name) VALUES(4, "fiona")`,
`INSERT INTO foo(id, name) VALUES(5, "fiona")`,
}
for i := range queries {
_, err := s0.Execute(&ExecuteRequest{[]string{queries[i]}, false, false})
if err != nil {
t.Fatalf("failed to execute on single node: %s", err.Error())
}
}
// Wait for the snapshot to happen and log to be truncated.
for {
time.Sleep(1000 * time.Millisecond)
if stats.Get(numSnaphots).String() != nSnaps {
// It's changed, so a snap and truncate has happened.
break
}
}
// Fire up new node and ensure it picks up all changes. This will
// involve getting a snapshot and truncated log.
s1 := mustNewStore(true)
if err := s1.Open(true); err != nil {
t.Fatalf("failed to open single-node store: %s", err.Error())
}
defer s1.Close(true)
// Join the second node to the first.
if err := s0.Join(s1.Addr().String()); err != nil {
t.Fatalf("failed to join to node at %s: %s", s0.Addr(), err.Error())
}
s1.WaitForLeader(10 * time.Second)
// Wait until the log entries have been applied to the follower,
// and then query.
if err := s1.WaitForAppliedIndex(8, 5*time.Second); err != nil {
t.Fatalf("error waiting for follower to apply index: %s:", err.Error())
}
r, err := s1.Query(&QueryRequest{[]string{`SELECT count(*) FROM foo`}, false, true, None})
if err != nil {
t.Fatalf("failed to query single node: %s", err.Error())
}
if exp, got := `["count(*)"]`, asJSON(r[0].Columns); exp != got {
t.Fatalf("unexpected results for query\nexp: %s\ngot: %s", exp, got)
}
if exp, got := `[[5]]`, asJSON(r[0].Values); exp != got {
t.Fatalf("unexpected results for query\nexp: %s\ngot: %s", exp, got)
}
}
func Test_SingleNodeFileExecuteQuery(t *testing.T) {
s := mustNewStore(false)
defer os.RemoveAll(s.Path())
@@ -502,11 +434,11 @@ func Test_MultiNodeJoinRemove(t *testing.T) {
defer s1.Close(true)
// Get sorted list of cluster nodes.
storeNodes := []string{s0.Addr().String(), s1.Addr().String()}
storeNodes := []string{s0.ID(), s1.ID()}
sort.StringSlice(storeNodes).Sort()
// Join the second node to the first.
if err := s0.Join(s1.Addr().String()); err != nil {
if err := s0.Join(s1.ID(), s1.Addr().String()); err != nil {
t.Fatalf("failed to join to node at %s: %s", s0.Addr().String(), err.Error())
}
@@ -514,18 +446,17 @@ func Test_MultiNodeJoinRemove(t *testing.T) {
if err != nil {
t.Fatalf("failed to get nodes: %s", err.Error())
}
sort.StringSlice(nodes).Sort()
if len(nodes) != len(storeNodes) {
t.Fatalf("size of cluster is not correct")
}
if storeNodes[0] != nodes[0] && storeNodes[1] != nodes[1] {
if storeNodes[0] != nodes[0].ID || storeNodes[1] != nodes[1].ID {
t.Fatalf("cluster does not have correct nodes")
}
// Remove a node.
if err := s0.Remove(s1.Addr().String()); err != nil {
t.Fatalf("failed to remove %s from cluster: %s", s1.Addr().String(), err.Error())
if err := s0.Remove(s1.ID()); err != nil {
t.Fatalf("failed to remove %s from cluster: %s", s1.ID(), err.Error())
}
nodes, err = s0.Nodes()
@@ -535,7 +466,7 @@ func Test_MultiNodeJoinRemove(t *testing.T) {
if len(nodes) != 1 {
t.Fatalf("size of cluster is not correct post remove")
}
if s0.Addr().String() != nodes[0] {
if s0.ID() != nodes[0].ID {
t.Fatalf("cluster does not have correct nodes post remove")
}
}
@@ -557,7 +488,7 @@ func Test_MultiNodeExecuteQuery(t *testing.T) {
defer s1.Close(true)
// Join the second node to the first.
if err := s0.Join(s1.Addr().String()); err != nil {
if err := s0.Join(s1.ID(), s1.Addr().String()); err != nil {
t.Fatalf("failed to join to node at %s: %s", s0.Addr().String(), err.Error())
}
@@ -605,6 +536,74 @@ func Test_MultiNodeExecuteQuery(t *testing.T) {
}
}
func Test_StoreLogTruncationMultinode(t *testing.T) {
s0 := mustNewStore(true)
defer os.RemoveAll(s0.Path())
s0.SnapshotThreshold = 4
s0.SnapshotInterval = 100 * time.Millisecond
if err := s0.Open(true); err != nil {
t.Fatalf("failed to open single-node store: %s", err.Error())
}
defer s0.Close(true)
s0.WaitForLeader(10 * time.Second)
nSnaps := stats.Get(numSnaphots).String()
// Write more than s.SnapshotThreshold statements.
queries := []string{
`CREATE TABLE foo (id INTEGER NOT NULL PRIMARY KEY, name TEXT)`,
`INSERT INTO foo(id, name) VALUES(1, "fiona")`,
`INSERT INTO foo(id, name) VALUES(2, "fiona")`,
`INSERT INTO foo(id, name) VALUES(3, "fiona")`,
`INSERT INTO foo(id, name) VALUES(4, "fiona")`,
`INSERT INTO foo(id, name) VALUES(5, "fiona")`,
}
for i := range queries {
_, err := s0.Execute(&ExecuteRequest{[]string{queries[i]}, false, false})
if err != nil {
t.Fatalf("failed to execute on single node: %s", err.Error())
}
}
// Wait for the snapshot to happen and log to be truncated.
for {
time.Sleep(1000 * time.Millisecond)
if stats.Get(numSnaphots).String() != nSnaps {
// It's changed, so a snap and truncate has happened.
break
}
}
// Fire up new node and ensure it picks up all changes. This will
// involve getting a snapshot and truncated log.
s1 := mustNewStore(true)
if err := s1.Open(true); err != nil {
t.Fatalf("failed to open single-node store: %s", err.Error())
}
defer s1.Close(true)
// Join the second node to the first.
if err := s0.Join(s1.ID(), s1.Addr().String()); err != nil {
t.Fatalf("failed to join to node at %s: %s", s0.Addr(), err.Error())
}
s1.WaitForLeader(10 * time.Second)
// Wait until the log entries have been applied to the follower,
// and then query.
if err := s1.WaitForAppliedIndex(8, 5*time.Second); err != nil {
t.Fatalf("error waiting for follower to apply index: %s:", err.Error())
}
r, err := s1.Query(&QueryRequest{[]string{`SELECT count(*) FROM foo`}, false, true, None})
if err != nil {
t.Fatalf("failed to query single node: %s", err.Error())
}
if exp, got := `["count(*)"]`, asJSON(r[0].Columns); exp != got {
t.Fatalf("unexpected results for query\nexp: %s\ngot: %s", exp, got)
}
if exp, got := `[[5]]`, asJSON(r[0].Values); exp != got {
t.Fatalf("unexpected results for query\nexp: %s\ngot: %s", exp, got)
}
}
func Test_SingleNodeSnapshotOnDisk(t *testing.T) {
s := mustNewStore(false)
defer os.RemoveAll(s.Path())
@@ -799,11 +798,13 @@ func mustNewStore(inmem bool) *Store {
path := mustTempDir()
defer os.RemoveAll(path)
tn := mustMockTransport("localhost:0")
cfg := NewDBConfig("", inmem)
s := New(&StoreConfig{
DBConf: cfg,
Dir: path,
Tn: mustMockTransport("localhost:0"),
Tn: tn,
ID: tn.Addr().String(), // Could be any unique string.
})
if s == nil {
panic("failed to create new store")

@@ -0,0 +1,38 @@
package store
import (
"net"
"time"
"github.com/hashicorp/raft"
)
// Transport is the interface the network service must provide.
type Transport interface {
net.Listener
// Dial is used to create a new outgoing connection
Dial(address string, timeout time.Duration) (net.Conn, error)
}
// raftTransport takes a Transport and makes it suitable for use by the Raft
// networking system.
type raftTransport struct {
tn Transport
}
func (r *raftTransport) Dial(address raft.ServerAddress, timeout time.Duration) (net.Conn, error) {
return r.tn.Dial(string(address), timeout)
}
func (r *raftTransport) Accept() (net.Conn, error) {
return r.tn.Accept()
}
func (r *raftTransport) Addr() net.Addr {
return r.tn.Addr()
}
func (r *raftTransport) Close() error {
return r.tn.Close()
}
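The wrapper exists so the node's existing `Transport` can satisfy hashicorp/raft v1's `raft.StreamLayer` interface (a `net.Listener` plus Raft's own `Dial`). A sketch of the wiring, mirroring the `NewNetworkTransport` call in `Store.Open` above; the helper name is hypothetical, and `os` would also need importing in this package:
```go
// Adapt an existing Transport for use by hashicorp/raft v1. The
// arguments mirror Store.Open: the stream layer, a connection pool
// size of 3, a 10-second I/O timeout, and stderr as the log sink.
func newRaftNetworkTransport(tn Transport) *raft.NetworkTransport {
	return raft.NewNetworkTransport(&raftTransport{tn}, 3, 10*time.Second, os.Stderr)
}
```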

@@ -5,11 +5,11 @@ TMP_DATA=`mktemp`
rm $GOPATH/bin/*
go install ./...
$GOPATH/bin/rqlited -http-addr localhost:4001 -raft-addr localhost:4002 ${TMP_DATA}_1 &
$GOPATH/bin/rqlited -node-id node0 -http-addr localhost:4001 -raft-addr localhost:4002 ${TMP_DATA}_1 &
sleep 5
$GOPATH/bin/rqlited -http-addr localhost:4003 -raft-addr localhost:4004 -join http://localhost:4001 ${TMP_DATA}_2 &
$GOPATH/bin/rqlited -node-id node1 -http-addr localhost:4003 -raft-addr localhost:4004 -join http://localhost:4001 ${TMP_DATA}_2 &
sleep 5
$GOPATH/bin/rqlited -http-addr localhost:4005 -raft-addr localhost:4006 -join http://localhost:4001 ${TMP_DATA}_3 &
$GOPATH/bin/rqlited -node-id node2 -http-addr localhost:4005 -raft-addr localhost:4006 -join http://localhost:4001 ${TMP_DATA}_3 &
sleep 5
wait

@@ -7,11 +7,11 @@ go install ./...
openssl req -x509 -nodes -newkey rsa:4096 -keyout ${TMP_DATA}_key.pem -out ${TMP_DATA}_cert.pem -days 365
$GOPATH/bin/rqlited -http-addr localhost:4001 -raft-addr localhost:4002 -node-cert ${TMP_DATA}_cert.pem -node-key ${TMP_DATA}_key.pem -node-no-verify -node-encrypt ${TMP_DATA}_1 &
$GOPATH/bin/rqlited -node-id node0 -http-addr localhost:4001 -raft-addr localhost:4002 -node-cert ${TMP_DATA}_cert.pem -node-key ${TMP_DATA}_key.pem -node-no-verify -node-encrypt ${TMP_DATA}_1 &
sleep 5
$GOPATH/bin/rqlited -http-addr localhost:4003 -raft-addr localhost:4004 -join http://localhost:4001 -node-cert ${TMP_DATA}_cert.pem -node-key ${TMP_DATA}_key.pem -node-no-verify -node-encrypt ${TMP_DATA}_2 &
$GOPATH/bin/rqlited -node-id node1 -http-addr localhost:4003 -raft-addr localhost:4004 -join http://localhost:4001 -node-cert ${TMP_DATA}_cert.pem -node-key ${TMP_DATA}_key.pem -node-no-verify -node-encrypt ${TMP_DATA}_2 &
sleep 5
$GOPATH/bin/rqlited -http-addr localhost:4005 -raft-addr localhost:4006 -join http://localhost:4001 -node-cert ${TMP_DATA}_cert.pem -node-key ${TMP_DATA}_key.pem -node-no-verify -node-encrypt ${TMP_DATA}_3 &
$GOPATH/bin/rqlited -node-id node2 -http-addr localhost:4005 -raft-addr localhost:4006 -join http://localhost:4001 -node-cert ${TMP_DATA}_cert.pem -node-key ${TMP_DATA}_key.pem -node-no-verify -node-encrypt ${TMP_DATA}_3 &
sleep 5
wait

@@ -19,7 +19,7 @@ RQLITED_PATH = os.environ['RQLITED_PATH']
TIMEOUT=10
class Node(object):
def __init__(self, path,
def __init__(self, path, node_id,
api_addr=None, api_adv=None,
raft_addr=None, raft_adv=None,
dir=None):
@@ -33,6 +33,7 @@ class Node(object):
api_adv = api_addr
self.path = path
self.node_id = node_id
self.api_addr = api_addr
self.api_adv = api_adv
self.raft_addr = raft_addr
@@ -67,6 +68,7 @@ class Node(object):
return
command = [self.path,
'-node-id', self.node_id,
'-http-addr', self.api_addr,
'-raft-addr', self.raft_addr]
if self.api_adv is not None:
@@ -199,9 +201,9 @@ class Node(object):
def _load_url(self):
return 'http://' + self.APIAddr() + '/db/load'
def __eq__(self, other):
return self.raft_addr == other.raft_addr
return self.node_id == other.node_id
def __str__(self):
return '%s:[%s]:[%s]' % (self.APIAddr(), self.raft_addr, self.dir)
return '%s:[%s]:[%s]:[%s]' % (self.node_id, self.APIAddr(), self.raft_addr, self.dir)
def __del__(self):
self.stdout_fd.close()
self.stderr_fd.close()
@@ -239,15 +241,15 @@ class Cluster(object):
class TestEndToEnd(unittest.TestCase):
def setUp(self):
n0 = Node(RQLITED_PATH)
n0 = Node(RQLITED_PATH, '0')
n0.start()
n0.wait_for_leader()
n1 = Node(RQLITED_PATH)
n1 = Node(RQLITED_PATH, '1')
n1.start(join=n0.APIAddr())
n1.wait_for_leader()
n2 = Node(RQLITED_PATH)
n2 = Node(RQLITED_PATH, '2')
n2.start(join=n0.APIAddr())
n2.wait_for_leader()
@@ -307,17 +309,17 @@ class TestEndToEnd(unittest.TestCase):
class TestEndToEndAdvAddr(TestEndToEnd):
def setUp(self):
n0 = Node(RQLITED_PATH,
n0 = Node(RQLITED_PATH, '0',
api_addr="0.0.0.0:4001", api_adv="localhost:4001",
raft_addr="0.0.0.0:4002", raft_adv="localhost:4002")
n0.start()
n0.wait_for_leader()
n1 = Node(RQLITED_PATH)
n1 = Node(RQLITED_PATH, '1')
n1.start(join=n0.APIAddr())
n1.wait_for_leader()
n2 = Node(RQLITED_PATH)
n2 = Node(RQLITED_PATH, '2')
n2.start(join=n0.APIAddr())
n2.wait_for_leader()
@@ -328,7 +330,7 @@ class TestEndToEndBackupRestore(unittest.TestCase):
fd, self.db_file = tempfile.mkstemp()
os.close(fd)
self.node0 = Node(RQLITED_PATH)
self.node0 = Node(RQLITED_PATH, '0')
self.node0.start()
self.node0.wait_for_leader()
self.node0.execute('CREATE TABLE foo (id INTEGER NOT NULL PRIMARY KEY, name TEXT)')
@@ -342,7 +344,7 @@ class TestEndToEndBackupRestore(unittest.TestCase):
self.assertEqual(rows[0], (1, u'fiona'))
conn.close()
self.node1 = Node(RQLITED_PATH)
self.node1 = Node(RQLITED_PATH, '1')
self.node1.start()
self.node1.wait_for_leader()
j = self.node1.restore(self.db_file)

@@ -84,7 +84,7 @@ func (n *Node) QueryMulti(stmts []string) (string, error) {
// Join instructs this node to join the leader.
func (n *Node) Join(leader *Node) error {
resp, err := DoJoinRequest(leader.APIAddr, n.RaftAddr)
resp, err := DoJoinRequest(leader.APIAddr, n.Store.ID(), n.RaftAddr)
if err != nil {
return err
}
@@ -281,9 +281,9 @@ func Remove(n *Node, addr string) error {
return nil
}
// DoJoinRequest sends a join request to nodeAddr, for raftAddr.
func DoJoinRequest(nodeAddr, raftAddr string) (*http.Response, error) {
b, err := json.Marshal(map[string]string{"addr": raftAddr})
// DoJoinRequest sends a join request to nodeAddr, for raftID, reachable at raftAddr.
func DoJoinRequest(nodeAddr, raftID, raftAddr string) (*http.Response, error) {
b, err := json.Marshal(map[string]string{"id": raftID, "addr": raftAddr})
if err != nil {
return nil, err
}
@@ -302,10 +302,12 @@ func mustNewNode(enableSingle bool) *Node {
}
dbConf := store.NewDBConfig("", false)
tn := mustMockTransport("localhost:0")
node.Store = store.New(&store.StoreConfig{
DBConf: dbConf,
Dir: node.Dir,
Tn: mustMockTransport("localhost:0"),
Tn: tn,
ID: tn.Addr().String(),
})
if err := node.Store.Open(enableSingle); err != nil {
node.Deprovision()

@@ -1,10 +0,0 @@
language: go
go:
- 1.6
- 1.7
- tip
install: make deps
script:
- make test

@@ -1,362 +0,0 @@
Mozilla Public License, version 2.0
1. Definitions
1.1. "Contributor"
means each individual or legal entity that creates, contributes to the
creation of, or owns Covered Software.
1.2. "Contributor Version"
means the combination of the Contributions of others (if any) used by a
Contributor and that particular Contributor's Contribution.
1.3. "Contribution"
means Covered Software of a particular Contributor.
1.4. "Covered Software"
means Source Code Form to which the initial Contributor has attached the
notice in Exhibit A, the Executable Form of such Source Code Form, and
Modifications of such Source Code Form, in each case including portions
thereof.
1.5. "Incompatible With Secondary Licenses"
means
a. that the initial Contributor has attached the notice described in
Exhibit B to the Covered Software; or
b. that the Covered Software was made available under the terms of
version 1.1 or earlier of the License, but not also under the terms of
a Secondary License.
1.6. "Executable Form"
means any form of the work other than Source Code Form.
1.7. "Larger Work"
means a work that combines Covered Software with other material, in a
separate file or files, that is not Covered Software.
1.8. "License"
means this document.
1.9. "Licensable"
means having the right to grant, to the maximum extent possible, whether
at the time of the initial grant or subsequently, any and all of the
rights conveyed by this License.
1.10. "Modifications"
means any of the following:
a. any file in Source Code Form that results from an addition to,
deletion from, or modification of the contents of Covered Software; or
b. any new file in Source Code Form that contains any Covered Software.
1.11. "Patent Claims" of a Contributor
means any patent claim(s), including without limitation, method,
process, and apparatus claims, in any patent Licensable by such
Contributor that would be infringed, but for the grant of the License,
by the making, using, selling, offering for sale, having made, import,
or transfer of either its Contributions or its Contributor Version.
1.12. "Secondary License"
means either the GNU General Public License, Version 2.0, the GNU Lesser
General Public License, Version 2.1, the GNU Affero General Public
License, Version 3.0, or any later versions of those licenses.
1.13. "Source Code Form"
means the form of the work preferred for making modifications.
1.14. "You" (or "Your")
means an individual or a legal entity exercising rights under this
License. For legal entities, "You" includes any entity that controls, is
controlled by, or is under common control with You. For purposes of this
definition, "control" means (a) the power, direct or indirect, to cause
the direction or management of such entity, whether by contract or
otherwise, or (b) ownership of more than fifty percent (50%) of the
outstanding shares or beneficial ownership of such entity.
2. License Grants and Conditions
2.1. Grants
Each Contributor hereby grants You a world-wide, royalty-free,
non-exclusive license:
a. under intellectual property rights (other than patent or trademark)
Licensable by such Contributor to use, reproduce, make available,
modify, display, perform, distribute, and otherwise exploit its
Contributions, either on an unmodified basis, with Modifications, or
as part of a Larger Work; and
b. under Patent Claims of such Contributor to make, use, sell, offer for
sale, have made, import, and otherwise transfer either its
Contributions or its Contributor Version.
2.2. Effective Date
The licenses granted in Section 2.1 with respect to any Contribution
become effective for each Contribution on the date the Contributor first
distributes such Contribution.
2.3. Limitations on Grant Scope
The licenses granted in this Section 2 are the only rights granted under
this License. No additional rights or licenses will be implied from the
distribution or licensing of Covered Software under this License.
Notwithstanding Section 2.1(b) above, no patent license is granted by a
Contributor:
a. for any code that a Contributor has removed from Covered Software; or
b. for infringements caused by: (i) Your and any other third party's
modifications of Covered Software, or (ii) the combination of its
Contributions with other software (except as part of its Contributor
Version); or
c. under Patent Claims infringed by Covered Software in the absence of
its Contributions.
This License does not grant any rights in the trademarks, service marks,
or logos of any Contributor (except as may be necessary to comply with
the notice requirements in Section 3.4).
2.4. Subsequent Licenses
No Contributor makes additional grants as a result of Your choice to
distribute the Covered Software under a subsequent version of this
License (see Section 10.2) or under the terms of a Secondary License (if
permitted under the terms of Section 3.3).
2.5. Representation
Each Contributor represents that the Contributor believes its
Contributions are its original creation(s) or it has sufficient rights to
grant the rights to its Contributions conveyed by this License.
2.6. Fair Use
This License is not intended to limit any rights You have under
applicable copyright doctrines of fair use, fair dealing, or other
equivalents.
2.7. Conditions
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in
Section 2.1.
3. Responsibilities
3.1. Distribution of Source Form
All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under
the terms of this License. You must inform recipients that the Source
Code Form of the Covered Software is governed by the terms of this
License, and how they can obtain a copy of this License. You may not
attempt to alter or restrict the recipients' rights in the Source Code
Form.
3.2. Distribution of Executable Form
If You distribute Covered Software in Executable Form then:
a. such Covered Software must also be made available in Source Code Form,
as described in Section 3.1, and You must inform recipients of the
Executable Form how they can obtain a copy of such Source Code Form by
reasonable means in a timely manner, at a charge no more than the cost
of distribution to the recipient; and
b. You may distribute such Executable Form under the terms of this
License, or sublicense it under different terms, provided that the
license for the Executable Form does not attempt to limit or alter the
recipients' rights in the Source Code Form under this License.
3.3. Distribution of a Larger Work
You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for
the Covered Software. If the Larger Work is a combination of Covered
Software with a work governed by one or more Secondary Licenses, and the
Covered Software is not Incompatible With Secondary Licenses, this
License permits You to additionally distribute such Covered Software
under the terms of such Secondary License(s), so that the recipient of
the Larger Work may, at their option, further distribute the Covered
Software under the terms of either this License or such Secondary
License(s).
3.4. Notices
You may not remove or alter the substance of any license notices
(including copyright notices, patent notices, disclaimers of warranty, or
limitations of liability) contained within the Source Code Form of the
Covered Software, except that You may alter any license notices to the
extent required to remedy known factual inaccuracies.
3.5. Application of Additional Terms
You may choose to offer, and to charge a fee for, warranty, support,
indemnity or liability obligations to one or more recipients of Covered
Software. However, You may do so only on Your own behalf, and not on
behalf of any Contributor. You must make it absolutely clear that any
such warranty, support, indemnity, or liability obligation is offered by
You alone, and You hereby agree to indemnify every Contributor for any
liability incurred by such Contributor as a result of warranty, support,
indemnity or liability terms You offer. You may include additional
disclaimers of warranty and limitations of liability specific to any
jurisdiction.
4. Inability to Comply Due to Statute or Regulation
If it is impossible for You to comply with any of the terms of this License
with respect to some or all of the Covered Software due to statute,
judicial order, or regulation then You must: (a) comply with the terms of
this License to the maximum extent possible; and (b) describe the
limitations and the code they affect. Such description must be placed in a
text file included with all distributions of the Covered Software under
this License. Except to the extent prohibited by statute or regulation,
such description must be sufficiently detailed for a recipient of ordinary
skill to be able to understand it.
5. Termination
5.1. The rights granted under this License will terminate automatically if You
fail to comply with any of its terms. However, if You become compliant,
then the rights granted under this License from a particular Contributor
are reinstated (a) provisionally, unless and until such Contributor
explicitly and finally terminates Your grants, and (b) on an ongoing
basis, if such Contributor fails to notify You of the non-compliance by
some reasonable means prior to 60 days after You have come back into
compliance. Moreover, Your grants from a particular Contributor are
reinstated on an ongoing basis if such Contributor notifies You of the
non-compliance by some reasonable means, this is the first time You have
received notice of non-compliance with this License from such
Contributor, and You become compliant prior to 30 days after Your receipt
of the notice.
5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions,
counter-claims, and cross-claims) alleging that a Contributor Version
directly or indirectly infringes any patent, then the rights granted to
You by any and all Contributors for the Covered Software under Section
2.1 of this License shall terminate.
5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user
license agreements (excluding distributors and resellers) which have been
validly granted by You or Your distributors under this License prior to
termination shall survive termination.
6. Disclaimer of Warranty
Covered Software is provided under this License on an "as is" basis,
without warranty of any kind, either expressed, implied, or statutory,
including, without limitation, warranties that the Covered Software is free
of defects, merchantable, fit for a particular purpose or non-infringing.
The entire risk as to the quality and performance of the Covered Software
is with You. Should any Covered Software prove defective in any respect,
You (not any Contributor) assume the cost of any necessary servicing,
repair, or correction. This disclaimer of warranty constitutes an essential
part of this License. No use of any Covered Software is authorized under
this License except under this disclaimer.
7. Limitation of Liability
Under no circumstances and under no legal theory, whether tort (including
negligence), contract, or otherwise, shall any Contributor, or anyone who
distributes Covered Software as permitted above, be liable to You for any
direct, indirect, special, incidental, or consequential damages of any
character including, without limitation, damages for lost profits, loss of
goodwill, work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses, even if such party shall have been
informed of the possibility of such damages. This limitation of liability
shall not apply to liability for death or personal injury resulting from
such party's negligence to the extent applicable law prohibits such
limitation. Some jurisdictions do not allow the exclusion or limitation of
incidental or consequential damages, so this exclusion and limitation may
not apply to You.
8. Litigation
Any litigation relating to this License may be brought only in the courts
of a jurisdiction where the defendant maintains its principal place of
business and such litigation shall be governed by laws of that
jurisdiction, without reference to its conflict-of-law provisions. Nothing
in this Section shall prevent a party's ability to bring cross-claims or
counter-claims.
9. Miscellaneous
This License represents the complete agreement concerning the subject
matter hereof. If any provision of this License is held to be
unenforceable, such provision shall be reformed only to the extent
necessary to make it enforceable. Any law or regulation which provides that
the language of a contract shall be construed against the drafter shall not
be used to construe this License against a Contributor.
10. Versions of the License
10.1. New Versions
Mozilla Foundation is the license steward. Except as provided in Section
10.3, no one other than the license steward has the right to modify or
publish new versions of this License. Each version will be given a
distinguishing version number.
10.2. Effect of New Versions
You may distribute the Covered Software under the terms of the version
of the License under which You originally received the Covered Software,
or under the terms of any subsequent version published by the license
steward.
10.3. Modified Versions
If you create software not governed by this License, and you want to
create a new license for such software, you may create and use a
modified version of this License if you rename the license and remove
any references to the name of the license steward (except to note that
such modified license differs from this License).
10.4. Distributing Source Code Form that is Incompatible With Secondary
Licenses If You choose to distribute Source Code Form that is
Incompatible With Secondary Licenses under the terms of this version of
the License, the notice described in Exhibit B of this License must be
attached.
Exhibit A - Source Code Form License Notice
This Source Code Form is subject to the
terms of the Mozilla Public License, v.
2.0. If a copy of the MPL was not
distributed with this file, You can
obtain one at
http://mozilla.org/MPL/2.0/.
If it is not possible or desirable to put the notice in a particular file,
then You may include the notice in a location (such as a LICENSE file in a
relevant directory) where a recipient would be likely to look for such a
notice.
You may add additional accurate notices of copyright ownership.
Exhibit B - "Incompatible With Secondary Licenses" Notice
This Source Code Form is "Incompatible
With Secondary Licenses", as defined by
the Mozilla Public License, v. 2.0.
@ -1,11 +0,0 @@
DEPS = $(go list -f '{{range .TestImports}}{{.}} {{end}}' ./...)
.PHONY: test deps
test:
go test -timeout=30s ./...
deps:
go get -d -v ./...
echo $(DEPS) | xargs -n1 go get -d
@ -1,11 +0,0 @@
raft-boltdb
===========
This repository provides the `raftboltdb` package. The package exports the
`BoltStore` which is an implementation of both a `LogStore` and `StableStore`.
It is meant to be used as a backend for the `raft` [package
here](https://github.com/hashicorp/raft).
This implementation uses [BoltDB](https://github.com/boltdb/bolt). BoltDB is
a simple key/value store implemented in pure Go, and inspired by LMDB.
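As a quick illustration (a minimal sketch, not part of this repository — the file path and the example key are arbitrary), the store can be opened and exercised directly:

```go
package main

import (
	"log"

	raftboltdb "github.com/hashicorp/raft-boltdb"
)

func main() {
	// Open (or create) the BoltDB file that will back the Raft log
	// and stable store.
	store, err := raftboltdb.NewBoltStore("/tmp/raft.db")
	if err != nil {
		log.Fatalf("failed to open bolt store: %s", err)
	}
	defer store.Close()

	// BoltStore satisfies both raft.LogStore and raft.StableStore, so the
	// same value can be passed to raft.NewRaft for both arguments. Here we
	// just exercise the StableStore side with an arbitrary key.
	if err := store.SetUint64([]byte("example"), 42); err != nil {
		log.Fatalf("failed to write: %s", err)
	}
}
```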
@ -1,88 +0,0 @@
package raftboltdb
import (
"os"
"testing"
"github.com/hashicorp/raft/bench"
)
func BenchmarkBoltStore_FirstIndex(b *testing.B) {
store := testBoltStore(b)
defer store.Close()
defer os.Remove(store.path)
raftbench.FirstIndex(b, store)
}
func BenchmarkBoltStore_LastIndex(b *testing.B) {
store := testBoltStore(b)
defer store.Close()
defer os.Remove(store.path)
raftbench.LastIndex(b, store)
}
func BenchmarkBoltStore_GetLog(b *testing.B) {
store := testBoltStore(b)
defer store.Close()
defer os.Remove(store.path)
raftbench.GetLog(b, store)
}
func BenchmarkBoltStore_StoreLog(b *testing.B) {
store := testBoltStore(b)
defer store.Close()
defer os.Remove(store.path)
raftbench.StoreLog(b, store)
}
func BenchmarkBoltStore_StoreLogs(b *testing.B) {
store := testBoltStore(b)
defer store.Close()
defer os.Remove(store.path)
raftbench.StoreLogs(b, store)
}
func BenchmarkBoltStore_DeleteRange(b *testing.B) {
store := testBoltStore(b)
defer store.Close()
defer os.Remove(store.path)
raftbench.DeleteRange(b, store)
}
func BenchmarkBoltStore_Set(b *testing.B) {
store := testBoltStore(b)
defer store.Close()
defer os.Remove(store.path)
raftbench.Set(b, store)
}
func BenchmarkBoltStore_Get(b *testing.B) {
store := testBoltStore(b)
defer store.Close()
defer os.Remove(store.path)
raftbench.Get(b, store)
}
func BenchmarkBoltStore_SetUint64(b *testing.B) {
store := testBoltStore(b)
defer store.Close()
defer os.Remove(store.path)
raftbench.SetUint64(b, store)
}
func BenchmarkBoltStore_GetUint64(b *testing.B) {
store := testBoltStore(b)
defer store.Close()
defer os.Remove(store.path)
raftbench.GetUint64(b, store)
}
@ -1,255 +0,0 @@
package raftboltdb
import (
"errors"
"github.com/boltdb/bolt"
"github.com/hashicorp/raft"
)
const (
// Permissions to use on the db file. This is only used if the
// database file does not exist and needs to be created.
dbFileMode = 0600
)
var (
// Bucket names we perform transactions in
dbLogs = []byte("logs")
dbConf = []byte("conf")
// An error indicating a given key does not exist
ErrKeyNotFound = errors.New("not found")
)
// BoltStore provides access to BoltDB for Raft to store and retrieve
// log entries. It also provides key/value storage, and can be used as
// a LogStore and StableStore.
type BoltStore struct {
// conn is the underlying handle to the db.
conn *bolt.DB
// The path to the Bolt database file
path string
}
// Options contains all the configuration used to open the BoltDB
type Options struct {
// Path is the file path to the BoltDB to use
Path string
// BoltOptions contains any specific BoltDB options you might
// want to specify [e.g. open timeout]
BoltOptions *bolt.Options
}
// readOnly returns true if the contained bolt options say to open
// the DB in readOnly mode [this can be useful to tools that want
// to examine the log]
func (o *Options) readOnly() bool {
return o != nil && o.BoltOptions != nil && o.BoltOptions.ReadOnly
}
// NewBoltStore takes a file path and returns a connected Raft backend.
func NewBoltStore(path string) (*BoltStore, error) {
return New(Options{Path: path})
}
// New uses the supplied options to open the BoltDB and prepare it for use as a raft backend.
func New(options Options) (*BoltStore, error) {
// Try to connect
handle, err := bolt.Open(options.Path, dbFileMode, options.BoltOptions)
if err != nil {
return nil, err
}
// Create the new store
store := &BoltStore{
conn: handle,
path: options.Path,
}
// If the store was opened read-only, don't try and create buckets
if !options.readOnly() {
// Set up our buckets
if err := store.initialize(); err != nil {
store.Close()
return nil, err
}
}
return store, nil
}
// initialize is used to set up all of the buckets.
func (b *BoltStore) initialize() error {
tx, err := b.conn.Begin(true)
if err != nil {
return err
}
defer tx.Rollback()
// Create all the buckets
if _, err := tx.CreateBucketIfNotExists(dbLogs); err != nil {
return err
}
if _, err := tx.CreateBucketIfNotExists(dbConf); err != nil {
return err
}
return tx.Commit()
}
// Close is used to gracefully close the DB connection.
func (b *BoltStore) Close() error {
return b.conn.Close()
}
// FirstIndex returns the first known index from the Raft log.
func (b *BoltStore) FirstIndex() (uint64, error) {
tx, err := b.conn.Begin(false)
if err != nil {
return 0, err
}
defer tx.Rollback()
curs := tx.Bucket(dbLogs).Cursor()
if first, _ := curs.First(); first == nil {
return 0, nil
} else {
return bytesToUint64(first), nil
}
}
// LastIndex returns the last known index from the Raft log.
func (b *BoltStore) LastIndex() (uint64, error) {
tx, err := b.conn.Begin(false)
if err != nil {
return 0, err
}
defer tx.Rollback()
curs := tx.Bucket(dbLogs).Cursor()
if last, _ := curs.Last(); last == nil {
return 0, nil
} else {
return bytesToUint64(last), nil
}
}
// GetLog is used to retrieve a log from BoltDB at a given index.
func (b *BoltStore) GetLog(idx uint64, log *raft.Log) error {
tx, err := b.conn.Begin(false)
if err != nil {
return err
}
defer tx.Rollback()
bucket := tx.Bucket(dbLogs)
val := bucket.Get(uint64ToBytes(idx))
if val == nil {
return raft.ErrLogNotFound
}
return decodeMsgPack(val, log)
}
// StoreLog is used to store a single raft log
func (b *BoltStore) StoreLog(log *raft.Log) error {
return b.StoreLogs([]*raft.Log{log})
}
// StoreLogs is used to store a set of raft logs
func (b *BoltStore) StoreLogs(logs []*raft.Log) error {
tx, err := b.conn.Begin(true)
if err != nil {
return err
}
defer tx.Rollback()
for _, log := range logs {
key := uint64ToBytes(log.Index)
val, err := encodeMsgPack(log)
if err != nil {
return err
}
bucket := tx.Bucket(dbLogs)
if err := bucket.Put(key, val.Bytes()); err != nil {
return err
}
}
return tx.Commit()
}
// DeleteRange is used to delete logs within a given range inclusively.
func (b *BoltStore) DeleteRange(min, max uint64) error {
minKey := uint64ToBytes(min)
tx, err := b.conn.Begin(true)
if err != nil {
return err
}
defer tx.Rollback()
curs := tx.Bucket(dbLogs).Cursor()
for k, _ := curs.Seek(minKey); k != nil; k, _ = curs.Next() {
// Handle out-of-range log index
if bytesToUint64(k) > max {
break
}
// Delete in-range log index
if err := curs.Delete(); err != nil {
return err
}
}
return tx.Commit()
}
// Set is used to set a key/value set outside of the raft log
func (b *BoltStore) Set(k, v []byte) error {
tx, err := b.conn.Begin(true)
if err != nil {
return err
}
defer tx.Rollback()
bucket := tx.Bucket(dbConf)
if err := bucket.Put(k, v); err != nil {
return err
}
return tx.Commit()
}
// Get is used to retrieve a value from the k/v store by key
func (b *BoltStore) Get(k []byte) ([]byte, error) {
tx, err := b.conn.Begin(false)
if err != nil {
return nil, err
}
defer tx.Rollback()
bucket := tx.Bucket(dbConf)
val := bucket.Get(k)
if val == nil {
return nil, ErrKeyNotFound
}
return append([]byte(nil), val...), nil
}
// SetUint64 is like Set, but handles uint64 values
func (b *BoltStore) SetUint64(key []byte, val uint64) error {
return b.Set(key, uint64ToBytes(val))
}
// GetUint64 is like Get, but handles uint64 values
func (b *BoltStore) GetUint64(key []byte) (uint64, error) {
val, err := b.Get(key)
if err != nil {
return 0, err
}
return bytesToUint64(val), nil
}
@ -1,416 +0,0 @@
package raftboltdb
import (
"bytes"
"io/ioutil"
"os"
"reflect"
"testing"
"time"
"github.com/boltdb/bolt"
"github.com/hashicorp/raft"
)
func testBoltStore(t testing.TB) *BoltStore {
fh, err := ioutil.TempFile("", "bolt")
if err != nil {
t.Fatalf("err: %s", err)
}
os.Remove(fh.Name())
// Successfully creates and returns a store
store, err := NewBoltStore(fh.Name())
if err != nil {
t.Fatalf("err: %s", err)
}
return store
}
func testRaftLog(idx uint64, data string) *raft.Log {
return &raft.Log{
Data: []byte(data),
Index: idx,
}
}
func TestBoltStore_Implements(t *testing.T) {
var store interface{} = &BoltStore{}
if _, ok := store.(raft.StableStore); !ok {
t.Fatalf("BoltStore does not implement raft.StableStore")
}
if _, ok := store.(raft.LogStore); !ok {
t.Fatalf("BoltStore does not implement raft.LogStore")
}
}
func TestBoltOptionsTimeout(t *testing.T) {
fh, err := ioutil.TempFile("", "bolt")
if err != nil {
t.Fatalf("err: %s", err)
}
os.Remove(fh.Name())
defer os.Remove(fh.Name())
options := Options{
Path: fh.Name(),
BoltOptions: &bolt.Options{
Timeout: time.Second / 10,
},
}
store, err := New(options)
if err != nil {
t.Fatalf("err: %v", err)
}
defer store.Close()
// trying to open it again should timeout
doneCh := make(chan error, 1)
go func() {
_, err := New(options)
doneCh <- err
}()
select {
case err := <-doneCh:
if err == nil || err.Error() != "timeout" {
t.Errorf("Expected timeout error but got %v", err)
}
case <-time.After(5 * time.Second):
t.Errorf("Gave up waiting for timeout response")
}
}
func TestBoltOptionsReadOnly(t *testing.T) {
fh, err := ioutil.TempFile("", "bolt")
if err != nil {
t.Fatalf("err: %s", err)
}
defer os.Remove(fh.Name())
store, err := NewBoltStore(fh.Name())
if err != nil {
t.Fatalf("err: %s", err)
}
// Create the log
log := &raft.Log{
Data: []byte("log1"),
Index: 1,
}
// Attempt to store the log
if err := store.StoreLog(log); err != nil {
t.Fatalf("err: %s", err)
}
store.Close()
options := Options{
Path: fh.Name(),
BoltOptions: &bolt.Options{
Timeout: time.Second / 10,
ReadOnly: true,
},
}
roStore, err := New(options)
if err != nil {
t.Fatalf("err: %s", err)
}
defer roStore.Close()
result := new(raft.Log)
if err := roStore.GetLog(1, result); err != nil {
t.Fatalf("err: %s", err)
}
// Ensure the log comes back the same
if !reflect.DeepEqual(log, result) {
t.Errorf("bad: %v", result)
}
// Attempt to store the log, should fail on a read-only store
err = roStore.StoreLog(log)
if err != bolt.ErrDatabaseReadOnly {
t.Errorf("expecting error %v, but got %v", bolt.ErrDatabaseReadOnly, err)
}
}
func TestNewBoltStore(t *testing.T) {
fh, err := ioutil.TempFile("", "bolt")
if err != nil {
t.Fatalf("err: %s", err)
}
os.Remove(fh.Name())
defer os.Remove(fh.Name())
// Successfully creates and returns a store
store, err := NewBoltStore(fh.Name())
if err != nil {
t.Fatalf("err: %s", err)
}
// Ensure the file was created
if store.path != fh.Name() {
t.Fatalf("unexpected file path %q", store.path)
}
if _, err := os.Stat(fh.Name()); err != nil {
t.Fatalf("err: %s", err)
}
// Close the store so we can open again
if err := store.Close(); err != nil {
t.Fatalf("err: %s", err)
}
// Ensure our tables were created
db, err := bolt.Open(fh.Name(), dbFileMode, nil)
if err != nil {
t.Fatalf("err: %s", err)
}
tx, err := db.Begin(true)
if err != nil {
t.Fatalf("err: %s", err)
}
if _, err := tx.CreateBucket([]byte(dbLogs)); err != bolt.ErrBucketExists {
t.Fatalf("bad: %v", err)
}
if _, err := tx.CreateBucket([]byte(dbConf)); err != bolt.ErrBucketExists {
t.Fatalf("bad: %v", err)
}
}
func TestBoltStore_FirstIndex(t *testing.T) {
store := testBoltStore(t)
defer store.Close()
defer os.Remove(store.path)
// Should get 0 index on empty log
idx, err := store.FirstIndex()
if err != nil {
t.Fatalf("err: %s", err)
}
if idx != 0 {
t.Fatalf("bad: %v", idx)
}
// Set a mock raft log
logs := []*raft.Log{
testRaftLog(1, "log1"),
testRaftLog(2, "log2"),
testRaftLog(3, "log3"),
}
if err := store.StoreLogs(logs); err != nil {
t.Fatalf("bad: %s", err)
}
// Fetch the first Raft index
idx, err = store.FirstIndex()
if err != nil {
t.Fatalf("err: %s", err)
}
if idx != 1 {
t.Fatalf("bad: %d", idx)
}
}
func TestBoltStore_LastIndex(t *testing.T) {
store := testBoltStore(t)
defer store.Close()
defer os.Remove(store.path)
// Should get 0 index on empty log
idx, err := store.LastIndex()
if err != nil {
t.Fatalf("err: %s", err)
}
if idx != 0 {
t.Fatalf("bad: %v", idx)
}
// Set a mock raft log
logs := []*raft.Log{
testRaftLog(1, "log1"),
testRaftLog(2, "log2"),
testRaftLog(3, "log3"),
}
if err := store.StoreLogs(logs); err != nil {
t.Fatalf("bad: %s", err)
}
// Fetch the last Raft index
idx, err = store.LastIndex()
if err != nil {
t.Fatalf("err: %s", err)
}
if idx != 3 {
t.Fatalf("bad: %d", idx)
}
}
func TestBoltStore_GetLog(t *testing.T) {
store := testBoltStore(t)
defer store.Close()
defer os.Remove(store.path)
log := new(raft.Log)
// Should return an error on non-existent log
if err := store.GetLog(1, log); err != raft.ErrLogNotFound {
t.Fatalf("expected raft log not found error, got: %v", err)
}
// Set a mock raft log
logs := []*raft.Log{
testRaftLog(1, "log1"),
testRaftLog(2, "log2"),
testRaftLog(3, "log3"),
}
if err := store.StoreLogs(logs); err != nil {
t.Fatalf("bad: %s", err)
}
// Should return the proper log
if err := store.GetLog(2, log); err != nil {
t.Fatalf("err: %s", err)
}
if !reflect.DeepEqual(log, logs[1]) {
t.Fatalf("bad: %#v", log)
}
}
func TestBoltStore_SetLog(t *testing.T) {
store := testBoltStore(t)
defer store.Close()
defer os.Remove(store.path)
// Create the log
log := &raft.Log{
Data: []byte("log1"),
Index: 1,
}
// Attempt to store the log
if err := store.StoreLog(log); err != nil {
t.Fatalf("err: %s", err)
}
// Retrieve the log again
result := new(raft.Log)
if err := store.GetLog(1, result); err != nil {
t.Fatalf("err: %s", err)
}
// Ensure the log comes back the same
if !reflect.DeepEqual(log, result) {
t.Fatalf("bad: %v", result)
}
}
func TestBoltStore_SetLogs(t *testing.T) {
store := testBoltStore(t)
defer store.Close()
defer os.Remove(store.path)
// Create a set of logs
logs := []*raft.Log{
testRaftLog(1, "log1"),
testRaftLog(2, "log2"),
}
// Attempt to store the logs
if err := store.StoreLogs(logs); err != nil {
t.Fatalf("err: %s", err)
}
// Ensure we stored them all
result1, result2 := new(raft.Log), new(raft.Log)
if err := store.GetLog(1, result1); err != nil {
t.Fatalf("err: %s", err)
}
if !reflect.DeepEqual(logs[0], result1) {
t.Fatalf("bad: %#v", result1)
}
if err := store.GetLog(2, result2); err != nil {
t.Fatalf("err: %s", err)
}
if !reflect.DeepEqual(logs[1], result2) {
t.Fatalf("bad: %#v", result2)
}
}
func TestBoltStore_DeleteRange(t *testing.T) {
store := testBoltStore(t)
defer store.Close()
defer os.Remove(store.path)
// Create a set of logs
log1 := testRaftLog(1, "log1")
log2 := testRaftLog(2, "log2")
log3 := testRaftLog(3, "log3")
logs := []*raft.Log{log1, log2, log3}
// Attempt to store the logs
if err := store.StoreLogs(logs); err != nil {
t.Fatalf("err: %s", err)
}
// Attempt to delete a range of logs
if err := store.DeleteRange(1, 2); err != nil {
t.Fatalf("err: %s", err)
}
// Ensure the logs were deleted
if err := store.GetLog(1, new(raft.Log)); err != raft.ErrLogNotFound {
t.Fatalf("should have deleted log1")
}
if err := store.GetLog(2, new(raft.Log)); err != raft.ErrLogNotFound {
t.Fatalf("should have deleted log2")
}
}
func TestBoltStore_Set_Get(t *testing.T) {
store := testBoltStore(t)
defer store.Close()
defer os.Remove(store.path)
// Returns error on non-existent key
if _, err := store.Get([]byte("bad")); err != ErrKeyNotFound {
t.Fatalf("expected not found error, got: %q", err)
}
k, v := []byte("hello"), []byte("world")
// Try to set a k/v pair
if err := store.Set(k, v); err != nil {
t.Fatalf("err: %s", err)
}
// Try to read it back
val, err := store.Get(k)
if err != nil {
t.Fatalf("err: %s", err)
}
if !bytes.Equal(val, v) {
t.Fatalf("bad: %v", val)
}
}
func TestBoltStore_SetUint64_GetUint64(t *testing.T) {
store := testBoltStore(t)
defer store.Close()
defer os.Remove(store.path)
// Returns error on non-existent key
if _, err := store.GetUint64([]byte("bad")); err != ErrKeyNotFound {
t.Fatalf("expected not found error, got: %q", err)
}
k, v := []byte("abc"), uint64(123)
// Attempt to set the k/v pair
if err := store.SetUint64(k, v); err != nil {
t.Fatalf("err: %s", err)
}
// Read back the value
val, err := store.GetUint64(k)
if err != nil {
t.Fatalf("err: %s", err)
}
if val != v {
t.Fatalf("bad: %v", val)
}
}
@ -1,37 +0,0 @@
package raftboltdb
import (
"bytes"
"encoding/binary"
"github.com/hashicorp/go-msgpack/codec"
)
// Decode reverses the encode operation on a byte slice input
func decodeMsgPack(buf []byte, out interface{}) error {
r := bytes.NewBuffer(buf)
hd := codec.MsgpackHandle{}
dec := codec.NewDecoder(r, &hd)
return dec.Decode(out)
}
// Encode writes an encoded object to a new bytes buffer
func encodeMsgPack(in interface{}) (*bytes.Buffer, error) {
buf := bytes.NewBuffer(nil)
hd := codec.MsgpackHandle{}
enc := codec.NewEncoder(buf, &hd)
err := enc.Encode(in)
return buf, err
}
// Converts bytes to an integer
func bytesToUint64(b []byte) uint64 {
return binary.BigEndian.Uint64(b)
}
// Converts a uint to a byte slice
func uint64ToBytes(u uint64) []byte {
buf := make([]byte, 8)
binary.BigEndian.PutUint64(buf, u)
return buf
}
@ -1,23 +0,0 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
@ -1,16 +0,0 @@
language: go
go:
- 1.4
- 1.5
- 1.6
- tip
install: make deps
script:
- make integ
notifications:
flowdock:
secure: fZrcf9rlh2IrQrlch1sHkn3YI7SKvjGnAl/zyV5D6NROe1Bbr6d3QRMuCXWWdhJHzjKmXk5rIzbqJhUc0PNF7YjxGNKSzqWMQ56KcvN1k8DzlqxpqkcA3Jbs6fXCWo2fssRtZ7hj/wOP1f5n6cc7kzHDt9dgaYJ6nO2fqNPJiTc=
@ -1,354 +0,0 @@
Mozilla Public License, version 2.0
1. Definitions
1.1. “Contributor”
means each individual or legal entity that creates, contributes to the
creation of, or owns Covered Software.
1.2. “Contributor Version”
means the combination of the Contributions of others (if any) used by a
Contributor and that particular Contributor's Contribution.
1.3. “Contribution”
means Covered Software of a particular Contributor.
1.4. “Covered Software”
means Source Code Form to which the initial Contributor has attached the
notice in Exhibit A, the Executable Form of such Source Code Form, and
Modifications of such Source Code Form, in each case including portions
thereof.
1.5. “Incompatible With Secondary Licenses”
means
a. that the initial Contributor has attached the notice described in
Exhibit B to the Covered Software; or
b. that the Covered Software was made available under the terms of version
1.1 or earlier of the License, but not also under the terms of a
Secondary License.
1.6. “Executable Form”
means any form of the work other than Source Code Form.
1.7. “Larger Work”
means a work that combines Covered Software with other material, in a separate
file or files, that is not Covered Software.
1.8. “License”
means this document.
1.9. “Licensable”
means having the right to grant, to the maximum extent possible, whether at the
time of the initial grant or subsequently, any and all of the rights conveyed by
this License.
1.10. “Modifications”
means any of the following:
a. any file in Source Code Form that results from an addition to, deletion
from, or modification of the contents of Covered Software; or
b. any new file in Source Code Form that contains any Covered Software.
1.11. “Patent Claims” of a Contributor
means any patent claim(s), including without limitation, method, process,
and apparatus claims, in any patent Licensable by such Contributor that
would be infringed, but for the grant of the License, by the making,
using, selling, offering for sale, having made, import, or transfer of
either its Contributions or its Contributor Version.
1.12. “Secondary License”
means either the GNU General Public License, Version 2.0, the GNU Lesser
General Public License, Version 2.1, the GNU Affero General Public
License, Version 3.0, or any later versions of those licenses.
1.13. “Source Code Form”
means the form of the work preferred for making modifications.
1.14. “You” (or “Your”)
means an individual or a legal entity exercising rights under this
License. For legal entities, “You” includes any entity that controls, is
controlled by, or is under common control with You. For purposes of this
definition, “control” means (a) the power, direct or indirect, to cause
the direction or management of such entity, whether by contract or
otherwise, or (b) ownership of more than fifty percent (50%) of the
outstanding shares or beneficial ownership of such entity.
2. License Grants and Conditions
2.1. Grants
Each Contributor hereby grants You a world-wide, royalty-free,
non-exclusive license:
a. under intellectual property rights (other than patent or trademark)
Licensable by such Contributor to use, reproduce, make available,
modify, display, perform, distribute, and otherwise exploit its
Contributions, either on an unmodified basis, with Modifications, or as
part of a Larger Work; and
b. under Patent Claims of such Contributor to make, use, sell, offer for
sale, have made, import, and otherwise transfer either its Contributions
or its Contributor Version.
2.2. Effective Date
The licenses granted in Section 2.1 with respect to any Contribution become
effective for each Contribution on the date the Contributor first distributes
such Contribution.
2.3. Limitations on Grant Scope
The licenses granted in this Section 2 are the only rights granted under this
License. No additional rights or licenses will be implied from the distribution
or licensing of Covered Software under this License. Notwithstanding Section
2.1(b) above, no patent license is granted by a Contributor:
a. for any code that a Contributor has removed from Covered Software; or
b. for infringements caused by: (i) Your and any other third party's
modifications of Covered Software, or (ii) the combination of its
Contributions with other software (except as part of its Contributor
Version); or
c. under Patent Claims infringed by Covered Software in the absence of its
Contributions.
This License does not grant any rights in the trademarks, service marks, or
logos of any Contributor (except as may be necessary to comply with the
notice requirements in Section 3.4).
2.4. Subsequent Licenses
No Contributor makes additional grants as a result of Your choice to
distribute the Covered Software under a subsequent version of this License
(see Section 10.2) or under the terms of a Secondary License (if permitted
under the terms of Section 3.3).
2.5. Representation
Each Contributor represents that the Contributor believes its Contributions
are its original creation(s) or it has sufficient rights to grant the
rights to its Contributions conveyed by this License.
2.6. Fair Use
This License is not intended to limit any rights You have under applicable
copyright doctrines of fair use, fair dealing, or other equivalents.
2.7. Conditions
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in
Section 2.1.
3. Responsibilities
3.1. Distribution of Source Form
All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under the
terms of this License. You must inform recipients that the Source Code Form
of the Covered Software is governed by the terms of this License, and how
they can obtain a copy of this License. You may not attempt to alter or
restrict the recipients' rights in the Source Code Form.
3.2. Distribution of Executable Form
If You distribute Covered Software in Executable Form then:
a. such Covered Software must also be made available in Source Code Form,
as described in Section 3.1, and You must inform recipients of the
Executable Form how they can obtain a copy of such Source Code Form by
reasonable means in a timely manner, at a charge no more than the cost
of distribution to the recipient; and
b. You may distribute such Executable Form under the terms of this License,
or sublicense it under different terms, provided that the license for
the Executable Form does not attempt to limit or alter the recipients'
rights in the Source Code Form under this License.
3.3. Distribution of a Larger Work
You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for the
Covered Software. If the Larger Work is a combination of Covered Software
with a work governed by one or more Secondary Licenses, and the Covered
Software is not Incompatible With Secondary Licenses, this License permits
You to additionally distribute such Covered Software under the terms of
such Secondary License(s), so that the recipient of the Larger Work may, at
their option, further distribute the Covered Software under the terms of
either this License or such Secondary License(s).
3.4. Notices
You may not remove or alter the substance of any license notices (including
copyright notices, patent notices, disclaimers of warranty, or limitations
of liability) contained within the Source Code Form of the Covered
Software, except that You may alter any license notices to the extent
required to remedy known factual inaccuracies.
3.5. Application of Additional Terms
You may choose to offer, and to charge a fee for, warranty, support,
indemnity or liability obligations to one or more recipients of Covered
Software. However, You may do so only on Your own behalf, and not on behalf
of any Contributor. You must make it absolutely clear that any such
warranty, support, indemnity, or liability obligation is offered by You
alone, and You hereby agree to indemnify every Contributor for any
liability incurred by such Contributor as a result of warranty, support,
indemnity or liability terms You offer. You may include additional
disclaimers of warranty and limitations of liability specific to any
jurisdiction.
4. Inability to Comply Due to Statute or Regulation
If it is impossible for You to comply with any of the terms of this License
with respect to some or all of the Covered Software due to statute, judicial
order, or regulation then You must: (a) comply with the terms of this License
to the maximum extent possible; and (b) describe the limitations and the code
they affect. Such description must be placed in a text file included with all
distributions of the Covered Software under this License. Except to the
extent prohibited by statute or regulation, such description must be
sufficiently detailed for a recipient of ordinary skill to be able to
understand it.
5. Termination
5.1. The rights granted under this License will terminate automatically if You
fail to comply with any of its terms. However, if You become compliant,
then the rights granted under this License from a particular Contributor
are reinstated (a) provisionally, unless and until such Contributor
explicitly and finally terminates Your grants, and (b) on an ongoing basis,
if such Contributor fails to notify You of the non-compliance by some
reasonable means prior to 60 days after You have come back into compliance.
Moreover, Your grants from a particular Contributor are reinstated on an
ongoing basis if such Contributor notifies You of the non-compliance by
some reasonable means, this is the first time You have received notice of
non-compliance with this License from such Contributor, and You become
compliant prior to 30 days after Your receipt of the notice.
5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions, counter-claims,
and cross-claims) alleging that a Contributor Version directly or
indirectly infringes any patent, then the rights granted to You by any and
all Contributors for the Covered Software under Section 2.1 of this License
shall terminate.
5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user
license agreements (excluding distributors and resellers) which have been
validly granted by You or Your distributors under this License prior to
termination shall survive termination.
6. Disclaimer of Warranty
Covered Software is provided under this License on an “as is” basis, without
warranty of any kind, either expressed, implied, or statutory, including,
without limitation, warranties that the Covered Software is free of defects,
merchantable, fit for a particular purpose or non-infringing. The entire
risk as to the quality and performance of the Covered Software is with You.
Should any Covered Software prove defective in any respect, You (not any
Contributor) assume the cost of any necessary servicing, repair, or
correction. This disclaimer of warranty constitutes an essential part of this
License. No use of any Covered Software is authorized under this License
except under this disclaimer.
7. Limitation of Liability
Under no circumstances and under no legal theory, whether tort (including
negligence), contract, or otherwise, shall any Contributor, or anyone who
distributes Covered Software as permitted above, be liable to You for any
direct, indirect, special, incidental, or consequential damages of any
character including, without limitation, damages for lost profits, loss of
goodwill, work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses, even if such party shall have been
informed of the possibility of such damages. This limitation of liability
shall not apply to liability for death or personal injury resulting from such
party's negligence to the extent applicable law prohibits such limitation.
Some jurisdictions do not allow the exclusion or limitation of incidental or
consequential damages, so this exclusion and limitation may not apply to You.
8. Litigation
Any litigation relating to this License may be brought only in the courts of
a jurisdiction where the defendant maintains its principal place of business
and such litigation shall be governed by laws of that jurisdiction, without
reference to its conflict-of-law provisions. Nothing in this Section shall
prevent a party's ability to bring cross-claims or counter-claims.
9. Miscellaneous
This License represents the complete agreement concerning the subject matter
hereof. If any provision of this License is held to be unenforceable, such
provision shall be reformed only to the extent necessary to make it
enforceable. Any law or regulation which provides that the language of a
contract shall be construed against the drafter shall not be used to construe
this License against a Contributor.
10. Versions of the License
10.1. New Versions
Mozilla Foundation is the license steward. Except as provided in Section
10.3, no one other than the license steward has the right to modify or
publish new versions of this License. Each version will be given a
distinguishing version number.
10.2. Effect of New Versions
You may distribute the Covered Software under the terms of the version of
the License under which You originally received the Covered Software, or
under the terms of any subsequent version published by the license
steward.
10.3. Modified Versions
If you create software not governed by this License, and you want to
create a new license for such software, you may create and use a modified
version of this License if you rename the license and remove any
references to the name of the license steward (except to note that such
modified license differs from this License).
10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses
If You choose to distribute Source Code Form that is Incompatible With
Secondary Licenses under the terms of this version of the License, the
notice described in Exhibit B of this License must be attached.
Exhibit A - Source Code Form License Notice
This Source Code Form is subject to the
terms of the Mozilla Public License, v.
2.0. If a copy of the MPL was not
distributed with this file, You can
obtain one at
http://mozilla.org/MPL/2.0/.
If it is not possible or desirable to put the notice in a particular file, then
You may include the notice in a location (such as a LICENSE file in a relevant
directory) where a recipient would be likely to look for such a notice.
You may add additional accurate notices of copyright ownership.
Exhibit B - “Incompatible With Secondary Licenses” Notice
This Source Code Form is “Incompatible
With Secondary Licenses”, as defined by
the Mozilla Public License, v. 2.0.
@ -1,17 +0,0 @@
DEPS = $(go list -f '{{range .TestImports}}{{.}} {{end}}' ./...)
test:
go test -timeout=30s ./...
integ: test
INTEG_TESTS=yes go test -timeout=23s -run=Integ ./...
deps:
go get -d -v ./...
echo $(DEPS) | xargs -n1 go get -d
cov:
INTEG_TESTS=yes gocov test github.com/hashicorp/raft | gocov-html > /tmp/coverage.html
open /tmp/coverage.html
.PHONY: test cov integ deps
@ -1,107 +0,0 @@
raft [![Build Status](https://travis-ci.org/hashicorp/raft.png)](https://travis-ci.org/hashicorp/raft)
====
raft is a [Go](http://www.golang.org) library that manages a replicated
log and can be used with an FSM to manage replicated state machines. It
is a library for providing [consensus](http://en.wikipedia.org/wiki/Consensus_(computer_science)).
The use cases for such a library are far-reaching, as replicated state
machines are a key component of many distributed systems. They enable
building Consistent, Partition-Tolerant (CP) systems that also offer
limited fault tolerance.
## Building
If you wish to build raft you'll need Go version 1.2+ installed.
Please check your installation with:
```
go version
```
## Documentation
For complete documentation, see the associated [Godoc](http://godoc.org/github.com/hashicorp/raft).
To prevent complications with cgo, the primary backend `MDBStore` is in a separate repository,
called [raft-mdb](http://github.com/hashicorp/raft-mdb). That is the recommended implementation
for the `LogStore` and `StableStore`.
A pure Go backend using [BoltDB](https://github.com/boltdb/bolt) is also available, called
[raft-boltdb](https://github.com/hashicorp/raft-boltdb). It can also be used as a `LogStore`
and `StableStore`.
## Tagged Releases
As of September 2017, Hashicorp will start using tags for this library to clearly indicate
major version updates. We recommend you vendor your application's dependency on this library.
* v0.1.0 is the original stable version of the library that was in master and has been maintained
with no breaking API changes. This was in use by Consul prior to version 0.7.0.
* v1.0.0 takes the changes that were staged in the library-v2-stage-one branch. This version
manages server identities using a UUID, and so introduces some breaking API changes. It also versions
the Raft protocol, and requires some special steps when interoperating with Raft servers running
older versions of the library (see the detailed comment in config.go about version compatibility).
You can reference https://github.com/hashicorp/consul/pull/2222 for an idea of what was required
to port Consul to these new interfaces.
This version also includes some new features, such as non-voting servers, a new address
provider abstraction in the transport layer, and more resilient snapshots.
## Protocol
raft is based on ["Raft: In Search of an Understandable Consensus Algorithm"](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf)
A high level overview of the Raft protocol is described below, but for details please read the full
[Raft paper](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf)
followed by the raft source. Any questions about the raft protocol should be sent to the
[raft-dev mailing list](https://groups.google.com/forum/#!forum/raft-dev).
### Protocol Description
Raft nodes are always in one of three states: follower, candidate or leader. All
nodes initially start out as a follower. In this state, nodes can accept log entries
from a leader and cast votes. If no entries are received for some time, nodes
self-promote to the candidate state. In the candidate state nodes request votes from
their peers. If a candidate receives a quorum of votes, then it is promoted to a leader.
The leader must accept new log entries and replicate to all the other followers.
In addition, if stale reads are not acceptable, all queries must also be performed on
the leader.
Once a cluster has a leader, it is able to accept new log entries. A client can
request that a leader append a new log entry, which is an opaque binary blob to
Raft. The leader then writes the entry to durable storage and attempts to replicate
to a quorum of followers. Once the log entry is considered *committed*, it can be
*applied* to a finite state machine. The finite state machine is application specific,
and is implemented using an interface.
An obvious question relates to the unbounded nature of a replicated log. Raft provides
a mechanism by which the current state is snapshotted, and the log is compacted. Because
of the FSM abstraction, restoring the state of the FSM must result in the same state
as a replay of old logs. This allows Raft to capture the FSM state at a point in time,
and then remove all the logs that were used to reach that state. This is performed automatically
without user intervention, and prevents unbounded disk usage as well as minimizing
time spent replaying logs.
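To make the FSM contract in the two paragraphs above concrete, here is a hedged sketch of a tiny replicated key/value map (the `kvFSM`/`kvSnapshot` names and the JSON encoding are illustrative choices, not part of the library): committed entries mutate the map via `Apply`, while `Snapshot` and `Restore` support the log compaction just described.

```go
package kvfsm

import (
	"encoding/json"
	"io"
	"sync"

	"github.com/hashicorp/raft"
)

// kvFSM is a hypothetical finite state machine: a replicated map whose
// entries are set by applying committed log entries.
type kvFSM struct {
	mu   sync.Mutex
	data map[string]string
}

// Apply is invoked once a log entry is committed. The log data is opaque
// to Raft; here it is assumed to be a JSON-encoded key/value map.
func (f *kvFSM) Apply(l *raft.Log) interface{} {
	var kv map[string]string
	if err := json.Unmarshal(l.Data, &kv); err != nil {
		return err
	}
	f.mu.Lock()
	defer f.mu.Unlock()
	for k, v := range kv {
		f.data[k] = v
	}
	return nil
}

// Snapshot captures the FSM state at a point in time so the log can be compacted.
func (f *kvFSM) Snapshot() (raft.FSMSnapshot, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	copied := make(map[string]string, len(f.data))
	for k, v := range f.data {
		copied[k] = v
	}
	return &kvSnapshot{data: copied}, nil
}

// Restore replaces the FSM state from a snapshot; restoring must yield the
// same state as replaying the logs the snapshot replaced.
func (f *kvFSM) Restore(rc io.ReadCloser) error {
	defer rc.Close()
	data := make(map[string]string)
	if err := json.NewDecoder(rc).Decode(&data); err != nil {
		return err
	}
	f.mu.Lock()
	f.data = data
	f.mu.Unlock()
	return nil
}

// kvSnapshot persists a point-in-time copy of the map to a SnapshotSink.
type kvSnapshot struct {
	data map[string]string
}

func (s *kvSnapshot) Persist(sink raft.SnapshotSink) error {
	if err := json.NewEncoder(sink).Encode(s.data); err != nil {
		sink.Cancel()
		return err
	}
	return sink.Close()
}

func (s *kvSnapshot) Release() {}
```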
Lastly, there is the issue of updating the peer set when new servers are joining
or existing servers are leaving. As long as a quorum of nodes is available, this
is not an issue as Raft provides mechanisms to dynamically update the peer set.
If a quorum of nodes is unavailable, then this becomes a very challenging issue.
For example, suppose there are only 2 peers, A and B. The quorum size is also
2, meaning both nodes must agree to commit a log entry. If either A or B fails,
it is now impossible to reach quorum. This means the cluster is unable to add,
or remove a node, or commit any additional log entries. This results in *unavailability*.
At this point, manual intervention would be required to remove either A or B,
and to restart the remaining node in bootstrap mode.
A Raft cluster of 3 nodes can tolerate a single node failure, while a cluster
of 5 can tolerate 2 node failures. The recommended configuration is to either
run 3 or 5 raft servers. This maximizes availability without
greatly sacrificing performance.
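The arithmetic behind these cluster-size recommendations is plain majority quorum; a small sketch (the `quorum` helper is illustrative, not a library function):

```go
package main

import "fmt"

// quorum returns the number of nodes that must agree to commit an entry.
func quorum(n int) int { return n/2 + 1 }

func main() {
	for _, n := range []int{2, 3, 5} {
		fmt.Printf("cluster=%d quorum=%d tolerates=%d failure(s)\n",
			n, quorum(n), n-quorum(n))
	}
	// cluster=2 quorum=2 tolerates=0 failure(s)
	// cluster=3 quorum=2 tolerates=1 failure(s)
	// cluster=5 quorum=3 tolerates=2 failure(s)
}
```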
In terms of performance, Raft is comparable to Paxos. Assuming stable leadership,
committing a log entry requires a single round trip to half of the cluster.
Thus performance is bound by disk I/O and network latency.
@ -1,171 +0,0 @@
package raftbench
// raftbench provides common benchmarking functions which can be used by
// anything which implements the raft.LogStore and raft.StableStore interfaces.
// All functions accept these interfaces and perform benchmarking. This
// makes comparing backend performance easier by sharing the tests.
import (
"github.com/hashicorp/raft"
"testing"
)
func FirstIndex(b *testing.B, store raft.LogStore) {
// Create some fake data
var logs []*raft.Log
for i := 1; i < 10; i++ {
logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")})
}
if err := store.StoreLogs(logs); err != nil {
b.Fatalf("err: %s", err)
}
b.ResetTimer()
// Run FirstIndex a number of times
for n := 0; n < b.N; n++ {
store.FirstIndex()
}
}
func LastIndex(b *testing.B, store raft.LogStore) {
// Create some fake data
var logs []*raft.Log
for i := 1; i < 10; i++ {
logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")})
}
if err := store.StoreLogs(logs); err != nil {
b.Fatalf("err: %s", err)
}
b.ResetTimer()
// Run LastIndex a number of times
for n := 0; n < b.N; n++ {
store.LastIndex()
}
}
func GetLog(b *testing.B, store raft.LogStore) {
// Create some fake data
var logs []*raft.Log
for i := 1; i < 10; i++ {
logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")})
}
if err := store.StoreLogs(logs); err != nil {
b.Fatalf("err: %s", err)
}
b.ResetTimer()
// Run GetLog a number of times
for n := 0; n < b.N; n++ {
if err := store.GetLog(5, new(raft.Log)); err != nil {
b.Fatalf("err: %s", err)
}
}
}
func StoreLog(b *testing.B, store raft.LogStore) {
// Run StoreLog a number of times
for n := 0; n < b.N; n++ {
log := &raft.Log{Index: uint64(n), Data: []byte("data")}
if err := store.StoreLog(log); err != nil {
b.Fatalf("err: %s", err)
}
}
}
func StoreLogs(b *testing.B, store raft.LogStore) {
// Run StoreLogs a number of times. We want to set multiple logs each
// run, so we create 3 logs with incrementing indexes for each iteration.
for n := 0; n < b.N; n++ {
b.StopTimer()
offset := 3 * (n + 1)
logs := []*raft.Log{
&raft.Log{Index: uint64(offset - 2), Data: []byte("data")},
&raft.Log{Index: uint64(offset - 1), Data: []byte("data")},
&raft.Log{Index: uint64(offset), Data: []byte("data")},
}
b.StartTimer()
if err := store.StoreLogs(logs); err != nil {
b.Fatalf("err: %s", err)
}
}
}
func DeleteRange(b *testing.B, store raft.LogStore) {
// Create some fake data. In this case, we create 3 new log entries for each
// test case, and separate them by index in multiples of 10. This allows
// some room so that we can test deleting ranges with "extra" logs to
// ensure we stop going to the database once our max index is hit.
var logs []*raft.Log
for n := 0; n < b.N; n++ {
offset := 10 * n
for i := offset; i < offset+3; i++ {
logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")})
}
}
if err := store.StoreLogs(logs); err != nil {
b.Fatalf("err: %s", err)
}
b.ResetTimer()
// Delete a range of the data
for n := 0; n < b.N; n++ {
offset := 10 * n
if err := store.DeleteRange(uint64(offset), uint64(offset+9)); err != nil {
b.Fatalf("err: %s", err)
}
}
}
func Set(b *testing.B, store raft.StableStore) {
// Run Set a number of times
for n := 0; n < b.N; n++ {
if err := store.Set([]byte{byte(n)}, []byte("val")); err != nil {
b.Fatalf("err: %s", err)
}
}
}
func Get(b *testing.B, store raft.StableStore) {
// Create some fake data
for i := 1; i < 10; i++ {
if err := store.Set([]byte{byte(i)}, []byte("val")); err != nil {
b.Fatalf("err: %s", err)
}
}
b.ResetTimer()
// Run Get a number of times
for n := 0; n < b.N; n++ {
if _, err := store.Get([]byte{0x05}); err != nil {
b.Fatalf("err: %s", err)
}
}
}
func SetUint64(b *testing.B, store raft.StableStore) {
// Run SetUint64 a number of times
for n := 0; n < b.N; n++ {
if err := store.SetUint64([]byte{byte(n)}, uint64(n)); err != nil {
b.Fatalf("err: %s", err)
}
}
}
func GetUint64(b *testing.B, store raft.StableStore) {
// Create some fake data
for i := 0; i < 10; i++ {
if err := store.SetUint64([]byte{byte(i)}, uint64(i)); err != nil {
b.Fatalf("err: %s", err)
}
}
b.ResetTimer()
// Run GetUint64 a number of times
for n := 0; n < b.N; n++ {
if _, err := store.Get([]byte{0x05}); err != nil {
b.Fatalf("err: %s", err)
}
}
}
@ -1,84 +0,0 @@
package raft
// AppendEntriesRequest is the command used to append entries to the
// replicated log.
type AppendEntriesRequest struct {
// Provide the current term and leader
Term uint64
Leader []byte
// Provide the previous entries for integrity checking
PrevLogEntry uint64
PrevLogTerm uint64
// New entries to commit
Entries []*Log
// Commit index on the leader
LeaderCommitIndex uint64
}
// AppendEntriesResponse is the response returned from an
// AppendEntriesRequest.
type AppendEntriesResponse struct {
// Newer term if leader is out of date
Term uint64
// Last Log is a hint to help accelerate rebuilding slow nodes
LastLog uint64
// We may not succeed if we have a conflicting entry
Success bool
// There are scenarios where this request didn't succeed
// but there's no need to wait/back-off the next attempt.
NoRetryBackoff bool
}
// RequestVoteRequest is the command used by a candidate to ask a Raft peer
// for a vote in an election.
type RequestVoteRequest struct {
// Provide the term and our id
Term uint64
Candidate []byte
// Used to ensure safety
LastLogIndex uint64
LastLogTerm uint64
}
// RequestVoteResponse is the response returned from a RequestVoteRequest.
type RequestVoteResponse struct {
// Newer term if leader is out of date
Term uint64
// Return the peers, so that a node can shutdown on removal
Peers []byte
// Is the vote granted
Granted bool
}
// InstallSnapshotRequest is the command sent to a Raft peer to bootstrap its
// log (and state machine) from a snapshot on another peer.
type InstallSnapshotRequest struct {
Term uint64
Leader []byte
// These are the last index/term included in the snapshot
LastLogIndex uint64
LastLogTerm uint64
// Peer Set in the snapshot
Peers []byte
// Size of the snapshot
Size int64
}
// InstallSnapshotResponse is the response returned from an
// InstallSnapshotRequest.
type InstallSnapshotResponse struct {
Term uint64
Success bool
}
@ -1,136 +0,0 @@
package raft
import (
"fmt"
"io"
"log"
"time"
)
// Config provides any necessary configuration to
// the Raft server
type Config struct {
// HeartbeatTimeout specifies the time in follower state without
// a leader before we attempt an election.
HeartbeatTimeout time.Duration
// ElectionTimeout specifies the time in candidate state without
// a leader before we attempt an election.
ElectionTimeout time.Duration
// CommitTimeout controls the time without an Apply() operation
// before we heartbeat to ensure a timely commit. Due to random
// staggering, may be delayed as much as 2x this value.
CommitTimeout time.Duration
// MaxAppendEntries controls the maximum number of append entries
// to send at once. We want to strike a balance between efficiency
// and avoiding waste if the follower is going to reject because of
// an inconsistent log.
MaxAppendEntries int
// If we are a member of a cluster, and RemovePeer is invoked for the
// local node, then we forget all peers and transition into the follower state.
// If ShutdownOnRemove is set, we additionally shut down Raft. Otherwise,
// we can become a leader of a cluster containing only this node.
ShutdownOnRemove bool
// DisableBootstrapAfterElect is used to turn off EnableSingleNode
// after the node is elected. This is used to prevent self-election
// if the node is removed from the Raft cluster via RemovePeer. Setting
// it to false will keep the bootstrap mode, allowing the node to self-elect
// and potentially bootstrap a separate cluster.
DisableBootstrapAfterElect bool
// TrailingLogs controls how many logs we leave after a snapshot. This is
// used so that we can quickly replay logs on a follower instead of being
// forced to send an entire snapshot.
TrailingLogs uint64
// SnapshotInterval controls how often we check if we should perform a snapshot.
// We randomly stagger between this value and 2x this value to prevent the
// entire cluster from performing a snapshot at once.
SnapshotInterval time.Duration
// SnapshotThreshold controls how many outstanding logs there must be before
// we perform a snapshot. This is to prevent excessive snapshots when we can
// just replay a small set of logs.
SnapshotThreshold uint64
// EnableSingleNode allows for a single node mode of operation. This
// is false by default, which prevents a lone node from electing itself
// leader.
EnableSingleNode bool
// LeaderLeaseTimeout is used to control how long the "lease" lasts
// for being the leader without being able to contact a quorum
// of nodes. If we reach this interval without contact, we will
// step down as leader.
LeaderLeaseTimeout time.Duration
// StartAsLeader forces Raft to start in the leader state. This should
// never be used except for testing purposes, as it can cause a split-brain.
StartAsLeader bool
// NotifyCh is used to provide a channel that will be notified of leadership
// changes. Raft will block writing to this channel, so it should either be
// buffered or aggressively consumed.
NotifyCh chan<- bool
// LogOutput is used as a sink for logs, unless Logger is specified.
// Defaults to os.Stderr.
LogOutput io.Writer
// Logger is a user-provided logger. If nil, a logger writing to LogOutput
// is used.
Logger *log.Logger
}
// DefaultConfig returns a Config with usable defaults.
func DefaultConfig() *Config {
return &Config{
HeartbeatTimeout: 1000 * time.Millisecond,
ElectionTimeout: 1000 * time.Millisecond,
CommitTimeout: 50 * time.Millisecond,
MaxAppendEntries: 64,
ShutdownOnRemove: true,
DisableBootstrapAfterElect: true,
TrailingLogs: 10240,
SnapshotInterval: 120 * time.Second,
SnapshotThreshold: 8192,
EnableSingleNode: false,
LeaderLeaseTimeout: 500 * time.Millisecond,
}
}
// ValidateConfig is used to validate a sane configuration
func ValidateConfig(config *Config) error {
if config.HeartbeatTimeout < 5*time.Millisecond {
return fmt.Errorf("Heartbeat timeout is too low")
}
if config.ElectionTimeout < 5*time.Millisecond {
return fmt.Errorf("Election timeout is too low")
}
if config.CommitTimeout < time.Millisecond {
return fmt.Errorf("Commit timeout is too low")
}
if config.MaxAppendEntries <= 0 {
return fmt.Errorf("MaxAppendEntries must be positive")
}
if config.MaxAppendEntries > 1024 {
return fmt.Errorf("MaxAppendEntries is too large")
}
if config.SnapshotInterval < 5*time.Millisecond {
return fmt.Errorf("Snapshot interval is too low")
}
if config.LeaderLeaseTimeout < 5*time.Millisecond {
return fmt.Errorf("Leader lease timeout is too low")
}
if config.LeaderLeaseTimeout > config.HeartbeatTimeout {
return fmt.Errorf("Leader lease timeout cannot be larger than heartbeat timeout")
}
if config.ElectionTimeout < config.HeartbeatTimeout {
return fmt.Errorf("Election timeout must be equal or greater than Heartbeat Timeout")
}
return nil
}
@ -1,48 +0,0 @@
package raft
import (
"fmt"
"io"
)
// DiscardSnapshotStore is used to successfully snapshot while
// always discarding the snapshot. This is useful for when the
// log should be truncated but no snapshot should be retained.
// It should never be used in production, and is only
// suitable for testing.
type DiscardSnapshotStore struct{}
type DiscardSnapshotSink struct{}
// NewDiscardSnapshotStore is used to create a new DiscardSnapshotStore.
func NewDiscardSnapshotStore() *DiscardSnapshotStore {
return &DiscardSnapshotStore{}
}
func (d *DiscardSnapshotStore) Create(index, term uint64, peers []byte) (SnapshotSink, error) {
return &DiscardSnapshotSink{}, nil
}
func (d *DiscardSnapshotStore) List() ([]*SnapshotMeta, error) {
return nil, nil
}
func (d *DiscardSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) {
return nil, nil, fmt.Errorf("open is not supported")
}
func (d *DiscardSnapshotSink) Write(b []byte) (int, error) {
return len(b), nil
}
func (d *DiscardSnapshotSink) Close() error {
return nil
}
func (d *DiscardSnapshotSink) ID() string {
return "discard"
}
func (d *DiscardSnapshotSink) Cancel() error {
return nil
}
@ -1,17 +0,0 @@
package raft
import "testing"
func TestDiscardSnapshotStoreImpl(t *testing.T) {
var impl interface{} = &DiscardSnapshotStore{}
if _, ok := impl.(SnapshotStore); !ok {
t.Fatalf("DiscardSnapshotStore not a SnapshotStore")
}
}
func TestDiscardSnapshotSinkImpl(t *testing.T) {
var impl interface{} = &DiscardSnapshotSink{}
if _, ok := impl.(SnapshotSink); !ok {
t.Fatalf("DiscardSnapshotSink not a SnapshotSink")
}
}
@ -1,513 +0,0 @@
package raft
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"hash"
"hash/crc64"
"io"
"io/ioutil"
"log"
"os"
"path/filepath"
"runtime"
"sort"
"strings"
"time"
)
const (
testPath = "permTest"
snapPath = "snapshots"
metaFilePath = "meta.json"
stateFilePath = "state.bin"
tmpSuffix = ".tmp"
)
// FileSnapshotStore implements the SnapshotStore interface and allows
// snapshots to be made on the local disk.
type FileSnapshotStore struct {
path string
retain int
logger *log.Logger
}
type snapMetaSlice []*fileSnapshotMeta
// FileSnapshotSink implements SnapshotSink with a file.
type FileSnapshotSink struct {
store *FileSnapshotStore
logger *log.Logger
dir string
parentDir string
meta fileSnapshotMeta
stateFile *os.File
stateHash hash.Hash64
buffered *bufio.Writer
closed bool
}
// fileSnapshotMeta is stored on disk. We also put a CRC
// on disk so that we can verify the snapshot.
type fileSnapshotMeta struct {
SnapshotMeta
CRC []byte
}
// bufferedFile is returned when we open a snapshot. This way
// reads are buffered and the file still gets closed.
type bufferedFile struct {
bh *bufio.Reader
fh *os.File
}
func (b *bufferedFile) Read(p []byte) (n int, err error) {
return b.bh.Read(p)
}
func (b *bufferedFile) Close() error {
return b.fh.Close()
}
// NewFileSnapshotStoreWithLogger creates a new FileSnapshotStore based
// on a base directory. The `retain` parameter controls how many
// snapshots are retained. Must be at least 1.
func NewFileSnapshotStoreWithLogger(base string, retain int, logger *log.Logger) (*FileSnapshotStore, error) {
if retain < 1 {
return nil, fmt.Errorf("must retain at least one snapshot")
}
if logger == nil {
logger = log.New(os.Stderr, "", log.LstdFlags)
}
// Ensure our path exists
path := filepath.Join(base, snapPath)
if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) {
return nil, fmt.Errorf("snapshot path not accessible: %v", err)
}
// Setup the store
store := &FileSnapshotStore{
path: path,
retain: retain,
logger: logger,
}
// Do a permissions test
if err := store.testPermissions(); err != nil {
return nil, fmt.Errorf("permissions test failed: %v", err)
}
return store, nil
}
// NewFileSnapshotStore creates a new FileSnapshotStore based
// on a base directory. The `retain` parameter controls how many
// snapshots are retained. Must be at least 1.
func NewFileSnapshotStore(base string, retain int, logOutput io.Writer) (*FileSnapshotStore, error) {
if logOutput == nil {
logOutput = os.Stderr
}
return NewFileSnapshotStoreWithLogger(base, retain, log.New(logOutput, "", log.LstdFlags))
}
// testPermissions tries to touch a file in our path to see if it works.
func (f *FileSnapshotStore) testPermissions() error {
path := filepath.Join(f.path, testPath)
fh, err := os.Create(path)
if err != nil {
return err
}
if err = fh.Close(); err != nil {
return err
}
if err = os.Remove(path); err != nil {
return err
}
return nil
}
// snapshotName generates a name for the snapshot.
func snapshotName(term, index uint64) string {
now := time.Now()
msec := now.UnixNano() / int64(time.Millisecond)
return fmt.Sprintf("%d-%d-%d", term, index, msec)
}
// Create is used to start a new snapshot
func (f *FileSnapshotStore) Create(index, term uint64, peers []byte) (SnapshotSink, error) {
// Create a new path
name := snapshotName(term, index)
path := filepath.Join(f.path, name+tmpSuffix)
f.logger.Printf("[INFO] snapshot: Creating new snapshot at %s", path)
// Make the directory
if err := os.MkdirAll(path, 0755); err != nil {
f.logger.Printf("[ERR] snapshot: Failed to make snapshot directory: %v", err)
return nil, err
}
// Create the sink
sink := &FileSnapshotSink{
store: f,
logger: f.logger,
dir: path,
parentDir: f.path,
meta: fileSnapshotMeta{
SnapshotMeta: SnapshotMeta{
ID: name,
Index: index,
Term: term,
Peers: peers,
},
CRC: nil,
},
}
// Write out the meta data
if err := sink.writeMeta(); err != nil {
f.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err)
return nil, err
}
// Open the state file
statePath := filepath.Join(path, stateFilePath)
fh, err := os.Create(statePath)
if err != nil {
f.logger.Printf("[ERR] snapshot: Failed to create state file: %v", err)
return nil, err
}
sink.stateFile = fh
// Create a CRC64 hash
sink.stateHash = crc64.New(crc64.MakeTable(crc64.ECMA))
// Wrap both the hash and file in a MultiWriter with buffering
multi := io.MultiWriter(sink.stateFile, sink.stateHash)
sink.buffered = bufio.NewWriter(multi)
// Done
return sink, nil
}
// List returns available snapshots in the store.
func (f *FileSnapshotStore) List() ([]*SnapshotMeta, error) {
// Get the eligible snapshots
snapshots, err := f.getSnapshots()
if err != nil {
f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err)
return nil, err
}
var snapMeta []*SnapshotMeta
for _, meta := range snapshots {
snapMeta = append(snapMeta, &meta.SnapshotMeta)
if len(snapMeta) == f.retain {
break
}
}
return snapMeta, nil
}
// getSnapshots returns all the known snapshots.
func (f *FileSnapshotStore) getSnapshots() ([]*fileSnapshotMeta, error) {
// Get the eligible snapshots
snapshots, err := ioutil.ReadDir(f.path)
if err != nil {
f.logger.Printf("[ERR] snapshot: Failed to scan snapshot dir: %v", err)
return nil, err
}
// Populate the metadata
var snapMeta []*fileSnapshotMeta
for _, snap := range snapshots {
// Ignore any files
if !snap.IsDir() {
continue
}
// Ignore any temporary snapshots
dirName := snap.Name()
if strings.HasSuffix(dirName, tmpSuffix) {
f.logger.Printf("[WARN] snapshot: Found temporary snapshot: %v", dirName)
continue
}
// Try to read the meta data
meta, err := f.readMeta(dirName)
if err != nil {
f.logger.Printf("[WARN] snapshot: Failed to read metadata for %v: %v", dirName, err)
continue
}
// Append everything; the retain count is applied by List()
snapMeta = append(snapMeta, meta)
}
// Sort the snapshots in reverse order so we get newest -> oldest
sort.Sort(sort.Reverse(snapMetaSlice(snapMeta)))
return snapMeta, nil
}
// readMeta is used to read the meta data for a given named snapshot
func (f *FileSnapshotStore) readMeta(name string) (*fileSnapshotMeta, error) {
// Open the meta file
metaPath := filepath.Join(f.path, name, metaFilePath)
fh, err := os.Open(metaPath)
if err != nil {
return nil, err
}
defer fh.Close()
// Buffer the file IO
buffered := bufio.NewReader(fh)
// Read in the JSON
meta := &fileSnapshotMeta{}
dec := json.NewDecoder(buffered)
if err := dec.Decode(meta); err != nil {
return nil, err
}
return meta, nil
}
// Open takes a snapshot ID and returns a ReadCloser for that snapshot.
func (f *FileSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) {
// Get the metadata
meta, err := f.readMeta(id)
if err != nil {
f.logger.Printf("[ERR] snapshot: Failed to get meta data to open snapshot: %v", err)
return nil, nil, err
}
// Open the state file
statePath := filepath.Join(f.path, id, stateFilePath)
fh, err := os.Open(statePath)
if err != nil {
f.logger.Printf("[ERR] snapshot: Failed to open state file: %v", err)
return nil, nil, err
}
// Create a CRC64 hash
stateHash := crc64.New(crc64.MakeTable(crc64.ECMA))
// Compute the hash
_, err = io.Copy(stateHash, fh)
if err != nil {
f.logger.Printf("[ERR] snapshot: Failed to read state file: %v", err)
fh.Close()
return nil, nil, err
}
// Verify the hash
computed := stateHash.Sum(nil)
if !bytes.Equal(meta.CRC, computed) {
f.logger.Printf("[ERR] snapshot: CRC checksum failed (stored: %v computed: %v)",
meta.CRC, computed)
fh.Close()
return nil, nil, fmt.Errorf("CRC mismatch")
}
// Seek to the start
if _, err := fh.Seek(0, 0); err != nil {
f.logger.Printf("[ERR] snapshot: State file seek failed: %v", err)
fh.Close()
return nil, nil, err
}
// Return a buffered file
buffered := &bufferedFile{
bh: bufio.NewReader(fh),
fh: fh,
}
return &meta.SnapshotMeta, buffered, nil
}
// ReapSnapshots reaps any snapshots beyond the retain count.
func (f *FileSnapshotStore) ReapSnapshots() error {
snapshots, err := f.getSnapshots()
if err != nil {
f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err)
return err
}
for i := f.retain; i < len(snapshots); i++ {
path := filepath.Join(f.path, snapshots[i].ID)
f.logger.Printf("[INFO] snapshot: reaping snapshot %v", path)
if err := os.RemoveAll(path); err != nil {
f.logger.Printf("[ERR] snapshot: Failed to reap snapshot %v: %v", path, err)
return err
}
}
return nil
}
// ID returns the ID of the snapshot; it can be used with Open()
// after the snapshot is finalized.
func (s *FileSnapshotSink) ID() string {
return s.meta.ID
}
// Write is used to append to the state file. We write to the
// buffered IO object to reduce the number of context switches.
func (s *FileSnapshotSink) Write(b []byte) (int, error) {
return s.buffered.Write(b)
}
// Close is used to indicate a successful end.
func (s *FileSnapshotSink) Close() error {
// Make sure close is idempotent
if s.closed {
return nil
}
s.closed = true
// Close the open handles
if err := s.finalize(); err != nil {
s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err)
if delErr := os.RemoveAll(s.dir); delErr != nil {
s.logger.Printf("[ERR] snapshot: Failed to delete temporary snapshot at path %v: %v", s.dir, delErr)
return delErr
}
return err
}
// Write out the meta data
if err := s.writeMeta(); err != nil {
s.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err)
return err
}
// Move the directory into place
newPath := strings.TrimSuffix(s.dir, tmpSuffix)
if err := os.Rename(s.dir, newPath); err != nil {
s.logger.Printf("[ERR] snapshot: Failed to move snapshot into place: %v", err)
return err
}
if runtime.GOOS != "windows" { //skipping fsync for directory entry edits on Windows, only needed for *nix style file systems
parentFH, err := os.Open(s.parentDir)
defer parentFH.Close()
if err != nil {
s.logger.Printf("[ERR] snapshot: Failed to open snapshot parent directory %v, error: %v", s.parentDir, err)
return err
}
if err = parentFH.Sync(); err != nil {
s.logger.Printf("[ERR] snapshot: Failed syncing parent directory %v, error: %v", s.parentDir, err)
return err
}
}
// Reap any old snapshots
if err := s.store.ReapSnapshots(); err != nil {
return err
}
return nil
}
// Cancel is used to indicate an unsuccessful end.
func (s *FileSnapshotSink) Cancel() error {
// Make sure close is idempotent
if s.closed {
return nil
}
s.closed = true
// Close the open handles
if err := s.finalize(); err != nil {
s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err)
return err
}
// Attempt to remove all artifacts
return os.RemoveAll(s.dir)
}
// finalize is used to close all of our resources.
func (s *FileSnapshotSink) finalize() error {
// Flush any remaining data
if err := s.buffered.Flush(); err != nil {
return err
}
// Sync to force fsync to disk
if err := s.stateFile.Sync(); err != nil {
return err
}
// Get the file size
stat, statErr := s.stateFile.Stat()
// Close the file
if err := s.stateFile.Close(); err != nil {
return err
}
// Check the stat error now that the file is closed, then set the size
if statErr != nil {
return statErr
}
s.meta.Size = stat.Size()
// Set the CRC
s.meta.CRC = s.stateHash.Sum(nil)
return nil
}
// writeMeta is used to write out the metadata we have.
func (s *FileSnapshotSink) writeMeta() error {
// Open the meta file
metaPath := filepath.Join(s.dir, metaFilePath)
fh, err := os.Create(metaPath)
if err != nil {
return err
}
defer fh.Close()
// Buffer the file IO
buffered := bufio.NewWriter(fh)
// Write out as JSON
enc := json.NewEncoder(buffered)
if err := enc.Encode(&s.meta); err != nil {
return err
}
if err = buffered.Flush(); err != nil {
return err
}
if err = fh.Sync(); err != nil {
return err
}
return nil
}
// Implement the sort interface for []*fileSnapshotMeta.
func (s snapMetaSlice) Len() int {
return len(s)
}
func (s snapMetaSlice) Less(i, j int) bool {
if s[i].Term != s[j].Term {
return s[i].Term < s[j].Term
}
if s[i].Index != s[j].Index {
return s[i].Index < s[j].Index
}
return s[i].ID < s[j].ID
}
func (s snapMetaSlice) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}

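Putting the pieces above together: Create writes meta.json and opens state.bin inside a .tmp directory, Write feeds the buffered, CRC64-hashed state file, and Close fsyncs, finalizes the metadata with the size and CRC, renames the directory into place, and reaps snapshots beyond the retain count. A minimal sketch (import path assumed):

```go
package main

import (
	"log"
	"os"

	"github.com/hashicorp/raft"
)

func main() {
	// Retain at most 2 snapshots; older ones are reaped on Close().
	store, err := raft.NewFileSnapshotStore("/tmp/snaps", 2, os.Stderr)
	if err != nil {
		log.Fatal(err)
	}
	sink, err := store.Create(10, 3, []byte("peers")) // meta.json + state.bin in a .tmp dir
	if err != nil {
		log.Fatal(err)
	}
	sink.Write([]byte("application state"))
	if err := sink.Close(); err != nil { // fsync, final meta+CRC, rename, reap
		log.Fatal(err)
	}
	metas, _ := store.List()              // newest first, up to the retain count
	_, rc, err := store.Open(metas[0].ID) // CRC is verified before the reader is returned
	if err != nil {
		log.Fatal(err)
	}
	defer rc.Close()
}
```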
@ -1,343 +0,0 @@
package raft
import (
"bytes"
"io"
"io/ioutil"
"os"
"runtime"
"testing"
)
func FileSnapTest(t *testing.T) (string, *FileSnapshotStore) {
// Create a test dir
dir, err := ioutil.TempDir("", "raft")
if err != nil {
t.Fatalf("err: %v ", err)
}
snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
return dir, snap
}
func TestFileSnapshotStoreImpl(t *testing.T) {
var impl interface{} = &FileSnapshotStore{}
if _, ok := impl.(SnapshotStore); !ok {
t.Fatalf("FileSnapshotStore not a SnapshotStore")
}
}
func TestFileSnapshotSinkImpl(t *testing.T) {
var impl interface{} = &FileSnapshotSink{}
if _, ok := impl.(SnapshotSink); !ok {
t.Fatalf("FileSnapshotSink not a SnapshotSink")
}
}
func TestFileSS_CreateSnapshotMissingParentDir(t *testing.T) {
parent, err := ioutil.TempDir("", "raft")
if err != nil {
t.Fatalf("err: %v ", err)
}
defer os.RemoveAll(parent)
dir, err := ioutil.TempDir(parent, "raft")
if err != nil {
t.Fatalf("err: %v ", err)
}
snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
os.RemoveAll(parent)
peers := []byte("all my lovely friends")
_, err = snap.Create(10, 3, peers)
if err != nil {
t.Fatalf("should not fail when using non existing parent")
}
}
func TestFileSS_CreateSnapshot(t *testing.T) {
// Create a test dir
dir, err := ioutil.TempDir("", "raft")
if err != nil {
t.Fatalf("err: %v ", err)
}
defer os.RemoveAll(dir)
snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
// Check no snapshots
snaps, err := snap.List()
if err != nil {
t.Fatalf("err: %v", err)
}
if len(snaps) != 0 {
t.Fatalf("did not expect any snapshots: %v", snaps)
}
// Create a new sink
peers := []byte("all my lovely friends")
sink, err := snap.Create(10, 3, peers)
if err != nil {
t.Fatalf("err: %v", err)
}
// The sink is not done, should not be in a list!
snaps, err = snap.List()
if err != nil {
t.Fatalf("err: %v", err)
}
if len(snaps) != 0 {
t.Fatalf("did not expect any snapshots: %v", snaps)
}
// Write to the sink
_, err = sink.Write([]byte("first\n"))
if err != nil {
t.Fatalf("err: %v", err)
}
_, err = sink.Write([]byte("second\n"))
if err != nil {
t.Fatalf("err: %v", err)
}
// Done!
err = sink.Close()
if err != nil {
t.Fatalf("err: %v", err)
}
// Should have a snapshot!
snaps, err = snap.List()
if err != nil {
t.Fatalf("err: %v", err)
}
if len(snaps) != 1 {
t.Fatalf("expect a snapshots: %v", snaps)
}
// Check the latest
latest := snaps[0]
if latest.Index != 10 {
t.Fatalf("bad snapshot: %v", *latest)
}
if latest.Term != 3 {
t.Fatalf("bad snapshot: %v", *latest)
}
if !bytes.Equal(latest.Peers, peers) {
t.Fatalf("bad snapshot: %v", *latest)
}
if latest.Size != 13 {
t.Fatalf("bad snapshot: %v", *latest)
}
// Read the snapshot
_, r, err := snap.Open(latest.ID)
if err != nil {
t.Fatalf("err: %v", err)
}
// Read out everything
var buf bytes.Buffer
if _, err := io.Copy(&buf, r); err != nil {
t.Fatalf("err: %v", err)
}
if err := r.Close(); err != nil {
t.Fatalf("err: %v", err)
}
// Ensure a match
if !bytes.Equal(buf.Bytes(), []byte("first\nsecond\n")) {
t.Fatalf("content mismatch")
}
}
func TestFileSS_CancelSnapshot(t *testing.T) {
// Create a test dir
dir, err := ioutil.TempDir("", "raft")
if err != nil {
t.Fatalf("err: %v ", err)
}
defer os.RemoveAll(dir)
snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
// Create a new sink
peers := []byte("all my lovely friends")
sink, err := snap.Create(10, 3, peers)
if err != nil {
t.Fatalf("err: %v", err)
}
// Cancel the snapshot! Should delete
err = sink.Cancel()
if err != nil {
t.Fatalf("err: %v", err)
}
// The sink is canceled, should not be in a list!
snaps, err := snap.List()
if err != nil {
t.Fatalf("err: %v", err)
}
if len(snaps) != 0 {
t.Fatalf("did not expect any snapshots: %v", snaps)
}
}
func TestFileSS_Retention(t *testing.T) {
// Create a test dir
dir, err := ioutil.TempDir("", "raft")
if err != nil {
t.Fatalf("err: %v ", err)
}
defer os.RemoveAll(dir)
snap, err := NewFileSnapshotStoreWithLogger(dir, 2, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
// Create a new sink
peers := []byte("all my lovely friends")
// Create a few snapshots
for i := 10; i < 15; i++ {
sink, err := snap.Create(uint64(i), 3, peers)
if err != nil {
t.Fatalf("err: %v", err)
}
err = sink.Close()
if err != nil {
t.Fatalf("err: %v", err)
}
}
// Should only have 2 listed!
snaps, err := snap.List()
if err != nil {
t.Fatalf("err: %v", err)
}
if len(snaps) != 2 {
t.Fatalf("expect 2 snapshots: %v", snaps)
}
// Check they are the latest
if snaps[0].Index != 14 {
t.Fatalf("bad snap: %#v", *snaps[0])
}
if snaps[1].Index != 13 {
t.Fatalf("bad snap: %#v", *snaps[1])
}
}
func TestFileSS_BadPerm(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("skipping file permission test on windows")
}
// Create a temp dir
dir1, err := ioutil.TempDir("", "raft")
if err != nil {
t.Fatalf("err: %s", err)
}
defer os.RemoveAll(dir1)
// Create a sub dir and remove all permissions
dir2, err := ioutil.TempDir(dir1, "badperm")
if err != nil {
t.Fatalf("err: %s", err)
}
if err := os.Chmod(dir2, 000); err != nil {
t.Fatalf("err: %s", err)
}
defer os.Chmod(dir2, 0777) // Set perms back for delete
// Should fail
if _, err := NewFileSnapshotStore(dir2, 3, nil); err == nil {
t.Fatalf("should fail to use dir with bad perms")
}
}
func TestFileSS_MissingParentDir(t *testing.T) {
parent, err := ioutil.TempDir("", "raft")
if err != nil {
t.Fatalf("err: %v ", err)
}
defer os.RemoveAll(parent)
dir, err := ioutil.TempDir(parent, "raft")
if err != nil {
t.Fatalf("err: %v ", err)
}
os.RemoveAll(parent)
_, err = NewFileSnapshotStore(dir, 3, nil)
if err != nil {
t.Fatalf("should not fail when using non existing parent")
}
}
func TestFileSS_Ordering(t *testing.T) {
// Create a test dir
dir, err := ioutil.TempDir("", "raft")
if err != nil {
t.Fatalf("err: %v ", err)
}
defer os.RemoveAll(dir)
snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
// Create a new sink
peers := []byte("all my lovely friends")
sink, err := snap.Create(130350, 5, peers)
if err != nil {
t.Fatalf("err: %v", err)
}
err = sink.Close()
if err != nil {
t.Fatalf("err: %v", err)
}
sink, err = snap.Create(204917, 36, peers)
if err != nil {
t.Fatalf("err: %v", err)
}
err = sink.Close()
if err != nil {
t.Fatalf("err: %v", err)
}
// Should only have 2 listed!
snaps, err := snap.List()
if err != nil {
t.Fatalf("err: %v", err)
}
if len(snaps) != 2 {
t.Fatalf("expect 2 snapshots: %v", snaps)
}
// Check they are ordered
if snaps[0].Term != 36 {
t.Fatalf("bad snap: %#v", *snaps[0])
}
if snaps[1].Term != 5 {
t.Fatalf("bad snap: %#v", *snaps[1])
}
}

@ -1,40 +0,0 @@
package raft
import (
"io"
)
// FSM provides an interface that can be implemented by
// clients to make use of the replicated log.
type FSM interface {
// Apply log is invoked once a log entry is committed.
// It returns a value which will be made available in the
// ApplyFuture returned by Raft.Apply method if that
// method was called on the same Raft node as the FSM.
Apply(*Log) interface{}
// Snapshot is used to support log compaction. This call should
// return an FSMSnapshot which can be used to save a point-in-time
// snapshot of the FSM. Apply and Snapshot are not called in multiple
// threads, but Apply will be called concurrently with Persist. This means
// the FSM should be implemented in a fashion that allows for concurrent
// updates while a snapshot is happening.
Snapshot() (FSMSnapshot, error)
// Restore is used to restore an FSM from a snapshot. It is not called
// concurrently with any other command. The FSM must discard all previous
// state.
Restore(io.ReadCloser) error
}
// FSMSnapshot is returned by an FSM in response to a Snapshot call.
// It must be safe to invoke FSMSnapshot methods with concurrent
// calls to Apply.
type FSMSnapshot interface {
// Persist should dump all necessary state to the WriteCloser 'sink',
// and call sink.Close() when finished or call sink.Cancel() on error.
Persist(sink SnapshotSink) error
// Release is invoked when we are finished with the snapshot.
Release()
}

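A minimal FSM honouring the contract above, in particular that Apply may run concurrently with Persist, so Snapshot hands back a point-in-time copy rather than sharing live state. A sketch only; the type names are invented for illustration (import path assumed):

```go
package main

import (
	"fmt"
	"io"
	"sync"

	"github.com/hashicorp/raft"
)

// counterFSM counts applied log entries.
type counterFSM struct {
	mu    sync.Mutex
	count int
}

func (f *counterFSM) Apply(l *raft.Log) interface{} {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.count++
	return f.count // surfaced via ApplyFuture.Response()
}

func (f *counterFSM) Snapshot() (raft.FSMSnapshot, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	// Copy the state so Apply can keep running while Persist does.
	return &counterSnapshot{count: f.count}, nil
}

func (f *counterFSM) Restore(rc io.ReadCloser) error {
	defer rc.Close()
	f.mu.Lock()
	defer f.mu.Unlock()
	_, err := fmt.Fscanf(rc, "%d", &f.count) // discard all previous state
	return err
}

type counterSnapshot struct{ count int }

func (s *counterSnapshot) Persist(sink raft.SnapshotSink) error {
	if _, err := fmt.Fprintf(sink, "%d", s.count); err != nil {
		sink.Cancel()
		return err
	}
	return sink.Close()
}

func (s *counterSnapshot) Release() {}

func main() {
	// Compile-time interface checks, mirroring the package's own tests.
	var _ raft.FSM = &counterFSM{}
	var _ raft.FSMSnapshot = &counterSnapshot{}
}
```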
@ -1,203 +0,0 @@
package raft
import (
"sync"
"time"
)
// Future is used to represent an action that may occur in the future.
type Future interface {
// Error blocks until the future arrives and then
// returns the error status of the future.
// This may be called any number of times - all
// calls will return the same value.
// Note that it is not OK to call this method
// twice concurrently on the same Future instance.
Error() error
}
// ApplyFuture is used for Apply() and may return the FSM response.
type ApplyFuture interface {
Future
// Response returns the FSM response as returned
// by the FSM.Apply method. This must not be called
// until after the Error method has returned.
Response() interface{}
// Index holds the index of the newly applied log entry.
// This must not be called
// until after the Error method has returned.
Index() uint64
}
// errorFuture is used to return a static error.
type errorFuture struct {
err error
}
func (e errorFuture) Error() error {
return e.err
}
func (e errorFuture) Response() interface{} {
return nil
}
func (e errorFuture) Index() uint64 {
return 0
}
// deferError can be embedded to allow a future
// to provide an error in the future.
type deferError struct {
err error
errCh chan error
responded bool
}
func (d *deferError) init() {
d.errCh = make(chan error, 1)
}
func (d *deferError) Error() error {
if d.err != nil {
// Note that when we've received a nil error, this
// won't trigger, but the channel is closed after
// send so we'll still return nil below.
return d.err
}
if d.errCh == nil {
panic("waiting for response on nil channel")
}
d.err = <-d.errCh
return d.err
}
func (d *deferError) respond(err error) {
if d.errCh == nil {
return
}
if d.responded {
return
}
d.errCh <- err
close(d.errCh)
d.responded = true
}
// logFuture is used to apply a log entry and waits until
// the log is considered committed.
type logFuture struct {
deferError
log Log
policy quorumPolicy
response interface{}
dispatch time.Time
}
func (l *logFuture) Response() interface{} {
return l.response
}
func (l *logFuture) Index() uint64 {
return l.log.Index
}
type peerFuture struct {
deferError
peers []string
}
type shutdownFuture struct {
raft *Raft
}
func (s *shutdownFuture) Error() error {
if s.raft == nil {
return nil
}
s.raft.waitShutdown()
if closeable, ok := s.raft.trans.(WithClose); ok {
closeable.Close()
}
return nil
}
// snapshotFuture is used for waiting on a snapshot to complete.
type snapshotFuture struct {
deferError
}
// reqSnapshotFuture is used for requesting a snapshot start.
// It is only used internally.
type reqSnapshotFuture struct {
deferError
// snapshot details provided by the FSM runner before responding
index uint64
term uint64
peers []string
snapshot FSMSnapshot
}
// restoreFuture is used for requesting an FSM to perform a
// snapshot restore. Used internally only.
type restoreFuture struct {
deferError
ID string
}
// verifyFuture is used to verify the current node is still
// the leader. This is to prevent a stale read.
type verifyFuture struct {
deferError
notifyCh chan *verifyFuture
quorumSize int
votes int
voteLock sync.Mutex
}
// vote is used to respond to a verifyFuture.
// This may block when responding on the notifyCh.
func (v *verifyFuture) vote(leader bool) {
v.voteLock.Lock()
defer v.voteLock.Unlock()
// Guard against having notified already
if v.notifyCh == nil {
return
}
if leader {
v.votes++
if v.votes >= v.quorumSize {
v.notifyCh <- v
v.notifyCh = nil
}
} else {
v.notifyCh <- v
v.notifyCh = nil
}
}
// appendFuture is used for waiting on a pipelined append
// entries RPC.
type appendFuture struct {
deferError
start time.Time
args *AppendEntriesRequest
resp *AppendEntriesResponse
}
func (a *appendFuture) Start() time.Time {
return a.start
}
func (a *appendFuture) Request() *AppendEntriesRequest {
return a.args
}
func (a *appendFuture) Response() *AppendEntriesResponse {
return a.resp
}

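At call sites, the contract above means Error() must return before Response() or Index() may be called. A sketch of the usual pattern, assuming r is an already-running *raft.Raft (construction elided), as in the integration test later in this diff:

```go
// r is an already-running *raft.Raft node.
f := r.Apply([]byte("payload"), 0) // returns an ApplyFuture
if err := f.Error(); err != nil {  // blocks until the entry commits or fails
	log.Fatalf("apply failed: %v", err)
}
fmt.Printf("committed at index %d, FSM returned %v\n", f.Index(), f.Response())
```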
@ -1,42 +0,0 @@
package raft
import (
"errors"
"testing"
)
func TestDeferFutureSuccess(t *testing.T) {
var f deferError
f.init()
f.respond(nil)
if err := f.Error(); err != nil {
t.Fatalf("unexpected error result; got %#v want nil", err)
}
if err := f.Error(); err != nil {
t.Fatalf("unexpected error result; got %#v want nil", err)
}
}
func TestDeferFutureError(t *testing.T) {
want := errors.New("x")
var f deferError
f.init()
f.respond(want)
if got := f.Error(); got != want {
t.Fatalf("unexpected error result; got %#v want %#v", got, want)
}
if got := f.Error(); got != want {
t.Fatalf("unexpected error result; got %#v want %#v", got, want)
}
}
func TestDeferFutureConcurrent(t *testing.T) {
// Food for the race detector.
want := errors.New("x")
var f deferError
f.init()
go f.respond(want)
if got := f.Error(); got != want {
t.Errorf("unexpected error result; got %#v want %#v", got, want)
}
}

@ -1,213 +0,0 @@
package raft
import (
"container/list"
"sync"
)
// quorumPolicy allows individual logFutures to have different
// commitment rules while still using the inflight mechanism.
type quorumPolicy interface {
// Commit registers a commit from a given peer and checks whether it
// satisfies the commitment rules
Commit() bool
// IsCommitted checks whether the operation has been committed
IsCommitted() bool
}
// majorityQuorum is used by Apply transactions and requires
// a simple majority of nodes.
type majorityQuorum struct {
count int
votesNeeded int
}
func newMajorityQuorum(clusterSize int) *majorityQuorum {
votesNeeded := (clusterSize / 2) + 1
return &majorityQuorum{count: 0, votesNeeded: votesNeeded}
}
func (m *majorityQuorum) Commit() bool {
m.count++
return m.count >= m.votesNeeded
}
func (m *majorityQuorum) IsCommitted() bool {
return m.count >= m.votesNeeded
}
// Inflight is used to track operations that are still in-flight.
type inflight struct {
sync.Mutex
committed *list.List
commitCh chan struct{}
minCommit uint64
maxCommit uint64
operations map[uint64]*logFuture
stopCh chan struct{}
}
// newInflight returns an inflight struct that notifies
// the provided channel when logs are finished committing.
func newInflight(commitCh chan struct{}) *inflight {
return &inflight{
committed: list.New(),
commitCh: commitCh,
minCommit: 0,
maxCommit: 0,
operations: make(map[uint64]*logFuture),
stopCh: make(chan struct{}),
}
}
// Start is used to mark a logFuture as being inflight. It
// also commits the entry, as it is assumed the leader is
// starting.
func (i *inflight) Start(l *logFuture) {
i.Lock()
defer i.Unlock()
i.start(l)
}
// StartAll is used to mark a list of logFuture's as being
// inflight. It also commits each entry as the leader is
// assumed to be starting.
func (i *inflight) StartAll(logs []*logFuture) {
i.Lock()
defer i.Unlock()
for _, l := range logs {
i.start(l)
}
}
// start is used to mark a single entry as inflight,
// must be invoked with the lock held.
func (i *inflight) start(l *logFuture) {
idx := l.log.Index
i.operations[idx] = l
if idx > i.maxCommit {
i.maxCommit = idx
}
if i.minCommit == 0 {
i.minCommit = idx
}
i.commit(idx)
}
// Cancel is used to cancel all in-flight operations.
// This is done when the leader steps down, and all futures
// are sent the given error.
func (i *inflight) Cancel(err error) {
// Close the channel first to unblock any pending commits
close(i.stopCh)
// Lock after close to avoid deadlock
i.Lock()
defer i.Unlock()
// Respond to all inflight operations
for _, op := range i.operations {
op.respond(err)
}
// Clear all the committed but not processed
for e := i.committed.Front(); e != nil; e = e.Next() {
e.Value.(*logFuture).respond(err)
}
// Clear the map
i.operations = make(map[uint64]*logFuture)
// Clear the list of committed
i.committed = list.New()
// Close the commitCh
close(i.commitCh)
// Reset indexes
i.minCommit = 0
i.maxCommit = 0
}
// Committed returns all the committed operations in order.
func (i *inflight) Committed() (l *list.List) {
i.Lock()
l, i.committed = i.committed, list.New()
i.Unlock()
return l
}
// Commit is used by leader replication routines to indicate that
// a follower was finished committing a log to disk.
func (i *inflight) Commit(index uint64) {
i.Lock()
defer i.Unlock()
i.commit(index)
}
// CommitRange is used to commit a range of indexes inclusively.
// It is optimized to avoid commits for indexes that are not tracked.
func (i *inflight) CommitRange(minIndex, maxIndex uint64) {
i.Lock()
defer i.Unlock()
// Update the minimum index
minIndex = max(i.minCommit, minIndex)
// Commit each index
for idx := minIndex; idx <= maxIndex; idx++ {
i.commit(idx)
}
}
// commit is used to commit a single index. Must be called with the lock held.
func (i *inflight) commit(index uint64) {
op, ok := i.operations[index]
if !ok {
// Ignore if not in the map, as it may be committed already
return
}
// Check if we've satisfied the commit
if !op.policy.Commit() {
return
}
// Cannot commit if this is not the minimum inflight. This can happen
// if the quorum size changes, meaning a previous commit requires a larger
// quorum than this commit. We MUST block until the previous log is committed,
// otherwise logs will be applied out of order.
if index != i.minCommit {
return
}
NOTIFY:
// Add the operation to the committed list
i.committed.PushBack(op)
// Stop tracking since it is committed
delete(i.operations, index)
// Update the indexes
if index == i.maxCommit {
i.minCommit = 0
i.maxCommit = 0
} else {
i.minCommit++
}
// Check if the next in-flight operation is ready
if i.minCommit != 0 {
op = i.operations[i.minCommit]
if op.policy.IsCommitted() {
index = i.minCommit
goto NOTIFY
}
}
// Async notify of ready operations
asyncNotifyCh(i.commitCh)
}

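The quorum arithmetic above is worth spelling out: votesNeeded = (clusterSize / 2) + 1, so a 5-node cluster commits on the third vote. These types are unexported, so the sketch below only compiles inside the package, as an illustrative (hypothetical) test:

```go
package raft

import "testing"

func TestMajorityQuorumArithmetic(t *testing.T) {
	q := newMajorityQuorum(5) // votesNeeded = (5 / 2) + 1 = 3
	if q.Commit() {           // 1 vote
		t.Fatal("committed too early")
	}
	if q.Commit() { // 2 votes
		t.Fatal("committed too early")
	}
	if !q.Commit() { // 3 votes: majority reached
		t.Fatal("should be committed")
	}
	if !q.IsCommitted() {
		t.Fatal("should remain committed")
	}
}
```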
@ -1,150 +0,0 @@
package raft
import (
"fmt"
"testing"
)
func TestInflight_StartCommit(t *testing.T) {
commitCh := make(chan struct{}, 1)
in := newInflight(commitCh)
// Commit a transaction as being in flight
l := &logFuture{log: Log{Index: 1}}
l.policy = newMajorityQuorum(5)
in.Start(l)
// Commit 3 times
in.Commit(1)
if in.Committed().Len() != 0 {
t.Fatalf("should not be committed")
}
in.Commit(1)
if in.Committed().Len() != 1 {
t.Fatalf("should be committed")
}
// Already committed, but should work anyway
in.Commit(1)
}
func TestInflight_Cancel(t *testing.T) {
commitCh := make(chan struct{}, 1)
in := newInflight(commitCh)
// Commit a transaction as being in flight
l := &logFuture{
log: Log{Index: 1},
}
l.init()
l.policy = newMajorityQuorum(3)
in.Start(l)
// Cancel with an error
err := fmt.Errorf("error 1")
in.Cancel(err)
// Should get an error return
if l.Error() != err {
t.Fatalf("expected error")
}
}
func TestInflight_StartAll(t *testing.T) {
commitCh := make(chan struct{}, 1)
in := newInflight(commitCh)
// Commit a few transactions as being in flight
l1 := &logFuture{log: Log{Index: 2}}
l1.policy = newMajorityQuorum(5)
l2 := &logFuture{log: Log{Index: 3}}
l2.policy = newMajorityQuorum(5)
l3 := &logFuture{log: Log{Index: 4}}
l3.policy = newMajorityQuorum(5)
// Start all the entries
in.StartAll([]*logFuture{l1, l2, l3})
// Commit ranges
in.CommitRange(1, 5)
in.CommitRange(1, 4)
in.CommitRange(1, 10)
// Should get 3 back
if in.Committed().Len() != 3 {
t.Fatalf("expected all 3 to commit")
}
}
func TestInflight_CommitRange(t *testing.T) {
commitCh := make(chan struct{}, 1)
in := newInflight(commitCh)
// Commit a few transactions as being in flight
l1 := &logFuture{log: Log{Index: 2}}
l1.policy = newMajorityQuorum(5)
in.Start(l1)
l2 := &logFuture{log: Log{Index: 3}}
l2.policy = newMajorityQuorum(5)
in.Start(l2)
l3 := &logFuture{log: Log{Index: 4}}
l3.policy = newMajorityQuorum(5)
in.Start(l3)
// Commit ranges
in.CommitRange(1, 5)
in.CommitRange(1, 4)
in.CommitRange(1, 10)
// Should get 3 back
if in.Committed().Len() != 3 {
t.Fatalf("expected all 3 to commit")
}
}
// Should panic if we commit non-contiguously!
func TestInflight_NonContiguous(t *testing.T) {
commitCh := make(chan struct{}, 1)
in := newInflight(commitCh)
// Commit a few transactions as being in flight
l1 := &logFuture{log: Log{Index: 2}}
l1.policy = newMajorityQuorum(5)
in.Start(l1)
l2 := &logFuture{log: Log{Index: 3}}
l2.policy = newMajorityQuorum(5)
in.Start(l2)
in.Commit(3)
in.Commit(3)
in.Commit(3) // panic!
if in.Committed().Len() != 0 {
t.Fatalf("should not commit")
}
in.Commit(2)
in.Commit(2)
in.Commit(2) // panic!
committed := in.Committed()
if committed.Len() != 2 {
t.Fatalf("should commit both")
}
current := committed.Front()
l := current.Value.(*logFuture)
if l.log.Index != 2 {
t.Fatalf("bad: %v", *l)
}
current = current.Next()
l = current.Value.(*logFuture)
if l.log.Index != 3 {
t.Fatalf("bad: %v", *l)
}
}

@ -1,116 +0,0 @@
package raft
import (
"sync"
)
// InmemStore implements the LogStore and StableStore interfaces.
// It should NOT EVER be used for production. It is used only for
// unit tests. Use the MDBStore implementation instead.
type InmemStore struct {
l sync.RWMutex
lowIndex uint64
highIndex uint64
logs map[uint64]*Log
kv map[string][]byte
kvInt map[string]uint64
}
// NewInmemStore returns a new in-memory backend. Do not ever
// use for production. Only for testing.
func NewInmemStore() *InmemStore {
i := &InmemStore{
logs: make(map[uint64]*Log),
kv: make(map[string][]byte),
kvInt: make(map[string]uint64),
}
return i
}
// FirstIndex implements the LogStore interface.
func (i *InmemStore) FirstIndex() (uint64, error) {
i.l.RLock()
defer i.l.RUnlock()
return i.lowIndex, nil
}
// LastIndex implements the LogStore interface.
func (i *InmemStore) LastIndex() (uint64, error) {
i.l.RLock()
defer i.l.RUnlock()
return i.highIndex, nil
}
// GetLog implements the LogStore interface.
func (i *InmemStore) GetLog(index uint64, log *Log) error {
i.l.RLock()
defer i.l.RUnlock()
l, ok := i.logs[index]
if !ok {
return ErrLogNotFound
}
*log = *l
return nil
}
// StoreLog implements the LogStore interface.
func (i *InmemStore) StoreLog(log *Log) error {
return i.StoreLogs([]*Log{log})
}
// StoreLogs implements the LogStore interface.
func (i *InmemStore) StoreLogs(logs []*Log) error {
i.l.Lock()
defer i.l.Unlock()
for _, l := range logs {
i.logs[l.Index] = l
if i.lowIndex == 0 {
i.lowIndex = l.Index
}
if l.Index > i.highIndex {
i.highIndex = l.Index
}
}
return nil
}
// DeleteRange implements the LogStore interface.
func (i *InmemStore) DeleteRange(min, max uint64) error {
i.l.Lock()
defer i.l.Unlock()
for j := min; j <= max; j++ {
delete(i.logs, j)
}
i.lowIndex = max + 1
return nil
}
// Set implements the StableStore interface.
func (i *InmemStore) Set(key []byte, val []byte) error {
i.l.Lock()
defer i.l.Unlock()
i.kv[string(key)] = val
return nil
}
// Get implements the StableStore interface.
func (i *InmemStore) Get(key []byte) ([]byte, error) {
i.l.RLock()
defer i.l.RUnlock()
return i.kv[string(key)], nil
}
// SetUint64 implements the StableStore interface.
func (i *InmemStore) SetUint64(key []byte, val uint64) error {
i.l.Lock()
defer i.l.Unlock()
i.kvInt[string(key)] = val
return nil
}
// GetUint64 implements the StableStore interface.
func (i *InmemStore) GetUint64(key []byte) (uint64, error) {
i.l.RLock()
defer i.l.RUnlock()
return i.kvInt[string(key)], nil
}

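A minimal sketch of the store doing double duty as LogStore and StableStore, as the package's tests use it; the key name is illustrative only (import path assumed):

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/raft"
)

func main() {
	s := raft.NewInmemStore()

	// LogStore side.
	s.StoreLogs([]*raft.Log{
		{Index: 1, Term: 1, Type: raft.LogCommand, Data: []byte("a")},
		{Index: 2, Term: 1, Type: raft.LogCommand, Data: []byte("b")},
	})
	var out raft.Log
	if err := s.GetLog(2, &out); err != nil {
		log.Fatal(err)
	}
	first, _ := s.FirstIndex() // 1
	last, _ := s.LastIndex()   // 2

	// StableStore side; the key name is illustrative only.
	s.SetUint64([]byte("CurrentTerm"), 1)
	term, _ := s.GetUint64([]byte("CurrentTerm"))
	fmt.Println(first, last, term) // 1 2 1
}
```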
@ -1,324 +0,0 @@
package raft
import (
"fmt"
"io"
"sync"
"time"
)
// NewInmemAddr returns a new in-memory addr with
// a randomly generated UUID as the ID.
func NewInmemAddr() string {
return generateUUID()
}
// inmemPipeline is used to pipeline requests for the in-mem transport.
type inmemPipeline struct {
trans *InmemTransport
peer *InmemTransport
peerAddr string
doneCh chan AppendFuture
inprogressCh chan *inmemPipelineInflight
shutdown bool
shutdownCh chan struct{}
shutdownLock sync.Mutex
}
type inmemPipelineInflight struct {
future *appendFuture
respCh <-chan RPCResponse
}
// InmemTransport implements the Transport interface, to allow Raft to be
// tested in-memory without going over a network.
type InmemTransport struct {
sync.RWMutex
consumerCh chan RPC
localAddr string
peers map[string]*InmemTransport
pipelines []*inmemPipeline
timeout time.Duration
}
// NewInmemTransport is used to initialize a new transport
// and generates a random local address if none is specified
func NewInmemTransport(addr string) (string, *InmemTransport) {
if addr == "" {
addr = NewInmemAddr()
}
trans := &InmemTransport{
consumerCh: make(chan RPC, 16),
localAddr: addr,
peers: make(map[string]*InmemTransport),
timeout: 50 * time.Millisecond,
}
return addr, trans
}
// SetHeartbeatHandler is used to set an optional fast-path for
// heartbeats; it is not supported by this transport.
func (i *InmemTransport) SetHeartbeatHandler(cb func(RPC)) {
}
// Consumer implements the Transport interface.
func (i *InmemTransport) Consumer() <-chan RPC {
return i.consumerCh
}
// LocalAddr implements the Transport interface.
func (i *InmemTransport) LocalAddr() string {
return i.localAddr
}
// AppendEntriesPipeline returns an interface that can be used to pipeline
// AppendEntries requests.
func (i *InmemTransport) AppendEntriesPipeline(target string) (AppendPipeline, error) {
i.RLock()
peer, ok := i.peers[target]
i.RUnlock()
if !ok {
return nil, fmt.Errorf("failed to connect to peer: %v", target)
}
pipeline := newInmemPipeline(i, peer, target)
i.Lock()
i.pipelines = append(i.pipelines, pipeline)
i.Unlock()
return pipeline, nil
}
// AppendEntries implements the Transport interface.
func (i *InmemTransport) AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error {
rpcResp, err := i.makeRPC(target, args, nil, i.timeout)
if err != nil {
return err
}
// Copy the result back
out := rpcResp.Response.(*AppendEntriesResponse)
*resp = *out
return nil
}
// RequestVote implements the Transport interface.
func (i *InmemTransport) RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error {
rpcResp, err := i.makeRPC(target, args, nil, i.timeout)
if err != nil {
return err
}
// Copy the result back
out := rpcResp.Response.(*RequestVoteResponse)
*resp = *out
return nil
}
// InstallSnapshot implements the Transport interface.
func (i *InmemTransport) InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error {
rpcResp, err := i.makeRPC(target, args, data, 10*i.timeout)
if err != nil {
return err
}
// Copy the result back
out := rpcResp.Response.(*InstallSnapshotResponse)
*resp = *out
return nil
}
func (i *InmemTransport) makeRPC(target string, args interface{}, r io.Reader, timeout time.Duration) (rpcResp RPCResponse, err error) {
i.RLock()
peer, ok := i.peers[target]
i.RUnlock()
if !ok {
err = fmt.Errorf("failed to connect to peer: %v", target)
return
}
// Send the RPC over
respCh := make(chan RPCResponse)
peer.consumerCh <- RPC{
Command: args,
Reader: r,
RespChan: respCh,
}
// Wait for a response
select {
case rpcResp = <-respCh:
if rpcResp.Error != nil {
err = rpcResp.Error
}
case <-time.After(timeout):
err = fmt.Errorf("command timed out")
}
return
}
// EncodePeer implements the Transport interface. It uses the UUID as the
// address directly.
func (i *InmemTransport) EncodePeer(p string) []byte {
return []byte(p)
}
// DecodePeer implements the Transport interface. It returns the
// peer's UUID as the address directly.
func (i *InmemTransport) DecodePeer(buf []byte) string {
return string(buf)
}
// Connect is used to connect this transport to another transport for
// a given peer name. This allows for local routing.
func (i *InmemTransport) Connect(peer string, t Transport) {
trans := t.(*InmemTransport)
i.Lock()
defer i.Unlock()
i.peers[peer] = trans
}
// Disconnect is used to remove the ability to route to a given peer.
func (i *InmemTransport) Disconnect(peer string) {
i.Lock()
defer i.Unlock()
delete(i.peers, peer)
// Disconnect any pipelines
n := len(i.pipelines)
for idx := 0; idx < n; idx++ {
if i.pipelines[idx].peerAddr == peer {
i.pipelines[idx].Close()
i.pipelines[idx], i.pipelines[n-1] = i.pipelines[n-1], nil
idx--
n--
}
}
i.pipelines = i.pipelines[:n]
}
// DisconnectAll is used to remove all routes to peers.
func (i *InmemTransport) DisconnectAll() {
i.Lock()
defer i.Unlock()
i.peers = make(map[string]*InmemTransport)
// Handle pipelines
for _, pipeline := range i.pipelines {
pipeline.Close()
}
i.pipelines = nil
}
// Close is used to permanently disable the transport
func (i *InmemTransport) Close() error {
i.DisconnectAll()
return nil
}
func newInmemPipeline(trans *InmemTransport, peer *InmemTransport, addr string) *inmemPipeline {
i := &inmemPipeline{
trans: trans,
peer: peer,
peerAddr: addr,
doneCh: make(chan AppendFuture, 16),
inprogressCh: make(chan *inmemPipelineInflight, 16),
shutdownCh: make(chan struct{}),
}
go i.decodeResponses()
return i
}
func (i *inmemPipeline) decodeResponses() {
timeout := i.trans.timeout
for {
select {
case inp := <-i.inprogressCh:
var timeoutCh <-chan time.Time
if timeout > 0 {
timeoutCh = time.After(timeout)
}
select {
case rpcResp := <-inp.respCh:
// Copy the result back
*inp.future.resp = *rpcResp.Response.(*AppendEntriesResponse)
inp.future.respond(rpcResp.Error)
select {
case i.doneCh <- inp.future:
case <-i.shutdownCh:
return
}
case <-timeoutCh:
inp.future.respond(fmt.Errorf("command timed out"))
select {
case i.doneCh <- inp.future:
case <-i.shutdownCh:
return
}
case <-i.shutdownCh:
return
}
case <-i.shutdownCh:
return
}
}
}
func (i *inmemPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) {
// Create a new future
future := &appendFuture{
start: time.Now(),
args: args,
resp: resp,
}
future.init()
// Handle a timeout
var timeout <-chan time.Time
if i.trans.timeout > 0 {
timeout = time.After(i.trans.timeout)
}
// Send the RPC over
respCh := make(chan RPCResponse, 1)
rpc := RPC{
Command: args,
RespChan: respCh,
}
select {
case i.peer.consumerCh <- rpc:
case <-timeout:
return nil, fmt.Errorf("command enqueue timeout")
case <-i.shutdownCh:
return nil, ErrPipelineShutdown
}
// Send to be decoded
select {
case i.inprogressCh <- &inmemPipelineInflight{future, respCh}:
return future, nil
case <-i.shutdownCh:
return nil, ErrPipelineShutdown
}
}
func (i *inmemPipeline) Consumer() <-chan AppendFuture {
return i.doneCh
}
func (i *inmemPipeline) Close() error {
i.shutdownLock.Lock()
defer i.shutdownLock.Unlock()
if i.shutdown {
return nil
}
i.shutdown = true
close(i.shutdownCh)
return nil
}

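A minimal sketch wiring two of these transports together with Connect; a goroutine services the consumer channel so makeRPC's 50ms default timeout isn't hit. The Success field on AppendEntriesResponse is assumed from the wider package, not shown in this diff (import path assumed):

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/raft"
)

func main() {
	addr1, t1 := raft.NewInmemTransport("") // random UUID addresses
	addr2, t2 := raft.NewInmemTransport("")
	t1.Connect(addr2, t2)
	t2.Connect(addr1, t1)

	// Service t2's consumer channel so the RPC doesn't time out.
	go func() {
		rpc := <-t2.Consumer()
		rpc.RespChan <- raft.RPCResponse{Response: &raft.AppendEntriesResponse{Success: true}}
	}()

	var resp raft.AppendEntriesResponse
	if err := t1.AppendEntries(addr2, &raft.AppendEntriesRequest{}, &resp); err != nil {
		log.Fatal(err)
	}
	fmt.Println(resp.Success) // true
}
```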
@ -1,18 +0,0 @@
package raft
import (
"testing"
)
func TestInmemTransportImpl(t *testing.T) {
var inm interface{} = &InmemTransport{}
if _, ok := inm.(Transport); !ok {
t.Fatalf("InmemTransport is not a Transport")
}
if _, ok := inm.(LoopbackTransport); !ok {
t.Fatalf("InmemTransport is not a Loopback Transport")
}
if _, ok := inm.(WithPeers); !ok {
t.Fatalf("InmemTransport is not a WithPeers Transport")
}
}

@ -1,336 +0,0 @@
package raft
import (
"bytes"
"fmt"
"io/ioutil"
"log"
"os"
"testing"
"time"
)
// CheckInteg will skip a test if integration testing is not enabled.
func CheckInteg(t *testing.T) {
if !IsInteg() {
t.SkipNow()
}
}
// IsInteg returns a boolean telling you if we're in integ testing mode.
func IsInteg() bool {
return os.Getenv("INTEG_TESTS") != ""
}
type RaftEnv struct {
dir string
conf *Config
fsm *MockFSM
store *InmemStore
snapshot *FileSnapshotStore
peers *JSONPeers
trans *NetworkTransport
raft *Raft
logger *log.Logger
}
// Release shuts down and cleans up any stored data; it is not restartable after this
func (r *RaftEnv) Release() {
r.Shutdown()
os.RemoveAll(r.dir)
}
// Shutdown shuts down raft & transport, but keeps track of its data; it is restartable
// after a Shutdown() by calling Restart()
func (r *RaftEnv) Shutdown() {
r.logger.Printf("[WARN] Shutdown node at %v", r.raft.localAddr)
f := r.raft.Shutdown()
if err := f.Error(); err != nil {
panic(err)
}
r.trans.Close()
}
// Restart will start a raft node that was previously Shutdown()
func (r *RaftEnv) Restart(t *testing.T) {
trans, err := NewTCPTransport(r.raft.localAddr, nil, 2, time.Second, nil)
if err != nil {
t.Fatalf("err: %v", err)
}
r.trans = trans
r.logger.Printf("[INFO] Starting node at %v", trans.LocalAddr())
raft, err := NewRaft(r.conf, r.fsm, r.store, r.store, r.snapshot, r.peers, r.trans)
if err != nil {
t.Fatalf("err: %v", err)
}
r.raft = raft
}
func MakeRaft(t *testing.T, conf *Config) *RaftEnv {
// Set the config
if conf == nil {
conf = inmemConfig(t)
}
dir, err := ioutil.TempDir("", "raft")
if err != nil {
t.Fatalf("err: %v ", err)
}
stable := NewInmemStore()
snap, err := NewFileSnapshotStore(dir, 3, nil)
if err != nil {
t.Fatalf("err: %v", err)
}
env := &RaftEnv{
conf: conf,
dir: dir,
store: stable,
snapshot: snap,
fsm: &MockFSM{},
}
trans, err := NewTCPTransport("127.0.0.1:0", nil, 2, time.Second, nil)
if err != nil {
t.Fatalf("err: %v", err)
}
env.logger = log.New(os.Stdout, trans.LocalAddr()+" :", log.Lmicroseconds)
env.trans = trans
env.peers = NewJSONPeers(dir, trans)
env.logger.Printf("[INFO] Starting node at %v", trans.LocalAddr())
conf.Logger = env.logger
raft, err := NewRaft(conf, env.fsm, stable, stable, snap, env.peers, trans)
if err != nil {
t.Fatalf("err: %v", err)
}
env.raft = raft
return env
}
func WaitFor(env *RaftEnv, state RaftState) error {
limit := time.Now().Add(200 * time.Millisecond)
for env.raft.State() != state {
if time.Now().Before(limit) {
time.Sleep(10 * time.Millisecond)
} else {
return fmt.Errorf("failed to transition to state %v", state)
}
}
return nil
}
func WaitForAny(state RaftState, envs []*RaftEnv) (*RaftEnv, error) {
limit := time.Now().Add(200 * time.Millisecond)
CHECK:
for _, env := range envs {
if env.raft.State() == state {
return env, nil
}
}
if time.Now().Before(limit) {
goto WAIT
}
return nil, fmt.Errorf("failed to find node in %v state", state)
WAIT:
time.Sleep(10 * time.Millisecond)
goto CHECK
}
func WaitFuture(f Future, t *testing.T) error {
timer := time.AfterFunc(200*time.Millisecond, func() {
panic(fmt.Errorf("timeout waiting for future %v", f))
})
defer timer.Stop()
return f.Error()
}
func NoErr(err error, t *testing.T) {
if err != nil {
t.Fatalf("err: %v", err)
}
}
func CheckConsistent(envs []*RaftEnv, t *testing.T) {
limit := time.Now().Add(400 * time.Millisecond)
first := envs[0]
first.fsm.Lock()
defer first.fsm.Unlock()
var err error
CHECK:
l1 := len(first.fsm.logs)
for i := 1; i < len(envs); i++ {
env := envs[i]
env.fsm.Lock()
l2 := len(env.fsm.logs)
if l1 != l2 {
err = fmt.Errorf("log length mismatch %d %d", l1, l2)
env.fsm.Unlock()
goto ERR
}
for idx, log := range first.fsm.logs {
other := env.fsm.logs[idx]
if !bytes.Equal(log, other) {
err = fmt.Errorf("log entry %d mismatch between %s/%s : '%s' / '%s'", idx, first.raft.localAddr, env.raft.localAddr, log, other)
env.fsm.Unlock()
goto ERR
}
}
env.fsm.Unlock()
}
return
ERR:
if time.Now().After(limit) {
t.Fatalf("%v", err)
}
first.fsm.Unlock()
time.Sleep(20 * time.Millisecond)
first.fsm.Lock()
goto CHECK
}
// logBytes returns a log entry that's at least sz bytes long and has the prefix 'test i '
func logBytes(i, sz int) []byte {
var logBuffer bytes.Buffer
fmt.Fprintf(&logBuffer, "test %d ", i)
for logBuffer.Len() < sz {
logBuffer.WriteByte('x')
}
return logBuffer.Bytes()
}
// Tests Raft by creating a cluster, growing it to 5 nodes while
// causing various stressful conditions
func TestRaft_Integ(t *testing.T) {
CheckInteg(t)
conf := DefaultConfig()
conf.HeartbeatTimeout = 50 * time.Millisecond
conf.ElectionTimeout = 50 * time.Millisecond
conf.LeaderLeaseTimeout = 50 * time.Millisecond
conf.CommitTimeout = 5 * time.Millisecond
conf.SnapshotThreshold = 100
conf.TrailingLogs = 10
conf.EnableSingleNode = true
// Create a single node
env1 := MakeRaft(t, conf)
NoErr(WaitFor(env1, Leader), t)
totalApplied := 0
applyAndWait := func(leader *RaftEnv, n int, sz int) {
// Do some commits
var futures []ApplyFuture
for i := 0; i < n; i++ {
futures = append(futures, leader.raft.Apply(logBytes(i, sz), 0))
}
for _, f := range futures {
NoErr(WaitFuture(f, t), t)
leader.logger.Printf("[DEBUG] Applied at %d, size %d", f.Index(), sz)
}
totalApplied += n
}
// Do some commits
applyAndWait(env1, 100, 10)
// Do a snapshot
NoErr(WaitFuture(env1.raft.Snapshot(), t), t)
// Join a few nodes!
var envs []*RaftEnv
for i := 0; i < 4; i++ {
env := MakeRaft(t, conf)
addr := env.trans.LocalAddr()
NoErr(WaitFuture(env1.raft.AddPeer(addr), t), t)
envs = append(envs, env)
}
// Wait for a leader
leader, err := WaitForAny(Leader, append([]*RaftEnv{env1}, envs...))
NoErr(err, t)
// Do some more commits
applyAndWait(leader, 100, 10)
// snapshot the leader
NoErr(WaitFuture(leader.raft.Snapshot(), t), t)
CheckConsistent(append([]*RaftEnv{env1}, envs...), t)
// shutdown a follower
disconnected := envs[len(envs)-1]
disconnected.Shutdown()
// Do some more commits [make sure the resulting snapshot will be a reasonable size]
applyAndWait(leader, 100, 10000)
// snapshot the leader [the leader's log should now be compacted past the disconnected follower's log]
NoErr(WaitFuture(leader.raft.Snapshot(), t), t)
// Unfortunately we need to wait for the leader to start backing off RPCs to the down follower,
// such that when the follower comes back up it'll run an election before it gets an RPC from
// the leader
time.Sleep(time.Second * 5)
// start the now out of date follower back up
disconnected.Restart(t)
// wait for it to get caught up
timeout := time.Now().Add(time.Second * 10)
for disconnected.raft.getLastApplied() < leader.raft.getLastApplied() {
time.Sleep(time.Millisecond)
if time.Now().After(timeout) {
t.Fatalf("Gave up waiting for follower to get caught up to leader")
}
}
CheckConsistent(append([]*RaftEnv{env1}, envs...), t)
// Shoot two nodes in the head!
rm1, rm2 := envs[0], envs[1]
rm1.Release()
rm2.Release()
envs = envs[2:]
time.Sleep(10 * time.Millisecond)
// Wait for a leader
leader, err = WaitForAny(Leader, append([]*RaftEnv{env1}, envs...))
NoErr(err, t)
// Do some more commits
applyAndWait(leader, 100, 10)
// Join a few new nodes!
for i := 0; i < 2; i++ {
env := MakeRaft(t, conf)
addr := env.trans.LocalAddr()
NoErr(WaitFuture(leader.raft.AddPeer(addr), t), t)
envs = append(envs, env)
}
// Remove the old nodes
NoErr(WaitFuture(leader.raft.RemovePeer(rm1.raft.localAddr), t), t)
NoErr(WaitFuture(leader.raft.RemovePeer(rm2.raft.localAddr), t), t)
// Shoot the leader
env1.Release()
time.Sleep(3 * conf.HeartbeatTimeout)
// Wait for a leader
leader, err = WaitForAny(Leader, envs)
NoErr(err, t)
allEnvs := append([]*RaftEnv{env1}, envs...)
CheckConsistent(allEnvs, t)
if len(env1.fsm.logs) != totalApplied {
t.Fatalf("should apply %d logs! %d", totalApplied, len(env1.fsm.logs))
}
for _, e := range envs {
e.Release()
}
}

@ -1,67 +0,0 @@
package raft
// LogType describes various types of log entries.
type LogType uint8
const (
// LogCommand is applied to a user FSM.
LogCommand LogType = iota
// LogNoop is used to assert leadership.
LogNoop
// LogAddPeer is used to add a new peer.
LogAddPeer
// LogRemovePeer is used to remove an existing peer.
LogRemovePeer
// LogBarrier is used to ensure all preceding operations have been
// applied to the FSM. It is similar to LogNoop, but instead of returning
// once committed, it only returns once the FSM manager acks it. Otherwise
// it is possible there are operations committed but not yet applied to
// the FSM.
LogBarrier
)
// Log entries are replicated to all members of the Raft cluster
// and form the heart of the replicated state machine.
type Log struct {
// Index holds the index of the log entry.
Index uint64
// Term holds the election term of the log entry.
Term uint64
// Type holds the type of the log entry.
Type LogType
// Data holds the log entry's type-specific data.
Data []byte
// peer is not exported since it is not transmitted, only used
// internally to construct the Data field.
peer string
}
// LogStore is used to provide an interface for storing
// and retrieving logs in a durable fashion.
type LogStore interface {
// FirstIndex returns the first index written. 0 for no entries.
FirstIndex() (uint64, error)
// LastIndex returns the last index written. 0 for no entries.
LastIndex() (uint64, error)
// GetLog gets a log entry at a given index.
GetLog(index uint64, log *Log) error
// StoreLog stores a log entry.
StoreLog(log *Log) error
// StoreLogs stores multiple log entries.
StoreLogs(logs []*Log) error
// DeleteRange deletes a range of log entries. The range is inclusive.
DeleteRange(min, max uint64) error
}

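A minimal sketch constructing a command entry and storing it through the interface above; in a real cluster the leader assigns Index and Term (import path assumed):

```go
package main

import (
	"log"

	"github.com/hashicorp/raft"
)

func main() {
	// In a real cluster the leader assigns Index and Term.
	entry := &raft.Log{
		Index: 1,
		Term:  1,
		Type:  raft.LogCommand,
		Data:  []byte("SET x 1"),
	}
	var store raft.LogStore = raft.NewInmemStore() // any LogStore implementation
	if err := store.StoreLog(entry); err != nil {
		log.Fatal(err)
	}
}
```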
@ -1,79 +0,0 @@
package raft
import (
"fmt"
"sync"
)
// LogCache wraps any LogStore implementation to provide an
// in-memory ring buffer. This is used to cache access to
// the recently written entries. For implementations that do not
// cache themselves, this can provide a substantial boost by
// avoiding disk I/O on recent entries.
type LogCache struct {
store LogStore
cache []*Log
l sync.RWMutex
}
// NewLogCache is used to create a new LogCache with the
// given capacity and backend store.
func NewLogCache(capacity int, store LogStore) (*LogCache, error) {
if capacity <= 0 {
return nil, fmt.Errorf("capacity must be positive")
}
c := &LogCache{
store: store,
cache: make([]*Log, capacity),
}
return c, nil
}
func (c *LogCache) GetLog(idx uint64, log *Log) error {
// Check the buffer for an entry
c.l.RLock()
cached := c.cache[idx%uint64(len(c.cache))]
c.l.RUnlock()
// Check if entry is valid
if cached != nil && cached.Index == idx {
*log = *cached
return nil
}
// Forward request on cache miss
return c.store.GetLog(idx, log)
}
func (c *LogCache) StoreLog(log *Log) error {
return c.StoreLogs([]*Log{log})
}
func (c *LogCache) StoreLogs(logs []*Log) error {
// Insert the logs into the ring buffer
c.l.Lock()
for _, l := range logs {
c.cache[l.Index%uint64(len(c.cache))] = l
}
c.l.Unlock()
return c.store.StoreLogs(logs)
}
func (c *LogCache) FirstIndex() (uint64, error) {
return c.store.FirstIndex()
}
func (c *LogCache) LastIndex() (uint64, error) {
return c.store.LastIndex()
}
func (c *LogCache) DeleteRange(min, max uint64) error {
// Invalidate the cache on deletes
c.l.Lock()
c.cache = make([]*Log, len(c.cache))
c.l.Unlock()
return c.store.DeleteRange(min, max)
}

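A minimal sketch layering the cache over another LogStore: writes go through to the backing store, and reads of recent indexes are served from the ring buffer (import path assumed):

```go
package main

import (
	"log"

	"github.com/hashicorp/raft"
)

func main() {
	backing := raft.NewInmemStore()
	cached, err := raft.NewLogCache(128, backing) // capacity must be positive
	if err != nil {
		log.Fatal(err)
	}
	cached.StoreLog(&raft.Log{Index: 1, Term: 1}) // writes through to backing
	var out raft.Log
	if err := cached.GetLog(1, &out); err != nil { // served from the ring buffer
		log.Fatal(err)
	}
}
```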
@ -1,88 +0,0 @@
package raft
import (
"testing"
)
func TestLogCache(t *testing.T) {
store := NewInmemStore()
c, _ := NewLogCache(16, store)
// Insert into the in-mem store
for i := 0; i < 32; i++ {
log := &Log{Index: uint64(i) + 1}
store.StoreLog(log)
}
// Check the indexes
if idx, _ := c.FirstIndex(); idx != 1 {
t.Fatalf("bad: %d", idx)
}
if idx, _ := c.LastIndex(); idx != 32 {
t.Fatalf("bad: %d", idx)
}
// Try get log with a miss
var out Log
err := c.GetLog(1, &out)
if err != nil {
t.Fatalf("err: %v", err)
}
if out.Index != 1 {
t.Fatalf("bad: %#v", out)
}
// Store logs
l1 := &Log{Index: 33}
l2 := &Log{Index: 34}
err = c.StoreLogs([]*Log{l1, l2})
if err != nil {
t.Fatalf("err: %v", err)
}
if idx, _ := c.LastIndex(); idx != 34 {
t.Fatalf("bad: %d", idx)
}
// Check that it wrote-through
err = store.GetLog(33, &out)
if err != nil {
t.Fatalf("err: %v", err)
}
err = store.GetLog(34, &out)
if err != nil {
t.Fatalf("err: %v", err)
}
// Delete in the backend
err = store.DeleteRange(33, 34)
if err != nil {
t.Fatalf("err: %v", err)
}
// Should be in the ring buffer
err = c.GetLog(33, &out)
if err != nil {
t.Fatalf("err: %v", err)
}
err = c.GetLog(34, &out)
if err != nil {
t.Fatalf("err: %v", err)
}
// Purge the ring buffer
err = c.DeleteRange(33, 34)
if err != nil {
t.Fatalf("err: %v", err)
}
// Should not be in the ring buffer
err = c.GetLog(33, &out)
if err != ErrLogNotFound {
t.Fatalf("err: %v", err)
}
err = c.GetLog(34, &out)
if err != ErrLogNotFound {
t.Fatalf("err: %v", err)
}
}

@ -1,622 +0,0 @@
package raft
import (
"bufio"
"errors"
"fmt"
"io"
"log"
"net"
"os"
"sync"
"time"
"github.com/hashicorp/go-msgpack/codec"
)
const (
rpcAppendEntries uint8 = iota
rpcRequestVote
rpcInstallSnapshot
// DefaultTimeoutScale is the default TimeoutScale in a NetworkTransport.
DefaultTimeoutScale = 256 * 1024 // 256KB
// rpcMaxPipeline controls the maximum number of outstanding
// AppendEntries RPC calls.
rpcMaxPipeline = 128
)
var (
// ErrTransportShutdown is returned when operations on a transport are
// invoked after it's been terminated.
ErrTransportShutdown = errors.New("transport shutdown")
// ErrPipelineShutdown is returned when the pipeline is closed.
ErrPipelineShutdown = errors.New("append pipeline closed")
)
/*
NetworkTransport provides a network based transport that can be
used to communicate with Raft on remote machines. It requires
an underlying stream layer to provide a stream abstraction, which can
be simple TCP, TLS, etc.
This transport is very simple and lightweight. Each RPC request is
framed by sending a byte that indicates the message type, followed
by the MsgPack encoded request.
The response is an error string followed by the response object,
both encoded using MsgPack.
InstallSnapshot is special, in that after the RPC request we stream
the entire state. That socket is not re-used as the connection state
is not known if there is an error.
*/
type NetworkTransport struct {
connPool map[string][]*netConn
connPoolLock sync.Mutex
consumeCh chan RPC
heartbeatFn func(RPC)
heartbeatFnLock sync.Mutex
logger *log.Logger
maxPool int
shutdown bool
shutdownCh chan struct{}
shutdownLock sync.Mutex
stream StreamLayer
timeout time.Duration
TimeoutScale int
}
// StreamLayer is used with the NetworkTransport to provide
// the low level stream abstraction.
type StreamLayer interface {
net.Listener
// Dial is used to create a new outgoing connection
Dial(address string, timeout time.Duration) (net.Conn, error)
}
type netConn struct {
target string
conn net.Conn
r *bufio.Reader
w *bufio.Writer
dec *codec.Decoder
enc *codec.Encoder
}
func (n *netConn) Release() error {
return n.conn.Close()
}
type netPipeline struct {
conn *netConn
trans *NetworkTransport
doneCh chan AppendFuture
inprogressCh chan *appendFuture
shutdown bool
shutdownCh chan struct{}
shutdownLock sync.Mutex
}
// NewNetworkTransport creates a new network transport with the given dialer
// and listener. The maxPool controls how many connections we will pool. The
// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply
// the timeout by (SnapshotSize / TimeoutScale).
func NewNetworkTransport(
stream StreamLayer,
maxPool int,
timeout time.Duration,
logOutput io.Writer,
) *NetworkTransport {
if logOutput == nil {
logOutput = os.Stderr
}
return NewNetworkTransportWithLogger(stream, maxPool, timeout, log.New(logOutput, "", log.LstdFlags))
}
// NewNetworkTransportWithLogger creates a new network transport with the given dialer
// and listener. The maxPool controls how many connections we will pool. The
// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply
// the timeout by (SnapshotSize / TimeoutScale).
func NewNetworkTransportWithLogger(
stream StreamLayer,
maxPool int,
timeout time.Duration,
logger *log.Logger,
) *NetworkTransport {
if logger == nil {
logger = log.New(os.Stderr, "", log.LstdFlags)
}
trans := &NetworkTransport{
connPool: make(map[string][]*netConn),
consumeCh: make(chan RPC),
logger: logger,
maxPool: maxPool,
shutdownCh: make(chan struct{}),
stream: stream,
timeout: timeout,
TimeoutScale: DefaultTimeoutScale,
}
go trans.listen()
return trans
}
// SetHeartbeatHandler is used to set up a heartbeat handler
// as a fast path. This is to avoid head-of-line blocking from
// disk IO.
func (n *NetworkTransport) SetHeartbeatHandler(cb func(rpc RPC)) {
n.heartbeatFnLock.Lock()
defer n.heartbeatFnLock.Unlock()
n.heartbeatFn = cb
}
// Close is used to stop the network transport.
func (n *NetworkTransport) Close() error {
n.shutdownLock.Lock()
defer n.shutdownLock.Unlock()
if !n.shutdown {
close(n.shutdownCh)
n.stream.Close()
n.shutdown = true
}
return nil
}
// Consumer implements the Transport interface.
func (n *NetworkTransport) Consumer() <-chan RPC {
return n.consumeCh
}
// LocalAddr implements the Transport interface.
func (n *NetworkTransport) LocalAddr() string {
return n.stream.Addr().String()
}
// IsShutdown is used to check if the transport is shutdown.
func (n *NetworkTransport) IsShutdown() bool {
select {
case <-n.shutdownCh:
return true
default:
return false
}
}
// getPooledConn is used to grab a pooled connection.
func (n *NetworkTransport) getPooledConn(target string) *netConn {
n.connPoolLock.Lock()
defer n.connPoolLock.Unlock()
conns, ok := n.connPool[target]
if !ok || len(conns) == 0 {
return nil
}
var conn *netConn
num := len(conns)
conn, conns[num-1] = conns[num-1], nil
n.connPool[target] = conns[:num-1]
return conn
}
// getConn is used to get a connection from the pool.
func (n *NetworkTransport) getConn(target string) (*netConn, error) {
// Check for a pooled conn
if conn := n.getPooledConn(target); conn != nil {
return conn, nil
}
// Dial a new connection
conn, err := n.stream.Dial(target, n.timeout)
if err != nil {
return nil, err
}
// Wrap the conn
netConn := &netConn{
target: target,
conn: conn,
r: bufio.NewReader(conn),
w: bufio.NewWriter(conn),
}
// Setup encoder/decoders
netConn.dec = codec.NewDecoder(netConn.r, &codec.MsgpackHandle{})
netConn.enc = codec.NewEncoder(netConn.w, &codec.MsgpackHandle{})
// Done
return netConn, nil
}
// returnConn returns a connection back to the pool.
func (n *NetworkTransport) returnConn(conn *netConn) {
n.connPoolLock.Lock()
defer n.connPoolLock.Unlock()
key := conn.target
conns := n.connPool[key]
if !n.IsShutdown() && len(conns) < n.maxPool {
n.connPool[key] = append(conns, conn)
} else {
conn.Release()
}
}
// AppendEntriesPipeline returns an interface that can be used to pipeline
// AppendEntries requests.
func (n *NetworkTransport) AppendEntriesPipeline(target string) (AppendPipeline, error) {
// Get a connection
conn, err := n.getConn(target)
if err != nil {
return nil, err
}
// Create the pipeline
return newNetPipeline(n, conn), nil
}
// AppendEntries implements the Transport interface.
func (n *NetworkTransport) AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error {
return n.genericRPC(target, rpcAppendEntries, args, resp)
}
// RequestVote implements the Transport interface.
func (n *NetworkTransport) RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error {
return n.genericRPC(target, rpcRequestVote, args, resp)
}
// genericRPC handles a simple request/response RPC.
func (n *NetworkTransport) genericRPC(target string, rpcType uint8, args interface{}, resp interface{}) error {
// Get a conn
conn, err := n.getConn(target)
if err != nil {
return err
}
// Set a deadline
if n.timeout > 0 {
conn.conn.SetDeadline(time.Now().Add(n.timeout))
}
// Send the RPC
if err = sendRPC(conn, rpcType, args); err != nil {
return err
}
// Decode the response
canReturn, err := decodeResponse(conn, resp)
if canReturn {
n.returnConn(conn)
}
return err
}
// InstallSnapshot implements the Transport interface.
func (n *NetworkTransport) InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error {
// Get a conn, always close for InstallSnapshot
conn, err := n.getConn(target)
if err != nil {
return err
}
defer conn.Release()
// Set a deadline, scaled by request size
if n.timeout > 0 {
timeout := n.timeout * time.Duration(args.Size/int64(n.TimeoutScale))
if timeout < n.timeout {
timeout = n.timeout
}
conn.conn.SetDeadline(time.Now().Add(timeout))
}
// Send the RPC
if err = sendRPC(conn, rpcInstallSnapshot, args); err != nil {
return err
}
// Stream the state
if _, err = io.Copy(conn.w, data); err != nil {
return err
}
// Flush
if err = conn.w.Flush(); err != nil {
return err
}
// Decode the response, do not return conn
_, err = decodeResponse(conn, resp)
return err
}
// EncodePeer implements the Transport interface.
func (n *NetworkTransport) EncodePeer(p string) []byte {
return []byte(p)
}
// DecodePeer implements the Transport interface.
func (n *NetworkTransport) DecodePeer(buf []byte) string {
return string(buf)
}
// listen is used to handle incoming connections.
func (n *NetworkTransport) listen() {
for {
// Accept incoming connections
conn, err := n.stream.Accept()
if err != nil {
if n.IsShutdown() {
return
}
n.logger.Printf("[ERR] raft-net: Failed to accept connection: %v", err)
continue
}
n.logger.Printf("[DEBUG] raft-net: %v accepted connection from: %v", n.LocalAddr(), conn.RemoteAddr())
// Handle the connection in dedicated routine
go n.handleConn(conn)
}
}
// handleConn is used to handle an inbound connection for its lifespan.
func (n *NetworkTransport) handleConn(conn net.Conn) {
defer conn.Close()
r := bufio.NewReader(conn)
w := bufio.NewWriter(conn)
dec := codec.NewDecoder(r, &codec.MsgpackHandle{})
enc := codec.NewEncoder(w, &codec.MsgpackHandle{})
for {
if err := n.handleCommand(r, dec, enc); err != nil {
if err != io.EOF {
n.logger.Printf("[ERR] raft-net: Failed to decode incoming command: %v", err)
}
return
}
if err := w.Flush(); err != nil {
n.logger.Printf("[ERR] raft-net: Failed to flush response: %v", err)
return
}
}
}
// handleCommand is used to decode and dispatch a single command.
func (n *NetworkTransport) handleCommand(r *bufio.Reader, dec *codec.Decoder, enc *codec.Encoder) error {
// Get the rpc type
rpcType, err := r.ReadByte()
if err != nil {
return err
}
// Create the RPC object
respCh := make(chan RPCResponse, 1)
rpc := RPC{
RespChan: respCh,
}
// Decode the command
isHeartbeat := false
switch rpcType {
case rpcAppendEntries:
var req AppendEntriesRequest
if err := dec.Decode(&req); err != nil {
return err
}
rpc.Command = &req
// Check if this is a heartbeat
if req.Term != 0 && req.Leader != nil &&
req.PrevLogEntry == 0 && req.PrevLogTerm == 0 &&
len(req.Entries) == 0 && req.LeaderCommitIndex == 0 {
isHeartbeat = true
}
case rpcRequestVote:
var req RequestVoteRequest
if err := dec.Decode(&req); err != nil {
return err
}
rpc.Command = &req
case rpcInstallSnapshot:
var req InstallSnapshotRequest
if err := dec.Decode(&req); err != nil {
return err
}
rpc.Command = &req
rpc.Reader = io.LimitReader(r, req.Size)
default:
return fmt.Errorf("unknown rpc type %d", rpcType)
}
// Check for heartbeat fast-path
if isHeartbeat {
n.heartbeatFnLock.Lock()
fn := n.heartbeatFn
n.heartbeatFnLock.Unlock()
if fn != nil {
fn(rpc)
goto RESP
}
}
// Dispatch the RPC
select {
case n.consumeCh <- rpc:
case <-n.shutdownCh:
return ErrTransportShutdown
}
// Wait for response
RESP:
select {
case resp := <-respCh:
// Send the error first
respErr := ""
if resp.Error != nil {
respErr = resp.Error.Error()
}
if err := enc.Encode(respErr); err != nil {
return err
}
// Send the response
if err := enc.Encode(resp.Response); err != nil {
return err
}
case <-n.shutdownCh:
return ErrTransportShutdown
}
return nil
}
// decodeResponse is used to decode an RPC response and reports whether
// the connection can be reused.
func decodeResponse(conn *netConn, resp interface{}) (bool, error) {
// Decode the error if any
var rpcError string
if err := conn.dec.Decode(&rpcError); err != nil {
conn.Release()
return false, err
}
// Decode the response
if err := conn.dec.Decode(resp); err != nil {
conn.Release()
return false, err
}
// Format an error if any
if rpcError != "" {
return true, fmt.Errorf(rpcError)
}
return true, nil
}
// sendRPC is used to encode and send the RPC.
func sendRPC(conn *netConn, rpcType uint8, args interface{}) error {
// Write the request type
if err := conn.w.WriteByte(rpcType); err != nil {
conn.Release()
return err
}
// Send the request
if err := conn.enc.Encode(args); err != nil {
conn.Release()
return err
}
// Flush
if err := conn.w.Flush(); err != nil {
conn.Release()
return err
}
return nil
}
// newNetPipeline is used to construct a netPipeline from a given
// transport and connection.
func newNetPipeline(trans *NetworkTransport, conn *netConn) *netPipeline {
n := &netPipeline{
conn: conn,
trans: trans,
doneCh: make(chan AppendFuture, rpcMaxPipeline),
inprogressCh: make(chan *appendFuture, rpcMaxPipeline),
shutdownCh: make(chan struct{}),
}
go n.decodeResponses()
return n
}
// decodeResponses is a long running routine that decodes the responses
// sent on the connection.
func (n *netPipeline) decodeResponses() {
timeout := n.trans.timeout
for {
select {
case future := <-n.inprogressCh:
if timeout > 0 {
n.conn.conn.SetReadDeadline(time.Now().Add(timeout))
}
_, err := decodeResponse(n.conn, future.resp)
future.respond(err)
select {
case n.doneCh <- future:
case <-n.shutdownCh:
return
}
case <-n.shutdownCh:
return
}
}
}
// AppendEntries is used to pipeline a new append entries request.
func (n *netPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) {
// Create a new future
future := &appendFuture{
start: time.Now(),
args: args,
resp: resp,
}
future.init()
// Add a send timeout
if timeout := n.trans.timeout; timeout > 0 {
n.conn.conn.SetWriteDeadline(time.Now().Add(timeout))
}
// Send the RPC
if err := sendRPC(n.conn, rpcAppendEntries, future.args); err != nil {
return nil, err
}
// Hand off for decoding; this can also cause back-pressure
// to prevent too many inflight requests
select {
case n.inprogressCh <- future:
return future, nil
case <-n.shutdownCh:
return nil, ErrPipelineShutdown
}
}
// Consumer returns a channel that can be used to consume complete futures.
func (n *netPipeline) Consumer() <-chan AppendFuture {
return n.doneCh
}
// Close is used to shut down the pipeline connection.
func (n *netPipeline) Close() error {
n.shutdownLock.Lock()
defer n.shutdownLock.Unlock()
if n.shutdown {
return nil
}
// Release the connection
n.conn.Release()
n.shutdown = true
close(n.shutdownCh)
return nil
}
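
The doc comment on `NetworkTransport` above describes the wire format: one type byte, then the msgpack-encoded request; the response is a msgpack error string followed by the response object. Below is a minimal, send-only sketch of that framing, assuming the pre-v1 `github.com/hashicorp/raft` import path and an assumed node address; decoding the reply is omitted.

```go
package main

import (
	"bufio"
	"log"
	"net"
	"time"

	"github.com/hashicorp/go-msgpack/codec"
	"github.com/hashicorp/raft"
)

// sendRequestVote frames a single RequestVote RPC by hand: one type byte
// (rpcRequestVote is 1 in the iota block above), then the msgpack body.
func sendRequestVote(addr string, req *raft.RequestVoteRequest) error {
	conn, err := net.DialTimeout("tcp", addr, time.Second)
	if err != nil {
		return err
	}
	defer conn.Close()

	w := bufio.NewWriter(conn)
	if err := w.WriteByte(1); err != nil { // rpcRequestVote
		return err
	}
	enc := codec.NewEncoder(w, &codec.MsgpackHandle{})
	if err := enc.Encode(req); err != nil {
		return err
	}
	return w.Flush()
}

func main() {
	// Assumed addresses; an error is expected if nothing is listening.
	req := &raft.RequestVoteRequest{Term: 1, Candidate: []byte("127.0.0.1:4003")}
	if err := sendRequestVote("127.0.0.1:4002", req); err != nil {
		log.Fatal(err)
	}
}
```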

@ -1,449 +0,0 @@
package raft
import (
"bytes"
"reflect"
"sync"
"testing"
"time"
)
func TestNetworkTransport_StartStop(t *testing.T) {
trans, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
trans.Close()
}
func TestNetworkTransport_Heartbeat_FastPath(t *testing.T) {
// Transport 1 is consumer
trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans1.Close()
// Make the RPC request
args := AppendEntriesRequest{
Term: 10,
Leader: []byte("cartman"),
}
resp := AppendEntriesResponse{
Term: 4,
LastLog: 90,
Success: true,
}
invoked := false
fastpath := func(rpc RPC) {
// Verify the command
req := rpc.Command.(*AppendEntriesRequest)
if !reflect.DeepEqual(req, &args) {
t.Fatalf("command mismatch: %#v %#v", *req, args)
}
rpc.Respond(&resp, nil)
invoked = true
}
trans1.SetHeartbeatHandler(fastpath)
// Transport 2 makes outbound request
trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans2.Close()
var out AppendEntriesResponse
if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Verify the response
if !reflect.DeepEqual(resp, out) {
t.Fatalf("command mismatch: %#v %#v", resp, out)
}
// Ensure fast-path is used
if !invoked {
t.Fatalf("fast-path not used")
}
}
func TestNetworkTransport_AppendEntries(t *testing.T) {
// Transport 1 is consumer
trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans1.Close()
rpcCh := trans1.Consumer()
// Make the RPC request
args := AppendEntriesRequest{
Term: 10,
Leader: []byte("cartman"),
PrevLogEntry: 100,
PrevLogTerm: 4,
Entries: []*Log{
&Log{
Index: 101,
Term: 4,
Type: LogNoop,
},
},
LeaderCommitIndex: 90,
}
resp := AppendEntriesResponse{
Term: 4,
LastLog: 90,
Success: true,
}
// Listen for a request
go func() {
select {
case rpc := <-rpcCh:
// Verify the command
req := rpc.Command.(*AppendEntriesRequest)
if !reflect.DeepEqual(req, &args) {
t.Fatalf("command mismatch: %#v %#v", *req, args)
}
rpc.Respond(&resp, nil)
case <-time.After(200 * time.Millisecond):
t.Fatalf("timeout")
}
}()
// Transport 2 makes outbound request
trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans2.Close()
var out AppendEntriesResponse
if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Verify the response
if !reflect.DeepEqual(resp, out) {
t.Fatalf("command mismatch: %#v %#v", resp, out)
}
}
func TestNetworkTransport_AppendEntriesPipeline(t *testing.T) {
// Transport 1 is consumer
trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans1.Close()
rpcCh := trans1.Consumer()
// Make the RPC request
args := AppendEntriesRequest{
Term: 10,
Leader: []byte("cartman"),
PrevLogEntry: 100,
PrevLogTerm: 4,
Entries: []*Log{
&Log{
Index: 101,
Term: 4,
Type: LogNoop,
},
},
LeaderCommitIndex: 90,
}
resp := AppendEntriesResponse{
Term: 4,
LastLog: 90,
Success: true,
}
// Listen for a request
go func() {
for i := 0; i < 10; i++ {
select {
case rpc := <-rpcCh:
// Verify the command
req := rpc.Command.(*AppendEntriesRequest)
if !reflect.DeepEqual(req, &args) {
t.Fatalf("command mismatch: %#v %#v", *req, args)
}
rpc.Respond(&resp, nil)
case <-time.After(200 * time.Millisecond):
t.Fatalf("timeout")
}
}
}()
// Transport 2 makes outbound request
trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans2.Close()
pipeline, err := trans2.AppendEntriesPipeline(trans1.LocalAddr())
if err != nil {
t.Fatalf("err: %v", err)
}
defer pipeline.Close()
for i := 0; i < 10; i++ {
out := new(AppendEntriesResponse)
if _, err := pipeline.AppendEntries(&args, out); err != nil {
t.Fatalf("err: %v", err)
}
}
respCh := pipeline.Consumer()
for i := 0; i < 10; i++ {
select {
case ready := <-respCh:
// Verify the response
if !reflect.DeepEqual(&resp, ready.Response()) {
t.Fatalf("command mismatch: %#v %#v", &resp, ready.Response())
}
case <-time.After(200 * time.Millisecond):
t.Fatalf("timeout")
}
}
}
func TestNetworkTransport_RequestVote(t *testing.T) {
// Transport 1 is consumer
trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans1.Close()
rpcCh := trans1.Consumer()
// Make the RPC request
args := RequestVoteRequest{
Term: 20,
Candidate: []byte("butters"),
LastLogIndex: 100,
LastLogTerm: 19,
}
resp := RequestVoteResponse{
Term: 100,
Peers: []byte("blah"),
Granted: false,
}
// Listen for a request
go func() {
select {
case rpc := <-rpcCh:
// Verify the command
req := rpc.Command.(*RequestVoteRequest)
if !reflect.DeepEqual(req, &args) {
t.Fatalf("command mismatch: %#v %#v", *req, args)
}
rpc.Respond(&resp, nil)
case <-time.After(200 * time.Millisecond):
t.Fatalf("timeout")
}
}()
// Transport 2 makes outbound request
trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans2.Close()
var out RequestVoteResponse
if err := trans2.RequestVote(trans1.LocalAddr(), &args, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Verify the response
if !reflect.DeepEqual(resp, out) {
t.Fatalf("command mismatch: %#v %#v", resp, out)
}
}
func TestNetworkTransport_InstallSnapshot(t *testing.T) {
// Transport 1 is consumer
trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans1.Close()
rpcCh := trans1.Consumer()
// Make the RPC request
args := InstallSnapshotRequest{
Term: 10,
Leader: []byte("kyle"),
LastLogIndex: 100,
LastLogTerm: 9,
Peers: []byte("blah blah"),
Size: 10,
}
resp := InstallSnapshotResponse{
Term: 10,
Success: true,
}
// Listen for a request
go func() {
select {
case rpc := <-rpcCh:
// Verify the command
req := rpc.Command.(*InstallSnapshotRequest)
if !reflect.DeepEqual(req, &args) {
t.Fatalf("command mismatch: %#v %#v", *req, args)
}
// Try to read the bytes
buf := make([]byte, 10)
rpc.Reader.Read(buf)
// Compare
if bytes.Compare(buf, []byte("0123456789")) != 0 {
t.Fatalf("bad buf %v", buf)
}
rpc.Respond(&resp, nil)
case <-time.After(200 * time.Millisecond):
t.Fatalf("timeout")
}
}()
// Transport 2 makes outbound request
trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans2.Close()
// Create a buffer
buf := bytes.NewBuffer([]byte("0123456789"))
var out InstallSnapshotResponse
if err := trans2.InstallSnapshot(trans1.LocalAddr(), &args, &out, buf); err != nil {
t.Fatalf("err: %v", err)
}
// Verify the response
if !reflect.DeepEqual(resp, out) {
t.Fatalf("command mismatch: %#v %#v", resp, out)
}
}
func TestNetworkTransport_EncodeDecode(t *testing.T) {
// Transport 1 is consumer
trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans1.Close()
local := trans1.LocalAddr()
enc := trans1.EncodePeer(local)
dec := trans1.DecodePeer(enc)
if dec != local {
t.Fatalf("enc/dec fail: %v %v", dec, local)
}
}
func TestNetworkTransport_PooledConn(t *testing.T) {
// Transport 1 is consumer
trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans1.Close()
rpcCh := trans1.Consumer()
// Make the RPC request
args := AppendEntriesRequest{
Term: 10,
Leader: []byte("cartman"),
PrevLogEntry: 100,
PrevLogTerm: 4,
Entries: []*Log{
&Log{
Index: 101,
Term: 4,
Type: LogNoop,
},
},
LeaderCommitIndex: 90,
}
resp := AppendEntriesResponse{
Term: 4,
LastLog: 90,
Success: true,
}
// Listen for a request
go func() {
for {
select {
case rpc := <-rpcCh:
// Verify the command
req := rpc.Command.(*AppendEntriesRequest)
if !reflect.DeepEqual(req, &args) {
t.Fatalf("command mismatch: %#v %#v", *req, args)
}
rpc.Respond(&resp, nil)
case <-time.After(200 * time.Millisecond):
return
}
}
}()
// Transport 2 makes outbound request, 3 conn pool
trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 3, time.Second, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
defer trans2.Close()
// Create wait group
wg := &sync.WaitGroup{}
wg.Add(5)
appendFunc := func() {
defer wg.Done()
var out AppendEntriesResponse
if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Verify the response
if !reflect.DeepEqual(resp, out) {
t.Fatalf("command mismatch: %#v %#v", resp, out)
}
}
// Try to do parallel appends, should stress the conn pool
for i := 0; i < 5; i++ {
go appendFunc()
}
// Wait for the routines to finish
wg.Wait()
// Check the conn pool size
addr := trans1.LocalAddr()
if len(trans2.connPool[addr]) != 3 {
t.Fatalf("Expected 2 pooled conns!")
}
}

@ -1,122 +0,0 @@
package raft
import (
"sync/atomic"
)
// Observation is sent along the given channel to observers when an event occurs.
type Observation struct {
// Raft holds the Raft instance generating the observation.
Raft *Raft
// Data holds observation-specific data. Possible types are
// *RequestVoteRequest, RaftState and LeaderObservation.
Data interface{}
}
// LeaderObservation is used in Observation.Data when leadership changes.
type LeaderObservation struct {
Leader string
}
// nextObserverID is used to provide a unique ID for each observer to aid in
// deregistration.
var nextObserverID uint64
// FilterFn is a function that can be registered in order to filter observations.
// The function reports whether the observation should be included - if
// it returns false, the observation will be filtered out.
type FilterFn func(o *Observation) bool
// Observer describes what to do with a given observation.
type Observer struct {
// numObserved and numDropped are performance counters for this observer.
// 64-bit types must be 64-bit aligned to use with atomic operations on
// 32-bit platforms, so keep them at the top of the struct.
numObserved uint64
numDropped uint64
// channel receives observations.
channel chan Observation
// blocking, if true, will cause Raft to block when sending an observation
// to this observer. This should generally be set to false.
blocking bool
// filter will be called to determine if an observation should be sent to
// the channel.
filter FilterFn
// id is the ID of this observer in the Raft map.
id uint64
}
// NewObserver creates a new observer that can be registered
// to make observations on a Raft instance. Observations
// will be sent on the given channel if they satisfy the
// given filter.
//
// If blocking is true, the observer will block when it can't
// send on the channel; otherwise it may discard events.
func NewObserver(channel chan Observation, blocking bool, filter FilterFn) *Observer {
return &Observer{
channel: channel,
blocking: blocking,
filter: filter,
id: atomic.AddUint64(&nextObserverID, 1),
}
}
// GetNumObserved returns the number of observations.
func (or *Observer) GetNumObserved() uint64 {
return atomic.LoadUint64(&or.numObserved)
}
// GetNumDropped returns the number of dropped observations due to blocking.
func (or *Observer) GetNumDropped() uint64 {
return atomic.LoadUint64(&or.numDropped)
}
// RegisterObserver registers a new observer.
func (r *Raft) RegisterObserver(or *Observer) {
r.observersLock.Lock()
defer r.observersLock.Unlock()
r.observers[or.id] = or
}
// DeregisterObserver deregisters an observer.
func (r *Raft) DeregisterObserver(or *Observer) {
r.observersLock.Lock()
defer r.observersLock.Unlock()
delete(r.observers, or.id)
}
// observe sends an observation to every observer.
func (r *Raft) observe(o interface{}) {
// In general observers should not block. But in any case this isn't
// disastrous as we only hold a read lock, which merely prevents
// registration / deregistration of observers.
r.observersLock.RLock()
defer r.observersLock.RUnlock()
for _, or := range r.observers {
// It's wasteful to do this in the loop, but for the common case
// where there are no observers we won't create any objects.
ob := Observation{Raft: r, Data: o}
if or.filter != nil && !or.filter(&ob) {
continue
}
if or.channel == nil {
continue
}
if or.blocking {
or.channel <- ob
atomic.AddUint64(&or.numObserved, 1)
} else {
select {
case or.channel <- ob:
atomic.AddUint64(&or.numObserved, 1)
default:
atomic.AddUint64(&or.numDropped, 1)
}
}
}
}
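
The observer API above is small; the following is a minimal sketch of watching for leadership changes with it, assuming an existing `*raft.Raft` instance and the pre-v1 `github.com/hashicorp/raft` import path.

```go
package main

import (
	"log"

	"github.com/hashicorp/raft"
)

// watchLeader registers a non-blocking observer whose filter passes only
// LeaderObservation events, then logs each new leader address.
func watchLeader(r *raft.Raft) {
	ch := make(chan raft.Observation, 16)
	filter := func(o *raft.Observation) bool {
		_, ok := o.Data.(raft.LeaderObservation)
		return ok
	}
	// blocking=false: Raft drops events rather than stall on a slow consumer.
	obs := raft.NewObserver(ch, false, filter)
	r.RegisterObserver(obs)

	go func() {
		for o := range ch {
			lo := o.Data.(raft.LeaderObservation)
			log.Printf("leader changed to %q", lo.Leader)
		}
	}()
}

// Wiring up a full Raft node is out of scope for this sketch.
func main() {}
```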

@ -1,122 +0,0 @@
package raft
import (
"bytes"
"encoding/json"
"io/ioutil"
"os"
"path/filepath"
"sync"
)
const (
jsonPeerPath = "peers.json"
)
// PeerStore provides an interface for persistent storage and
// retrieval of peers. We use an interface separate from StableStore
// since the peers may need to be edited by a human operator. For example,
// in a two-node cluster, the failure of either node requires human intervention
// since consensus is impossible.
type PeerStore interface {
// Peers returns the list of known peers.
Peers() ([]string, error)
// SetPeers sets the list of known peers. This is invoked when a peer is
// added or removed.
SetPeers([]string) error
}
// StaticPeers is used to provide a static list of peers.
type StaticPeers struct {
StaticPeers []string
l sync.Mutex
}
// Peers implements the PeerStore interface.
func (s *StaticPeers) Peers() ([]string, error) {
s.l.Lock()
peers := s.StaticPeers
s.l.Unlock()
return peers, nil
}
// SetPeers implements the PeerStore interface.
func (s *StaticPeers) SetPeers(p []string) error {
s.l.Lock()
s.StaticPeers = p
s.l.Unlock()
return nil
}
// JSONPeers is used to provide peer persistence on disk in the form
// of a JSON file. This allows human operators to manipulate the file.
type JSONPeers struct {
l sync.Mutex
path string
trans Transport
}
// NewJSONPeers creates a new JSONPeers store. Requires a transport
// to handle the serialization of network addresses.
func NewJSONPeers(base string, trans Transport) *JSONPeers {
path := filepath.Join(base, jsonPeerPath)
store := &JSONPeers{
path: path,
trans: trans,
}
return store
}
// Peers implements the PeerStore interface.
func (j *JSONPeers) Peers() ([]string, error) {
j.l.Lock()
defer j.l.Unlock()
// Read the file
buf, err := ioutil.ReadFile(j.path)
if err != nil && !os.IsNotExist(err) {
return nil, err
}
// Check for no peers
if len(buf) == 0 {
return nil, nil
}
// Decode the peers
var peerSet []string
dec := json.NewDecoder(bytes.NewReader(buf))
if err := dec.Decode(&peerSet); err != nil {
return nil, err
}
// Deserialize each peer
var peers []string
for _, p := range peerSet {
peers = append(peers, j.trans.DecodePeer([]byte(p)))
}
return peers, nil
}
// SetPeers implements the PeerStore interface.
func (j *JSONPeers) SetPeers(peers []string) error {
j.l.Lock()
defer j.l.Unlock()
// Encode each peer
var peerSet []string
for _, p := range peers {
peerSet = append(peerSet, string(j.trans.EncodePeer(p)))
}
// Convert to JSON
var buf bytes.Buffer
enc := json.NewEncoder(&buf)
if err := enc.Encode(peerSet); err != nil {
return err
}
// Write out as JSON
return ioutil.WriteFile(j.path, buf.Bytes(), 0755)
}
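
The PeerStore interface went away in Raft v1, where cluster membership is tracked in the log itself, which is why this file is deleted. For reference, a minimal sketch of how the JSON store was used; the directory and peer addresses here are assumptions for illustration.

```go
package main

import (
	"log"
	"os"

	"github.com/hashicorp/raft"
)

func main() {
	// JSONPeers needs a transport only to encode/decode peer addresses.
	_, trans := raft.NewInmemTransport("")
	store := raft.NewJSONPeers(os.TempDir(), trans) // writes <dir>/peers.json

	if err := store.SetPeers([]string{raft.NewInmemAddr()}); err != nil {
		log.Fatalf("failed to write peers.json: %v", err)
	}
	peers, err := store.Peers()
	if err != nil {
		log.Fatalf("failed to read peers.json: %v", err)
	}
	log.Printf("peers: %v", peers)
}
```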

@ -1,44 +0,0 @@
package raft
import (
"io/ioutil"
"os"
"testing"
)
func TestJSONPeers(t *testing.T) {
// Create a test dir
dir, err := ioutil.TempDir("", "raft")
if err != nil {
t.Fatalf("err: %v ", err)
}
defer os.RemoveAll(dir)
// Create the store
_, trans := NewInmemTransport("")
store := NewJSONPeers(dir, trans)
// Try a read, should get nothing
peers, err := store.Peers()
if err != nil {
t.Fatalf("err: %v", err)
}
if len(peers) != 0 {
t.Fatalf("peers: %v", peers)
}
// Initialize some peers
newPeers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()}
if err := store.SetPeers(newPeers); err != nil {
t.Fatalf("err: %v", err)
}
// Try a read, should get the peers back
peers, err = store.Peers()
if err != nil {
t.Fatalf("err: %v", err)
}
if len(peers) != 3 {
t.Fatalf("peers: %v", peers)
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -1,522 +0,0 @@
package raft
import (
"errors"
"fmt"
"sync"
"time"
"github.com/armon/go-metrics"
)
const (
maxFailureScale = 12
failureWait = 10 * time.Millisecond
)
var (
// ErrLogNotFound indicates a given log entry is not available.
ErrLogNotFound = errors.New("log not found")
// ErrPipelineReplicationNotSupported can be returned by the transport to
// signal that pipeline replication is not supported in general, and that
// no error message should be produced.
ErrPipelineReplicationNotSupported = errors.New("pipeline replication not supported")
)
type followerReplication struct {
peer string
inflight *inflight
stopCh chan uint64
triggerCh chan struct{}
currentTerm uint64
matchIndex uint64
nextIndex uint64
lastContact time.Time
lastContactLock sync.RWMutex
failures uint64
notifyCh chan struct{}
notify []*verifyFuture
notifyLock sync.Mutex
// stepDown is used to indicate to the leader that we
// should step down based on information from a follower.
stepDown chan struct{}
// allowPipeline is used to control whether
// pipeline replication should be enabled.
allowPipeline bool
}
// notifyAll is used to notify all the waiting verify futures
// if the follower believes we are still the leader.
func (s *followerReplication) notifyAll(leader bool) {
// Clear the waiting notifies minimizing lock time
s.notifyLock.Lock()
n := s.notify
s.notify = nil
s.notifyLock.Unlock()
// Submit our votes
for _, v := range n {
v.vote(leader)
}
}
// LastContact returns the time of last contact.
func (s *followerReplication) LastContact() time.Time {
s.lastContactLock.RLock()
last := s.lastContact
s.lastContactLock.RUnlock()
return last
}
// setLastContact sets the last contact to the current time.
func (s *followerReplication) setLastContact() {
s.lastContactLock.Lock()
s.lastContact = time.Now()
s.lastContactLock.Unlock()
}
// replicate is a long running routine that is used to manage
// the process of replicating logs to our followers.
func (r *Raft) replicate(s *followerReplication) {
// Start an async heartbeating routine
stopHeartbeat := make(chan struct{})
defer close(stopHeartbeat)
r.goFunc(func() { r.heartbeat(s, stopHeartbeat) })
RPC:
shouldStop := false
for !shouldStop {
select {
case maxIndex := <-s.stopCh:
// Make a best effort to replicate up to this index
if maxIndex > 0 {
r.replicateTo(s, maxIndex)
}
return
case <-s.triggerCh:
lastLogIdx, _ := r.getLastLog()
shouldStop = r.replicateTo(s, lastLogIdx)
case <-randomTimeout(r.conf.CommitTimeout):
lastLogIdx, _ := r.getLastLog()
shouldStop = r.replicateTo(s, lastLogIdx)
}
// If things look healthy, switch to pipeline mode
if !shouldStop && s.allowPipeline {
goto PIPELINE
}
}
return
PIPELINE:
// Disable until re-enabled
s.allowPipeline = false
// Replicates using a pipeline for high performance. This method
// is not able to gracefully recover from errors, and so we fall back
// to standard mode on failure.
if err := r.pipelineReplicate(s); err != nil {
if err != ErrPipelineReplicationNotSupported {
r.logger.Printf("[ERR] raft: Failed to start pipeline replication to %s: %s", s.peer, err)
}
}
goto RPC
}
// replicateTo is used to replicate the logs up to a given last index.
// If the follower log is behind, we take care to bring them up to date.
func (r *Raft) replicateTo(s *followerReplication, lastIndex uint64) (shouldStop bool) {
// Create the base request
var req AppendEntriesRequest
var resp AppendEntriesResponse
var start time.Time
START:
// Prevent an excessive retry rate on errors
if s.failures > 0 {
select {
case <-time.After(backoff(failureWait, s.failures, maxFailureScale)):
case <-r.shutdownCh:
}
}
// Setup the request
if err := r.setupAppendEntries(s, &req, s.nextIndex, lastIndex); err == ErrLogNotFound {
goto SEND_SNAP
} else if err != nil {
return
}
// Make the RPC call
start = time.Now()
if err := r.trans.AppendEntries(s.peer, &req, &resp); err != nil {
r.logger.Printf("[ERR] raft: Failed to AppendEntries to %v: %v", s.peer, err)
s.failures++
return
}
appendStats(s.peer, start, float32(len(req.Entries)))
// Check for a newer term, stop running
if resp.Term > req.Term {
r.handleStaleTerm(s)
return true
}
// Update the last contact
s.setLastContact()
// Update s based on success
if resp.Success {
// Update our replication state
updateLastAppended(s, &req)
// Clear any failures, allow pipelining
s.failures = 0
s.allowPipeline = true
} else {
s.nextIndex = max(min(s.nextIndex-1, resp.LastLog+1), 1)
s.matchIndex = s.nextIndex - 1
if resp.NoRetryBackoff {
s.failures = 0
} else {
s.failures++
}
r.logger.Printf("[WARN] raft: AppendEntries to %v rejected, sending older logs (next: %d)", s.peer, s.nextIndex)
}
CHECK_MORE:
// Check if there are more logs to replicate
if s.nextIndex <= lastIndex {
goto START
}
return
// SEND_SNAP is used when we fail to get a log, usually because the follower
// is too far behind, and we must ship a snapshot down instead
SEND_SNAP:
if stop, err := r.sendLatestSnapshot(s); stop {
return true
} else if err != nil {
r.logger.Printf("[ERR] raft: Failed to send snapshot to %v: %v", s.peer, err)
return
}
// Check if there is more to replicate
goto CHECK_MORE
}
// sendLatestSnapshot is used to send the latest snapshot we have
// down to our follower.
func (r *Raft) sendLatestSnapshot(s *followerReplication) (bool, error) {
// Get the snapshots
snapshots, err := r.snapshots.List()
if err != nil {
r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err)
return false, err
}
// Check we have at least a single snapshot
if len(snapshots) == 0 {
return false, fmt.Errorf("no snapshots found")
}
// Open the most recent snapshot
snapID := snapshots[0].ID
meta, snapshot, err := r.snapshots.Open(snapID)
if err != nil {
r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapID, err)
return false, err
}
defer snapshot.Close()
// Setup the request
req := InstallSnapshotRequest{
Term: s.currentTerm,
Leader: r.trans.EncodePeer(r.localAddr),
LastLogIndex: meta.Index,
LastLogTerm: meta.Term,
Peers: meta.Peers,
Size: meta.Size,
}
// Make the call
start := time.Now()
var resp InstallSnapshotResponse
if err := r.trans.InstallSnapshot(s.peer, &req, &resp, snapshot); err != nil {
r.logger.Printf("[ERR] raft: Failed to install snapshot %v: %v", snapID, err)
s.failures++
return false, err
}
metrics.MeasureSince([]string{"raft", "replication", "installSnapshot", s.peer}, start)
// Check for a newer term, stop running
if resp.Term > req.Term {
r.handleStaleTerm(s)
return true, nil
}
// Update the last contact
s.setLastContact()
// Check for success
if resp.Success {
// Mark any inflight logs as committed
s.inflight.CommitRange(s.matchIndex+1, meta.Index)
// Update the indexes
s.matchIndex = meta.Index
s.nextIndex = s.matchIndex + 1
// Clear any failures
s.failures = 0
// Notify we are still leader
s.notifyAll(true)
} else {
s.failures++
r.logger.Printf("[WARN] raft: InstallSnapshot to %v rejected", s.peer)
}
return false, nil
}
// heartbeat is used to periodically invoke AppendEntries on a peer
// to ensure they don't time out. This is done asynchronously from replicate(),
// since that routine could potentially be blocked on disk IO.
func (r *Raft) heartbeat(s *followerReplication, stopCh chan struct{}) {
var failures uint64
req := AppendEntriesRequest{
Term: s.currentTerm,
Leader: r.trans.EncodePeer(r.localAddr),
}
var resp AppendEntriesResponse
for {
// Wait for the next heartbeat interval or forced notify
select {
case <-s.notifyCh:
case <-randomTimeout(r.conf.HeartbeatTimeout / 10):
case <-stopCh:
return
}
start := time.Now()
if err := r.trans.AppendEntries(s.peer, &req, &resp); err != nil {
r.logger.Printf("[ERR] raft: Failed to heartbeat to %v: %v", s.peer, err)
failures++
select {
case <-time.After(backoff(failureWait, failures, maxFailureScale)):
case <-stopCh:
}
} else {
s.setLastContact()
failures = 0
metrics.MeasureSince([]string{"raft", "replication", "heartbeat", s.peer}, start)
s.notifyAll(resp.Success)
}
}
}
// pipelineReplicate is used when we have synchronized our state with the follower,
// and want to switch to a higher performance pipeline mode of replication.
// We only pipeline AppendEntries commands, and if we ever hit an error, we fall
// back to the standard replication which can handle more complex situations.
func (r *Raft) pipelineReplicate(s *followerReplication) error {
// Create a new pipeline
pipeline, err := r.trans.AppendEntriesPipeline(s.peer)
if err != nil {
return err
}
defer pipeline.Close()
// Log start and stop of pipeline
r.logger.Printf("[INFO] raft: pipelining replication to peer %v", s.peer)
defer r.logger.Printf("[INFO] raft: aborting pipeline replication to peer %v", s.peer)
// Create a shutdown and finish channel
stopCh := make(chan struct{})
finishCh := make(chan struct{})
// Start a dedicated decoder
r.goFunc(func() { r.pipelineDecode(s, pipeline, stopCh, finishCh) })
// Start pipeline sends at the last good nextIndex
nextIndex := s.nextIndex
shouldStop := false
SEND:
for !shouldStop {
select {
case <-finishCh:
break SEND
case maxIndex := <-s.stopCh:
if maxIndex > 0 {
r.pipelineSend(s, pipeline, &nextIndex, maxIndex)
}
break SEND
case <-s.triggerCh:
lastLogIdx, _ := r.getLastLog()
shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx)
case <-randomTimeout(r.conf.CommitTimeout):
lastLogIdx, _ := r.getLastLog()
shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx)
}
}
// Stop our decoder, and wait for it to finish
close(stopCh)
select {
case <-finishCh:
case <-r.shutdownCh:
}
return nil
}
// pipelineSend is used to send data over a pipeline.
func (r *Raft) pipelineSend(s *followerReplication, p AppendPipeline, nextIdx *uint64, lastIndex uint64) (shouldStop bool) {
// Create a new append request
req := new(AppendEntriesRequest)
if err := r.setupAppendEntries(s, req, *nextIdx, lastIndex); err != nil {
return true
}
// Pipeline the append entries
if _, err := p.AppendEntries(req, new(AppendEntriesResponse)); err != nil {
r.logger.Printf("[ERR] raft: Failed to pipeline AppendEntries to %v: %v", s.peer, err)
return true
}
// Increase the next send log to avoid re-sending old logs
if n := len(req.Entries); n > 0 {
last := req.Entries[n-1]
*nextIdx = last.Index + 1
}
return false
}
// pipelineDecode is used to decode the responses of pipelined requests.
func (r *Raft) pipelineDecode(s *followerReplication, p AppendPipeline, stopCh, finishCh chan struct{}) {
defer close(finishCh)
respCh := p.Consumer()
for {
select {
case ready := <-respCh:
req, resp := ready.Request(), ready.Response()
appendStats(s.peer, ready.Start(), float32(len(req.Entries)))
// Check for a newer term, stop running
if resp.Term > req.Term {
r.handleStaleTerm(s)
return
}
// Update the last contact
s.setLastContact()
// Abort pipeline if not successful
if !resp.Success {
return
}
// Update our replication state
updateLastAppended(s, req)
case <-stopCh:
return
}
}
}
// setupAppendEntries is used to setup an append entries request.
func (r *Raft) setupAppendEntries(s *followerReplication, req *AppendEntriesRequest, nextIndex, lastIndex uint64) error {
req.Term = s.currentTerm
req.Leader = r.trans.EncodePeer(r.localAddr)
req.LeaderCommitIndex = r.getCommitIndex()
if err := r.setPreviousLog(req, nextIndex); err != nil {
return err
}
if err := r.setNewLogs(req, nextIndex, lastIndex); err != nil {
return err
}
return nil
}
// setPreviousLog is used to setup the PrevLogEntry and PrevLogTerm for an
// AppendEntriesRequest given the next index to replicate.
func (r *Raft) setPreviousLog(req *AppendEntriesRequest, nextIndex uint64) error {
// Guard for the first index, since there is no 0 log entry
// Guard against the previous index being a snapshot as well
lastSnapIdx, lastSnapTerm := r.getLastSnapshot()
if nextIndex == 1 {
req.PrevLogEntry = 0
req.PrevLogTerm = 0
} else if (nextIndex - 1) == lastSnapIdx {
req.PrevLogEntry = lastSnapIdx
req.PrevLogTerm = lastSnapTerm
} else {
var l Log
if err := r.logs.GetLog(nextIndex-1, &l); err != nil {
r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v",
nextIndex-1, err)
return err
}
// Set the previous index and term (0 if nextIndex is 1)
req.PrevLogEntry = l.Index
req.PrevLogTerm = l.Term
}
return nil
}
// setNewLogs is used to setup the logs which should be appended for a request.
func (r *Raft) setNewLogs(req *AppendEntriesRequest, nextIndex, lastIndex uint64) error {
// Append up to MaxAppendEntries or up to the lastIndex
req.Entries = make([]*Log, 0, r.conf.MaxAppendEntries)
maxIndex := min(nextIndex+uint64(r.conf.MaxAppendEntries)-1, lastIndex)
for i := nextIndex; i <= maxIndex; i++ {
oldLog := new(Log)
if err := r.logs.GetLog(i, oldLog); err != nil {
r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", i, err)
return err
}
req.Entries = append(req.Entries, oldLog)
}
return nil
}
// appendStats is used to emit stats about an AppendEntries invocation.
func appendStats(peer string, start time.Time, logs float32) {
metrics.MeasureSince([]string{"raft", "replication", "appendEntries", "rpc", peer}, start)
metrics.IncrCounter([]string{"raft", "replication", "appendEntries", "logs", peer}, logs)
}
// handleStaleTerm is used when a follower indicates that we have a stale term.
func (r *Raft) handleStaleTerm(s *followerReplication) {
r.logger.Printf("[ERR] raft: peer %v has newer term, stopping replication", s.peer)
s.notifyAll(false) // No longer leader
asyncNotifyCh(s.stepDown)
}
// updateLastAppended is used to update follower replication state after a successful
// AppendEntries RPC.
func updateLastAppended(s *followerReplication, req *AppendEntriesRequest) {
// Mark any inflight logs as committed
if logs := req.Entries; len(logs) > 0 {
first := logs[0]
last := logs[len(logs)-1]
s.inflight.CommitRange(first.Index, last.Index)
// Update the indexes
s.matchIndex = last.Index
s.nextIndex = last.Index + 1
}
// Notify still leader
s.notifyAll(true)
}
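
Both replicateTo and heartbeat above back off via backoff(failureWait, failures, maxFailureScale); the helper itself lives elsewhere in the package. The sketch below reproduces the behavior those call sites imply: the 10ms base doubles per failure round and stops growing after maxFailureScale (12) rounds, capping waits at roughly 10s. It is a sketch of an unexported helper, not the package's own code.

```go
package main

import (
	"fmt"
	"time"
)

// backoffSketch scales the base duration by the failure round,
// up to a maximum scale factor.
func backoffSketch(base time.Duration, round, limit uint64) time.Duration {
	power := round
	if power > limit {
		power = limit
	}
	for power > 2 {
		base *= 2
		power--
	}
	return base
}

func main() {
	for failures := uint64(1); failures <= 13; failures++ {
		fmt.Printf("failure %2d -> wait %v\n",
			failures, backoffSketch(10*time.Millisecond, failures, 12))
	}
	// Failures 1-2 wait 10ms; each further failure doubles the wait,
	// capping at 10.24s from failure 12 onward.
}
```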

@ -1,40 +0,0 @@
package raft
import (
"io"
)
// SnapshotMeta is for metadata of a snapshot.
type SnapshotMeta struct {
ID string // ID is opaque to the store, and is used for opening
Index uint64
Term uint64
Peers []byte
Size int64
}
// SnapshotStore interface is used to allow for flexible implementations
// of snapshot storage and retrieval. For example, a client could implement
// a shared state store such as S3, allowing new nodes to restore snapshots
// without streaming from the leader.
type SnapshotStore interface {
// Create is used to begin a snapshot at a given index and term,
// with the current peer set already encoded.
Create(index, term uint64, peers []byte) (SnapshotSink, error)
// List is used to list the available snapshots in the store.
// It should return them in descending order, with the highest index first.
List() ([]*SnapshotMeta, error)
// Open takes a snapshot ID and provides a ReadCloser. Once close is
// called it is assumed the snapshot is no longer needed.
Open(id string) (*SnapshotMeta, io.ReadCloser, error)
}
// SnapshotSink is returned by StartSnapshot. The FSM will Write state
// to the sink and call Close on completion. On error, Cancel will be invoked.
type SnapshotSink interface {
io.WriteCloser
ID() string
Cancel() error
}
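
SnapshotStore and SnapshotSink are deliberately small interfaces. A minimal sketch of an implementation that throws all snapshot data away, useful for throwaway test clusters (the library ships a similar DiscardSnapshotStore), assuming the pre-v1 import path:

```go
package main

import (
	"fmt"
	"io"

	"github.com/hashicorp/raft"
)

// discardSnapshots satisfies SnapshotStore but keeps nothing: writes are
// swallowed, List reports no snapshots, and Open always fails.
type discardSnapshots struct{}

type discardSink struct{}

func (discardSnapshots) Create(index, term uint64, peers []byte) (raft.SnapshotSink, error) {
	return discardSink{}, nil
}

func (discardSnapshots) List() ([]*raft.SnapshotMeta, error) { return nil, nil }

func (discardSnapshots) Open(id string) (*raft.SnapshotMeta, io.ReadCloser, error) {
	return nil, nil, fmt.Errorf("snapshot data was discarded, nothing to open")
}

func (discardSink) Write(p []byte) (int, error) { return len(p), nil }
func (discardSink) Close() error                { return nil }
func (discardSink) ID() string                  { return "discard" }
func (discardSink) Cancel() error               { return nil }

func main() {
	var store raft.SnapshotStore = discardSnapshots{}
	sink, _ := store.Create(1, 1, nil)
	sink.Write([]byte("fsm state"))
	sink.Close()
}
```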

@ -1,15 +0,0 @@
package raft
// StableStore is used to provide stable storage
// of key configurations to ensure safety.
type StableStore interface {
Set(key []byte, val []byte) error
// Get returns the value for key, or an empty byte slice if key was not found.
Get(key []byte) ([]byte, error)
SetUint64(key []byte, val uint64) error
// GetUint64 returns the uint64 value for key, or 0 if key was not found.
GetUint64(key []byte) (uint64, error)
}
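
StableStore is the smallest of the storage interfaces. A minimal in-memory sketch, suitable only for tests (in practice the library's InmemStore plays this role); the "CurrentTerm" key is just an illustrative example:

```go
package main

import (
	"fmt"
	"sync"
)

// inmemStable satisfies StableStore with two maps guarded by a mutex.
// Missing keys yield a nil slice / zero value, matching the interface docs.
type inmemStable struct {
	mu  sync.Mutex
	kv  map[string][]byte
	u64 map[string]uint64
}

func newInmemStable() *inmemStable {
	return &inmemStable{kv: make(map[string][]byte), u64: make(map[string]uint64)}
}

func (s *inmemStable) Set(key, val []byte) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.kv[string(key)] = val
	return nil
}

func (s *inmemStable) Get(key []byte) ([]byte, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.kv[string(key)], nil
}

func (s *inmemStable) SetUint64(key []byte, val uint64) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.u64[string(key)] = val
	return nil
}

func (s *inmemStable) GetUint64(key []byte) (uint64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.u64[string(key)], nil
}

func main() {
	s := newInmemStable()
	s.SetUint64([]byte("CurrentTerm"), 5)
	term, _ := s.GetUint64([]byte("CurrentTerm"))
	fmt.Println("term:", term) // term: 5
}
```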

@ -1,171 +0,0 @@
package raft
import (
"sync"
"sync/atomic"
)
// RaftState captures the state of a Raft node: Follower, Candidate, Leader,
// or Shutdown.
type RaftState uint32
const (
// Follower is the initial state of a Raft node.
Follower RaftState = iota
// Candidate is one of the valid states of a Raft node.
Candidate
// Leader is one of the valid states of a Raft node.
Leader
// Shutdown is the terminal state of a Raft node.
Shutdown
)
func (s RaftState) String() string {
switch s {
case Follower:
return "Follower"
case Candidate:
return "Candidate"
case Leader:
return "Leader"
case Shutdown:
return "Shutdown"
default:
return "Unknown"
}
}
// raftState is used to maintain various state variables
// and provides an interface to set/get the variables in a
// thread safe manner.
type raftState struct {
// currentTerm, commitIndex and lastApplied must be kept at the top of
// the struct so they're 64-bit aligned, which is a requirement for
// atomic ops on 32-bit platforms.
// The current term, cache of StableStore
currentTerm uint64
// Highest committed log entry
commitIndex uint64
// Last applied log to the FSM
lastApplied uint64
// protects 4 next fields
lastLock sync.Mutex
// Cache the latest snapshot index/term
lastSnapshotIndex uint64
lastSnapshotTerm uint64
// Cache the latest log from LogStore
lastLogIndex uint64
lastLogTerm uint64
// Tracks running goroutines
routinesGroup sync.WaitGroup
// The current state
state RaftState
}
func (r *raftState) getState() RaftState {
stateAddr := (*uint32)(&r.state)
return RaftState(atomic.LoadUint32(stateAddr))
}
func (r *raftState) setState(s RaftState) {
stateAddr := (*uint32)(&r.state)
atomic.StoreUint32(stateAddr, uint32(s))
}
func (r *raftState) getCurrentTerm() uint64 {
return atomic.LoadUint64(&r.currentTerm)
}
func (r *raftState) setCurrentTerm(term uint64) {
atomic.StoreUint64(&r.currentTerm, term)
}
func (r *raftState) getLastLog() (index, term uint64) {
r.lastLock.Lock()
index = r.lastLogIndex
term = r.lastLogTerm
r.lastLock.Unlock()
return
}
func (r *raftState) setLastLog(index, term uint64) {
r.lastLock.Lock()
r.lastLogIndex = index
r.lastLogTerm = term
r.lastLock.Unlock()
}
func (r *raftState) getLastSnapshot() (index, term uint64) {
r.lastLock.Lock()
index = r.lastSnapshotIndex
term = r.lastSnapshotTerm
r.lastLock.Unlock()
return
}
func (r *raftState) setLastSnapshot(index, term uint64) {
r.lastLock.Lock()
r.lastSnapshotIndex = index
r.lastSnapshotTerm = term
r.lastLock.Unlock()
}
func (r *raftState) getCommitIndex() uint64 {
return atomic.LoadUint64(&r.commitIndex)
}
func (r *raftState) setCommitIndex(index uint64) {
atomic.StoreUint64(&r.commitIndex, index)
}
func (r *raftState) getLastApplied() uint64 {
return atomic.LoadUint64(&r.lastApplied)
}
func (r *raftState) setLastApplied(index uint64) {
atomic.StoreUint64(&r.lastApplied, index)
}
// Start a goroutine and properly handle the race between a routine
// starting and incrementing, and exiting and decrementing.
func (r *raftState) goFunc(f func()) {
r.routinesGroup.Add(1)
go func() {
defer r.routinesGroup.Done()
f()
}()
}
func (r *raftState) waitShutdown() {
r.routinesGroup.Wait()
}
// getLastIndex returns the last index in stable storage.
// Either from the last log or from the last snapshot.
func (r *raftState) getLastIndex() uint64 {
r.lastLock.Lock()
defer r.lastLock.Unlock()
return max(r.lastLogIndex, r.lastSnapshotIndex)
}
// getLastEntry returns the last index and term in stable storage.
// Either from the last log or from the last snapshot.
func (r *raftState) getLastEntry() (uint64, uint64) {
r.lastLock.Lock()
defer r.lastLock.Unlock()
if r.lastLogIndex >= r.lastSnapshotIndex {
return r.lastLogIndex, r.lastLogTerm
}
return r.lastSnapshotIndex, r.lastSnapshotTerm
}
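
The goFunc/waitShutdown pair above encodes a common pattern: Add is called on the WaitGroup before the goroutine is spawned, so a routine that exits immediately can never race Wait. A standalone sketch of the same pattern:

```go
package main

import (
	"fmt"
	"sync"
)

type group struct{ wg sync.WaitGroup }

// goFunc increments the WaitGroup *before* spawning, closing the window in
// which Wait could return while a routine is still starting up.
func (g *group) goFunc(f func()) {
	g.wg.Add(1)
	go func() {
		defer g.wg.Done()
		f()
	}()
}

func main() {
	var g group
	for i := 0; i < 3; i++ {
		i := i // capture per-iteration value
		g.goFunc(func() { fmt.Println("routine", i) })
	}
	g.wg.Wait()
	fmt.Println("all routines finished")
}
```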

@ -1,16 +0,0 @@
#!/usr/bin/env bash
set -e
# The version must be supplied from the environment. Do not include the
# leading "v".
if [ -z "$VERSION" ]; then
echo "Please specify a version."
exit 1
fi
# Generate the tag.
echo "==> Tagging version $VERSION..."
git commit --allow-empty -a --gpg-sign=348FFC4C -m "Release v$VERSION"
git tag -a -m "Version $VERSION" -s -u 348FFC4C "v${VERSION}" master
exit 0

@ -1,105 +0,0 @@
package raft
import (
"errors"
"io"
"log"
"net"
"time"
)
var (
errNotAdvertisable = errors.New("local bind address is not advertisable")
errNotTCP = errors.New("local address is not a TCP address")
)
// TCPStreamLayer implements StreamLayer interface for plain TCP.
type TCPStreamLayer struct {
advertise net.Addr
listener *net.TCPListener
}
// NewTCPTransport returns a NetworkTransport that is built on top of
// a TCP streaming transport layer.
func NewTCPTransport(
bindAddr string,
advertise net.Addr,
maxPool int,
timeout time.Duration,
logOutput io.Writer,
) (*NetworkTransport, error) {
return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport {
return NewNetworkTransport(stream, maxPool, timeout, logOutput)
})
}
// NewTCPTransportWithLogger returns a NetworkTransport that is built on top of
// a TCP streaming transport layer, with log output going to the supplied Logger
func NewTCPTransportWithLogger(
bindAddr string,
advertise net.Addr,
maxPool int,
timeout time.Duration,
logger *log.Logger,
) (*NetworkTransport, error) {
return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport {
return NewNetworkTransportWithLogger(stream, maxPool, timeout, logger)
})
}
func newTCPTransport(bindAddr string,
advertise net.Addr,
maxPool int,
timeout time.Duration,
transportCreator func(stream StreamLayer) *NetworkTransport) (*NetworkTransport, error) {
// Try to bind
list, err := net.Listen("tcp", bindAddr)
if err != nil {
return nil, err
}
// Create stream
stream := &TCPStreamLayer{
advertise: advertise,
listener: list.(*net.TCPListener),
}
// Verify that we have a usable advertise address
addr, ok := stream.Addr().(*net.TCPAddr)
if !ok {
list.Close()
return nil, errNotTCP
}
if addr.IP.IsUnspecified() {
list.Close()
return nil, errNotAdvertisable
}
// Create the network transport
trans := transportCreator(stream)
return trans, nil
}
// Dial implements the StreamLayer interface.
func (t *TCPStreamLayer) Dial(address string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("tcp", address, timeout)
}
// Accept implements the net.Listener interface.
func (t *TCPStreamLayer) Accept() (c net.Conn, err error) {
return t.listener.Accept()
}
// Close implements the net.Listener interface.
func (t *TCPStreamLayer) Close() (err error) {
return t.listener.Close()
}
// Addr implements the net.Listener interface.
func (t *TCPStreamLayer) Addr() net.Addr {
// Use an advertise addr if provided
if t.advertise != nil {
return t.advertise
}
return t.listener.Addr()
}
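
The advertise checks above mean a transport bound to 0.0.0.0 must be given an explicit, routable advertise address, or construction fails with errNotAdvertisable. A minimal sketch, where the LAN IP is an assumed example:

```go
package main

import (
	"log"
	"net"
	"os"
	"time"

	"github.com/hashicorp/raft"
)

func main() {
	// 192.168.1.10 is an assumed address other nodes can actually reach.
	advertise := &net.TCPAddr{IP: net.ParseIP("192.168.1.10"), Port: 4002}

	// Bind to all interfaces, advertise the routable address; pool up to 3
	// connections per peer and apply 10s I/O deadlines.
	trans, err := raft.NewTCPTransport("0.0.0.0:4002", advertise, 3, 10*time.Second, os.Stderr)
	if err != nil {
		log.Fatalf("transport: %v", err)
	}
	defer trans.Close()

	// LocalAddr reports the advertise address, not the bind address.
	log.Printf("raft transport advertising %s", trans.LocalAddr())
}
```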

@ -1,24 +0,0 @@
package raft
import (
"net"
"testing"
)
func TestTCPTransport_BadAddr(t *testing.T) {
_, err := NewTCPTransportWithLogger("0.0.0.0:0", nil, 1, 0, newTestLogger(t))
if err != errNotAdvertisable {
t.Fatalf("err: %v", err)
}
}
func TestTCPTransport_WithAdvertise(t *testing.T) {
addr := &net.TCPAddr{IP: []byte{127, 0, 0, 1}, Port: 12345}
trans, err := NewTCPTransportWithLogger("0.0.0.0:0", addr, 1, 0, newTestLogger(t))
if err != nil {
t.Fatalf("err: %v", err)
}
if trans.LocalAddr() != "127.0.0.1:12345" {
t.Fatalf("bad: %v", trans.LocalAddr())
}
}

@ -1,124 +0,0 @@
package raft
import (
"io"
"time"
)
// RPCResponse captures both a response and a potential error.
type RPCResponse struct {
Response interface{}
Error error
}
// RPC has a command, and provides a response mechanism.
type RPC struct {
Command interface{}
Reader io.Reader // Set only for InstallSnapshot
RespChan chan<- RPCResponse
}
// Respond is used to respond with a response, error or both
func (r *RPC) Respond(resp interface{}, err error) {
r.RespChan <- RPCResponse{resp, err}
}
// Transport provides an interface for network transports
// to allow Raft to communicate with other nodes.
type Transport interface {
// Consumer returns a channel that can be used to
// consume and respond to RPC requests.
Consumer() <-chan RPC
// LocalAddr is used to return our local address to distinguish from our peers.
LocalAddr() string
// AppendEntriesPipeline returns an interface that can be used to pipeline
// AppendEntries requests.
AppendEntriesPipeline(target string) (AppendPipeline, error)
// AppendEntries sends the appropriate RPC to the target node.
AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error
// RequestVote sends the appropriate RPC to the target node.
RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error
// InstallSnapshot is used to push a snapshot down to a follower. The data is read from
// the ReadCloser and streamed to the client.
InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error
// EncodePeer is used to serialize a peer name.
EncodePeer(string) []byte
// DecodePeer is used to deserialize a peer name.
DecodePeer([]byte) string
// SetHeartbeatHandler is used to set up a heartbeat handler
// as a fast path. This is to avoid head-of-line blocking from
// disk IO. If a Transport does not support this, it can simply
// ignore the call, and push the heartbeat onto the Consumer channel.
SetHeartbeatHandler(cb func(rpc RPC))
}
// WithClose is an interface that a transport may provide which
// allows a transport to be shut down cleanly when a Raft instance
// shuts down.
//
// It is defined separately from Transport as unfortunately it wasn't in the
// original interface specification.
type WithClose interface {
// Close permanently closes a transport, stopping
// any associated goroutines and freeing other resources.
Close() error
}
// LoopbackTransport is an interface that provides a loopback transport suitable for testing
// e.g. InmemTransport. It's there so we don't have to rewrite tests.
type LoopbackTransport interface {
Transport // Embedded transport reference
WithPeers // Embedded peer management
WithClose // with a close routine
}
// WithPeers is an interface that a transport may provide which allows for connection and
// disconnection. Unless the transport is a loopback transport, the transport specified to
// "Connect" is likely to be nil.
type WithPeers interface {
Connect(peer string, t Transport) // Connect a peer
Disconnect(peer string) // Disconnect a given peer
DisconnectAll() // Disconnect all peers, possibly to reconnect them later
}
// AppendPipeline is used for pipelining AppendEntries requests. It is used
// to increase the replication throughput by masking latency and better
// utilizing bandwidth.
type AppendPipeline interface {
// AppendEntries is used to add another request to the pipeline.
// The send may block, which is an effective form of back-pressure.
AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error)
// Consumer returns a channel that can be used to consume
// response futures when they are ready.
Consumer() <-chan AppendFuture
// Close closes the pipeline and cancels all inflight RPCs
Close() error
}
// AppendFuture is used to return information about a pipelined AppendEntries request.
type AppendFuture interface {
Future
// Start returns the time that the append request was started.
// It is always OK to call this method.
Start() time.Time
// Request holds the parameters of the AppendEntries call.
// It is always OK to call this method.
Request() *AppendEntriesRequest
// Response holds the results of the AppendEntries call.
// This method must only be called after the Error
// method returns, and will only be valid on success.
Response() *AppendEntriesResponse
}
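
SetHeartbeatHandler exists so heartbeats can be answered before they queue behind disk-bound RPCs on the Consumer channel. A minimal sketch of installing such a handler follows; in practice Raft wires this up itself, so this is purely illustrative.

```go
package main

import (
	"log"

	"github.com/hashicorp/raft"
)

// installFastPath answers heartbeat AppendEntries directly in the handler,
// bypassing the Consumer channel entirely.
func installFastPath(trans raft.Transport) {
	trans.SetHeartbeatHandler(func(rpc raft.RPC) {
		req := rpc.Command.(*raft.AppendEntriesRequest)
		log.Printf("heartbeat from %s", req.Leader)
		// Illustrative only: a real handler consults Raft state before
		// acknowledging the leader's term.
		rpc.Respond(&raft.AppendEntriesResponse{Term: req.Term, Success: true}, nil)
	})
}

func main() {
	_, trans := raft.NewInmemTransport("")
	installFastPath(trans)
}
```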

@ -1,313 +0,0 @@
package raft
import (
"bytes"
"reflect"
"testing"
"time"
)
const (
TT_Inmem = iota
// NOTE: must be last
numTestTransports
)
func NewTestTransport(ttype int, addr string) (string, LoopbackTransport) {
switch ttype {
case TT_Inmem:
addr, lt := NewInmemTransport(addr)
return addr, lt
default:
panic("Unknown transport type")
}
}
func TestTransport_StartStop(t *testing.T) {
for ttype := 0; ttype < numTestTransports; ttype++ {
_, trans := NewTestTransport(ttype, "")
if err := trans.Close(); err != nil {
t.Fatalf("err: %v", err)
}
}
}
func TestTransport_AppendEntries(t *testing.T) {
for ttype := 0; ttype < numTestTransports; ttype++ {
addr1, trans1 := NewTestTransport(ttype, "")
defer trans1.Close()
rpcCh := trans1.Consumer()
// Make the RPC request
args := AppendEntriesRequest{
Term: 10,
Leader: []byte("cartman"),
PrevLogEntry: 100,
PrevLogTerm: 4,
Entries: []*Log{
&Log{
Index: 101,
Term: 4,
Type: LogNoop,
},
},
LeaderCommitIndex: 90,
}
resp := AppendEntriesResponse{
Term: 4,
LastLog: 90,
Success: true,
}
// Listen for a request
go func() {
select {
case rpc := <-rpcCh:
// Verify the command
req := rpc.Command.(*AppendEntriesRequest)
if !reflect.DeepEqual(req, &args) {
t.Fatalf("command mismatch: %#v %#v", *req, args)
}
rpc.Respond(&resp, nil)
case <-time.After(200 * time.Millisecond):
t.Fatalf("timeout")
}
}()
// Transport 2 makes outbound request
addr2, trans2 := NewTestTransport(ttype, "")
defer trans2.Close()
trans1.Connect(addr2, trans2)
trans2.Connect(addr1, trans1)
var out AppendEntriesResponse
if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Verify the response
if !reflect.DeepEqual(resp, out) {
t.Fatalf("command mismatch: %#v %#v", resp, out)
}
}
}
func TestTransport_AppendEntriesPipeline(t *testing.T) {
for ttype := 0; ttype < numTestTransports; ttype++ {
addr1, trans1 := NewTestTransport(ttype, "")
defer trans1.Close()
rpcCh := trans1.Consumer()
// Make the RPC request
args := AppendEntriesRequest{
Term: 10,
Leader: []byte("cartman"),
PrevLogEntry: 100,
PrevLogTerm: 4,
Entries: []*Log{
&Log{
Index: 101,
Term: 4,
Type: LogNoop,
},
},
LeaderCommitIndex: 90,
}
resp := AppendEntriesResponse{
Term: 4,
LastLog: 90,
Success: true,
}
// Listen for a request
go func() {
for i := 0; i < 10; i++ {
select {
case rpc := <-rpcCh:
// Verify the command
req := rpc.Command.(*AppendEntriesRequest)
if !reflect.DeepEqual(req, &args) {
t.Fatalf("command mismatch: %#v %#v", *req, args)
}
rpc.Respond(&resp, nil)
case <-time.After(200 * time.Millisecond):
t.Fatalf("timeout")
}
}
}()
// Transport 2 makes outbound request
addr2, trans2 := NewTestTransport(ttype, "")
defer trans2.Close()
trans1.Connect(addr2, trans2)
trans2.Connect(addr1, trans1)
pipeline, err := trans2.AppendEntriesPipeline(trans1.LocalAddr())
if err != nil {
t.Fatalf("err: %v", err)
}
defer pipeline.Close()
for i := 0; i < 10; i++ {
out := new(AppendEntriesResponse)
if _, err := pipeline.AppendEntries(&args, out); err != nil {
t.Fatalf("err: %v", err)
}
}
respCh := pipeline.Consumer()
for i := 0; i < 10; i++ {
select {
case ready := <-respCh:
// Verify the response
if !reflect.DeepEqual(&resp, ready.Response()) {
t.Fatalf("command mismatch: %#v %#v", &resp, ready.Response())
}
case <-time.After(200 * time.Millisecond):
t.Fatalf("timeout")
}
}
}
}
func TestTransport_RequestVote(t *testing.T) {
for ttype := 0; ttype < numTestTransports; ttype++ {
addr1, trans1 := NewTestTransport(ttype, "")
defer trans1.Close()
rpcCh := trans1.Consumer()
// Make the RPC request
args := RequestVoteRequest{
Term: 20,
Candidate: []byte("butters"),
LastLogIndex: 100,
LastLogTerm: 19,
}
resp := RequestVoteResponse{
Term: 100,
Peers: []byte("blah"),
Granted: false,
}
// Listen for a request
go func() {
select {
case rpc := <-rpcCh:
// Verify the command
req := rpc.Command.(*RequestVoteRequest)
if !reflect.DeepEqual(req, &args) {
t.Fatalf("command mismatch: %#v %#v", *req, args)
}
rpc.Respond(&resp, nil)
case <-time.After(200 * time.Millisecond):
t.Fatalf("timeout")
}
}()
// Transport 2 makes outbound request
addr2, trans2 := NewTestTransport(ttype, "")
defer trans2.Close()
trans1.Connect(addr2, trans2)
trans2.Connect(addr1, trans1)
var out RequestVoteResponse
if err := trans2.RequestVote(trans1.LocalAddr(), &args, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Verify the response
if !reflect.DeepEqual(resp, out) {
t.Fatalf("command mismatch: %#v %#v", resp, out)
}
}
}
func TestTransport_InstallSnapshot(t *testing.T) {
for ttype := 0; ttype < numTestTransports; ttype++ {
addr1, trans1 := NewTestTransport(ttype, "")
defer trans1.Close()
rpcCh := trans1.Consumer()
// Make the RPC request
args := InstallSnapshotRequest{
Term: 10,
Leader: []byte("kyle"),
LastLogIndex: 100,
LastLogTerm: 9,
Peers: []byte("blah blah"),
Size: 10,
}
resp := InstallSnapshotResponse{
Term: 10,
Success: true,
}
// Listen for a request
go func() {
select {
case rpc := <-rpcCh:
// Verify the command
req := rpc.Command.(*InstallSnapshotRequest)
if !reflect.DeepEqual(req, &args) {
t.Fatalf("command mismatch: %#v %#v", *req, args)
}
// Try to read the bytes
buf := make([]byte, 10)
if _, err := io.ReadFull(rpc.Reader, buf); err != nil {
t.Fatalf("err: %v", err)
}
// Compare
if !bytes.Equal(buf, []byte("0123456789")) {
t.Fatalf("bad buf %v", buf)
}
rpc.Respond(&resp, nil)
case <-time.After(200 * time.Millisecond):
t.Fatalf("timeout")
}
}()
// Transport 2 makes outbound request
addr2, trans2 := NewTestTransport(ttype, "")
defer trans2.Close()
trans1.Connect(addr2, trans2)
trans2.Connect(addr1, trans1)
// Create a buffer
buf := bytes.NewBuffer([]byte("0123456789"))
var out InstallSnapshotResponse
if err := trans2.InstallSnapshot(trans1.LocalAddr(), &args, &out, buf); err != nil {
t.Fatalf("err: %v", err)
}
// Verify the response
if !reflect.DeepEqual(resp, out) {
t.Fatalf("command mismatch: %#v %#v", resp, out)
}
}
}
func TestTransport_EncodeDecode(t *testing.T) {
for ttype := 0; ttype < numTestTransports; ttype++ {
_, trans1 := NewTestTransport(ttype, "")
defer trans1.Close()
local := trans1.LocalAddr()
enc := trans1.EncodePeer(local)
dec := trans1.DecodePeer(enc)
if dec != local {
t.Fatalf("enc/dec fail: %v %v", dec, local)
}
}
}

@ -1,179 +0,0 @@
package raft
import (
"bytes"
crand "crypto/rand"
"fmt"
"math"
"math/big"
"math/rand"
"time"
"github.com/hashicorp/go-msgpack/codec"
)
func init() {
// Ensure we use a high-entropy seed for the pseudo-random generator
rand.Seed(newSeed())
}
// newSeed returns an int64 from a cryptographic random source.
// It can be used to seed a math/rand source.
func newSeed() int64 {
r, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64))
if err != nil {
panic(fmt.Errorf("failed to read random bytes: %v", err))
}
return r.Int64()
}
// randomTimeout returns a channel that fires after a random duration between minVal and 2x minVal.
func randomTimeout(minVal time.Duration) <-chan time.Time {
if minVal == 0 {
return nil
}
extra := (time.Duration(rand.Int63()) % minVal)
return time.After(minVal + extra)
}
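The typical consumer of this helper is an election timer: drawing a fresh jittered timeout on each iteration keeps followers from starting elections in lockstep. The following is a sketch only; `heartbeatCh` and `startElection` are hypothetical stand-ins for the real follower loop's plumbing.

```go
// followerLoop re-arms a jittered election timer on every heartbeat.
// If no heartbeat arrives within the random window, it gives up
// following and triggers an election.
func followerLoop(minTimeout time.Duration, heartbeatCh <-chan struct{}, startElection func()) {
	for {
		select {
		case <-heartbeatCh:
			// Heartbeat received; loop and draw a fresh random timeout.
		case <-randomTimeout(minTimeout):
			startElection()
			return
		}
	}
}
```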
// min returns the minimum.
func min(a, b uint64) uint64 {
if a <= b {
return a
}
return b
}
// max returns the maximum.
func max(a, b uint64) uint64 {
if a >= b {
return a
}
return b
}
// generateUUID is used to generate a random UUID.
func generateUUID() string {
buf := make([]byte, 16)
if _, err := crand.Read(buf); err != nil {
panic(fmt.Errorf("failed to read random bytes: %v", err))
}
return fmt.Sprintf("%08x-%04x-%04x-%04x-%12x",
buf[0:4],
buf[4:6],
buf[6:8],
buf[8:10],
buf[10:16])
}
// asyncNotifyCh is used to do an async channel send
// to a single channel without blocking.
func asyncNotifyCh(ch chan struct{}) {
select {
case ch <- struct{}{}:
default:
}
}
// asyncNotifyBool is used to do an async notification
// on a bool channel.
func asyncNotifyBool(ch chan bool, v bool) {
select {
case ch <- v:
default:
}
}
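A quick sketch of the non-blocking send pattern both helpers wrap: when the channel is full the notification is dropped rather than stalling the sender, so repeated signals coalesce. The channel name is illustrative.

```go
// demoAsyncNotify shows that asyncNotifyCh never blocks: a slow
// consumer only misses coalesced notifications, never stalls senders.
func demoAsyncNotify() {
	commitCh := make(chan struct{}, 1)
	asyncNotifyCh(commitCh) // fills the single buffered slot
	asyncNotifyCh(commitCh) // dropped: a plain send would block here
	<-commitCh              // one wake-up coalesces both notifications
}
```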
// ExcludePeer is used to exclude a single peer from a list of peers.
func ExcludePeer(peers []string, peer string) []string {
otherPeers := make([]string, 0, len(peers))
for _, p := range peers {
if p != peer {
otherPeers = append(otherPeers, p)
}
}
return otherPeers
}
// PeerContained checks if a given peer is contained in a list.
func PeerContained(peers []string, peer string) bool {
for _, p := range peers {
if p == peer {
return true
}
}
return false
}
// AddUniquePeer is used to add a peer to a list of existing
// peers only if it is not already contained.
func AddUniquePeer(peers []string, peer string) []string {
if PeerContained(peers, peer) {
return peers
}
return append(peers, peer)
}
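A sketch of the three peer-list helpers working together; the addresses are illustrative.

```go
// demoPeerHelpers builds a peer list, skips a duplicate, then
// removes one member.
func demoPeerHelpers() []string {
	peers := []string{"host1:4002", "host2:4002"}
	peers = AddUniquePeer(peers, "host3:4002") // appended: not yet present
	peers = AddUniquePeer(peers, "host1:4002") // no-op: already present
	if PeerContained(peers, "host2:4002") {
		peers = ExcludePeer(peers, "host2:4002")
	}
	return peers // ["host1:4002", "host3:4002"]
}
```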
// encodePeers is used to serialize a list of peers.
func encodePeers(peers []string, trans Transport) []byte {
// Encode each peer
var encPeers [][]byte
for _, p := range peers {
encPeers = append(encPeers, trans.EncodePeer(p))
}
// Encode the entire array
buf, err := encodeMsgPack(encPeers)
if err != nil {
panic(fmt.Errorf("failed to encode peers: %v", err))
}
return buf.Bytes()
}
// decodePeers is used to deserialize a list of peers.
func decodePeers(buf []byte, trans Transport) []string {
// Decode the buffer first
var encPeers [][]byte
if err := decodeMsgPack(buf, &encPeers); err != nil {
panic(fmt.Errorf("failed to decode peers: %v", err))
}
// Deserialize each peer
var peers []string
for _, enc := range encPeers {
peers = append(peers, trans.DecodePeer(enc))
}
return peers
}
// decodeMsgPack reverses the encode operation on a byte slice input.
func decodeMsgPack(buf []byte, out interface{}) error {
r := bytes.NewBuffer(buf)
hd := codec.MsgpackHandle{}
dec := codec.NewDecoder(r, &hd)
return dec.Decode(out)
}
// encodeMsgPack writes an encoded object to a new bytes buffer.
func encodeMsgPack(in interface{}) (*bytes.Buffer, error) {
buf := bytes.NewBuffer(nil)
hd := codec.MsgpackHandle{}
enc := codec.NewEncoder(buf, &hd)
err := enc.Encode(in)
return buf, err
}
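A sketch of a round trip through these two helpers; the `entry` struct is illustrative, since the library applies them to its own types such as log entries and encoded peer lists.

```go
// demoMsgPackRoundTrip encodes a value with encodeMsgPack and
// recovers it with decodeMsgPack.
func demoMsgPackRoundTrip() error {
	type entry struct {
		Index uint64
		Term  uint64
	}
	in := entry{Index: 101, Term: 4}
	buf, err := encodeMsgPack(in)
	if err != nil {
		return err
	}
	var out entry
	return decodeMsgPack(buf.Bytes(), &out) // out now mirrors in
}
```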
// backoff is used to compute an exponential backoff
// duration. Base time is scaled by the current round,
// up to some maximum scale factor.
func backoff(base time.Duration, round, limit uint64) time.Duration {
power := min(round, limit)
for power > 2 {
base *= 2
power--
}
return base
}
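To make the scaling concrete, a few sample values (these match TestBackoff further down): the base only starts doubling beyond round 2, and the cap at `limit` bounds the worst case.

```go
// demoBackoff shows the growth curve: unscaled for rounds 1-2, then
// doubling per round, capped once round reaches limit.
func demoBackoff() {
	b1 := backoff(10*time.Millisecond, 1, 8) // 10ms: below the doubling threshold
	b2 := backoff(10*time.Millisecond, 4, 8) // 40ms: doubled twice
	b3 := backoff(10*time.Millisecond, 9, 8) // 640ms: round capped at limit 8
	_, _, _ = b1, b2, b3
}
```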

@ -1,152 +0,0 @@
package raft
import (
"reflect"
"regexp"
"testing"
"time"
)
func TestRandomTimeout(t *testing.T) {
start := time.Now()
timeout := randomTimeout(time.Millisecond)
select {
case <-timeout:
diff := time.Since(start)
if diff < time.Millisecond {
t.Fatalf("fired early")
}
case <-time.After(3 * time.Millisecond):
t.Fatalf("timeout")
}
}
func TestNewSeed(t *testing.T) {
vals := make(map[int64]bool)
for i := 0; i < 1000; i++ {
seed := newSeed()
if _, exists := vals[seed]; exists {
t.Fatal("newSeed() return a value it'd previously returned")
}
vals[seed] = true
}
}
func TestRandomTimeout_NoTime(t *testing.T) {
timeout := randomTimeout(0)
if timeout != nil {
t.Fatalf("expected nil channel")
}
}
func TestMin(t *testing.T) {
if min(1, 1) != 1 {
t.Fatalf("bad min")
}
if min(2, 1) != 1 {
t.Fatalf("bad min")
}
if min(1, 2) != 1 {
t.Fatalf("bad min")
}
}
func TestMax(t *testing.T) {
if max(1, 1) != 1 {
t.Fatalf("bad max")
}
if max(2, 1) != 2 {
t.Fatalf("bad max")
}
if max(1, 2) != 2 {
t.Fatalf("bad max")
}
}
func TestGenerateUUID(t *testing.T) {
prev := generateUUID()
for i := 0; i < 100; i++ {
id := generateUUID()
if prev == id {
t.Fatalf("Should get a new ID!")
}
matched, err := regexp.MatchString(
`[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}`, id)
if !matched || err != nil {
t.Fatalf("expected match %s %v %s", id, matched, err)
}
}
}
func TestExcludePeer(t *testing.T) {
peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()}
peer := peers[2]
after := ExcludePeer(peers, peer)
if len(after) != 2 {
t.Fatalf("Bad length")
}
if after[0] == peer || after[1] == peer {
t.Fatalf("should not contain peer")
}
}
func TestPeerContained(t *testing.T) {
peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()}
if !PeerContained(peers, peers[2]) {
t.Fatalf("Expect contained")
}
if PeerContained(peers, NewInmemAddr()) {
t.Fatalf("unexpected contained")
}
}
func TestAddUniquePeer(t *testing.T) {
peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()}
after := AddUniquePeer(peers, peers[2])
if !reflect.DeepEqual(after, peers) {
t.Fatalf("unexpected append")
}
after = AddUniquePeer(peers, NewInmemAddr())
if len(after) != 4 {
t.Fatalf("expected append")
}
}
func TestEncodeDecodePeers(t *testing.T) {
peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()}
_, trans := NewInmemTransport("")
// Try to encode/decode
buf := encodePeers(peers, trans)
decoded := decodePeers(buf, trans)
if !reflect.DeepEqual(peers, decoded) {
t.Fatalf("mismatch %v %v", peers, decoded)
}
}
func TestBackoff(t *testing.T) {
b := backoff(10*time.Millisecond, 1, 8)
if b != 10*time.Millisecond {
t.Fatalf("bad: %v", b)
}
b = backoff(20*time.Millisecond, 2, 8)
if b != 20*time.Millisecond {
t.Fatalf("bad: %v", b)
}
b = backoff(10*time.Millisecond, 8, 8)
if b != 640*time.Millisecond {
t.Fatalf("bad: %v", b)
}
b = backoff(10*time.Millisecond, 9, 8)
if b != 640*time.Millisecond {
t.Fatalf("bad: %v", b)
}
}