
Merge pull request #1599 from rqlite/no-upload-after-leader-change

Test no upload after Leader change
master
Philip O'Toole committed via GitHub
commit 029eda8ce1
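A note on the premise before the diffs: the new end-to-end test further down relies on the auto-backup uploader being able to tell that the copy already in cloud storage is current, so a freshly elected Leader has nothing new to upload. The num_uploads_skipped and num_uploads_skipped_sum counters used throughout the test helpers count exactly those skipped cycles. The sketch below is a conceptual model of that skip decision only; rqlite's real uploader is written in Go, and whether it compares a SHA-256 sum of the data against the last uploaded copy (as assumed here) is not shown in this diff.

import hashlib

class UploaderSketch:
  '''Conceptual model only, not rqlite's implementation: upload periodically,
  but skip when the data's checksum matches the last successful upload.'''
  def __init__(self, storage):
    self.storage = storage              # assumed interface: last_sum() and upload(data, sum)
    self.num_uploads_ok = 0
    self.num_uploads_skipped_sum = 0

  def tick(self, data):
    '''Called once per upload interval while this node is the Leader.'''
    digest = hashlib.sha256(data).hexdigest()
    if digest == self.storage.last_sum():
      # Nothing has changed since the last successful upload. A Leader elected
      # after a failover sees the same digest and skips rather than uploading
      # again, which is the behaviour the new end-to-end test asserts.
      self.num_uploads_skipped_sum += 1
      return False
    self.storage.upload(data, digest)
    self.num_uploads_ok += 1
    return True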

@@ -5,6 +5,7 @@
- [PR #1596](https://github.com/rqlite/rqlite/pull/1596): Track Raft logs which change the database.
- [PR #1597](https://github.com/rqlite/rqlite/pull/1597): Clarify end-to-end testing code.
- [PR #1598](https://github.com/rqlite/rqlite/pull/1598): Refactor Store-level index tracking.
- [PR #1599](https://github.com/rqlite/rqlite/pull/1599): Test no upload after Leader change.
## 8.16.4 (January 12th 2024)
### Implementation changes and bug fixes

@@ -1796,15 +1796,12 @@ func Test_MultiNodeDBAppliedIndex(t *testing.T) {
if err != nil {
t.Fatalf("failed to execute on single node: %s", err.Error())
}
if _, err := s0.WaitForAppliedFSM(5 * time.Second); err != nil {
t.Fatalf("failed to wait for FSM to apply on leader")
}
testPoll(t, func() bool {
qr := queryRequestFromString("SELECT count(*) FROM foo", false, true)
qr.Level = proto.QueryRequest_QUERY_REQUEST_LEVEL_NONE
r, err := s1.Query(qr)
return err == nil && asJSON(r[0].Values) == `[[3]]`
return s0.DBAppliedIndex() == s1.DBAppliedIndex()
}, 250*time.Millisecond, 3*time.Second)
if s0.DBAppliedIndex() != s1.DBAppliedIndex() {
t.Fatalf("applied index mismatch (%d, %d)", s0.DBAppliedIndex(), s1.DBAppliedIndex())
}
// Create a third node, make sure it joins the cluster, and check that the DBAppliedIndex
// is correct.
@@ -1822,14 +1819,9 @@ func Test_MultiNodeDBAppliedIndex(t *testing.T) {
t.Fatalf("failed to wait for leader on follower: %s", err.Error())
}
testPoll(t, func() bool {
qr := queryRequestFromString("SELECT count(*) FROM foo", false, true)
qr.Level = proto.QueryRequest_QUERY_REQUEST_LEVEL_NONE
r, err := s2.Query(qr)
return err == nil && asJSON(r[0].Values) == `[[3]]`
return s0.DBAppliedIndex() == s2.DBAppliedIndex()
}, 250*time.Millisecond, 3*time.Second)
if s0.DBAppliedIndex() != s2.DBAppliedIndex() {
t.Fatalf("applied index mismatch (%d, %d)", s0.DBAppliedIndex(), s2.DBAppliedIndex())
}
// Noop, then snapshot, truncating all logs. Then have another node join the cluster.
if af, err := s0.Noop("don't care"); err != nil || af.Error() != nil {
@@ -1852,14 +1844,8 @@ func Test_MultiNodeDBAppliedIndex(t *testing.T) {
t.Fatalf("failed to wait for leader on follower: %s", err.Error())
}
testPoll(t, func() bool {
qr := queryRequestFromString("SELECT count(*) FROM foo", false, true)
qr.Level = proto.QueryRequest_QUERY_REQUEST_LEVEL_NONE
r, err := s3.Query(qr)
return err == nil && asJSON(r[0].Values) == `[[3]]`
}, 250*time.Millisecond, 3*time.Second)
if s0.DBAppliedIndex() > s2.DBAppliedIndex() {
t.Fatalf("applied index on new node is not correct (%d, %d)", s0.DBAppliedIndex(), s2.DBAppliedIndex())
}
return s0.DBAppliedIndex() <= s3.DBAppliedIndex()
}, 250*time.Millisecond, 5*time.Second)
// Write one last row, and everything should be in sync.
er = executeRequestFromStrings([]string{
@@ -1869,6 +1855,9 @@ func Test_MultiNodeDBAppliedIndex(t *testing.T) {
if err != nil {
t.Fatalf("failed to execute on single node: %s", err.Error())
}
if _, err := s0.WaitForAppliedFSM(5 * time.Second); err != nil {
t.Fatalf("failed to wait for FSM to apply on leader")
}
testPoll(t, func() bool {
i := s0.DBAppliedIndex()

@@ -6,7 +6,7 @@ import unittest
import sqlite3
import time
from helpers import Node, deprovision_node, write_random_file, random_string, env_present, gunzip_file, gzip_compress, temp_file, d_
from helpers import Node, deprovision_node, write_random_file, random_string, env_present, gunzip_file, gzip_compress, temp_file, d_, Cluster
from s3 import download_s3_object, delete_s3_object, upload_s3_object
S3_BUCKET = 'rqlite-testing-circleci'
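The only change to the imports is pulling in Cluster from helpers, which the new test uses to locate the current Leader and to wait for a re-election that excludes the node it just killed. The Cluster implementation itself is not part of this diff; the sketch below shows roughly what such a helper needs to do, assuming each Node can report whether it is currently the Leader (the real helpers.py class may be named and structured differently).

import time

class ClusterSketch:
  '''Illustrative stand-in for helpers.Cluster, which is not shown in this diff.'''
  def __init__(self, nodes):
    self.nodes = nodes

  def wait_for_leader(self, node_exc=None, timeout=30):
    # Poll the nodes (optionally skipping one, e.g. a node that was just
    # killed) until one of them reports itself as Leader.
    deadline = time.time() + timeout
    while time.time() < deadline:
      for n in self.nodes:
        if n is node_exc:
          continue
        if n.is_leader():   # assumed Node method, see the note above
          return n
      time.sleep(0.1)
    raise Exception('timed out waiting for a leader')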
@@ -340,7 +340,7 @@ class TestAutoBackupS3(unittest.TestCase):
node.wait_until_uploads_idle()
# Write one more record, wait for a backup to happen.
i = node.num_auto_backups()[0]
i = node.num_auto_backups()['ok']
node.execute('INSERT INTO foo(name) VALUES("fiona")')
j = node.query('SELECT count(*) FROM foo', level='strong')
self.assertEqual(j, d_("{'results': [{'values': [[100]], 'types': ['integer'], 'columns': ['count(*)']}]}"))
@@ -409,7 +409,7 @@ class TestAutoBackupS3(unittest.TestCase):
# Confirm that the follower has performed no backups.
time.sleep(5)
self.assertEqual(follower.num_auto_backups()[0], 0)
self.assertEqual(follower.num_auto_backups()['ok'], 0)
delete_s3_object(access_key_id, secret_access_key_id, S3_BUCKET, path)
deprovision_node(leader)
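Both call sites above now read the uploader counters by name rather than by tuple position (the helper change itself appears further down in helpers.py). A small illustrative assertion built on the same dict; node here is any started helpers.Node with auto-backups enabled, and the function name is hypothetical.

def assert_backups_healthy(node):
  # Hypothetical helper: the counter keys ('ok', 'fail', 'skip', 'skip_sum')
  # are the ones returned by Node.num_auto_backups() below.
  counts = node.num_auto_backups()
  assert counts['fail'] == 0, 'unexpected failed uploads: %s' % (counts,)
  return counts['ok']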
@@ -470,6 +470,67 @@ class TestAutoBackupS3(unittest.TestCase):
deprovision_node(node)
os.remove(cfg)

  @unittest.skipUnless(env_present('RQLITE_S3_ACCESS_KEY'), "S3 credentials not available")
  def test_no_upload_leader_change(self):
    '''Test that when a cluster changes leader, the new leader doesn't upload again'''
    node = None
    cfg = None
    path = None

    access_key_id = os.environ['RQLITE_S3_ACCESS_KEY']
    secret_access_key_id = os.environ['RQLITE_S3_SECRET_ACCESS_KEY']

    # Create the auto-backup config file
    path = random_string(32)
    auto_backup_cfg = {
      "version": 1,
      "type": "s3",
      "interval": "100ms",
      "no_compress": True,
      "sub" : {
        "access_key_id": access_key_id,
        "secret_access_key": secret_access_key_id,
        "region": S3_BUCKET_REGION,
        "bucket": S3_BUCKET,
        "path": path
      }
    }
    cfg = write_random_file(json.dumps(auto_backup_cfg))

    # Create a cluster with automatic backups enabled. An initial
    # backup will happen because there is no data in the cloud.
    n0 = Node(RQLITED_PATH, '0', auto_backup=cfg)
    n0.start()
    n0.wait_for_leader()
    n0.wait_for_upload(1)

    # Then create a table and insert a row. Wait for another backup to happen.
    n0.execute('CREATE TABLE foo (id INTEGER NOT NULL PRIMARY KEY, name TEXT)')
    n0.wait_for_upload(2)

    # Create a cluster with two more followers
    n1 = Node(RQLITED_PATH, '1', auto_backup=cfg)
    n1.start(join=n0.RaftAddr())
    n1.wait_for_leader()
    n2 = Node(RQLITED_PATH, '2', auto_backup=cfg)
    n2.start(join=n0.RaftAddr())
    n2.wait_for_leader()

    # Kill the leader, and get the new leader
    cluster = Cluster([n0, n1, n2])
    l = cluster.wait_for_leader()
    l.stop(graceful=False)
    new_leader = cluster.wait_for_leader(node_exc=l)

    # Ensure new leader didn't do a backup
    new_leader.wait_until_uploads_idle()
    self.assertEqual(new_leader.num_auto_backups()['ok'], 0)

    delete_s3_object(access_key_id, secret_access_key_id, S3_BUCKET, path)
    deprovision_node(n0)
    deprovision_node(n1)
    deprovision_node(n2)
    os.remove(cfg)

if __name__ == "__main__":
  unittest.main(verbosity=2)
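One design note on the test's teardown: the S3 object, the three nodes, and the temporary config file are cleaned up only after the final assertion, so a failed assertion leaves them behind. A pattern that keeps the cleanup unconditional is sketched below; deprovision_node is the helper already imported at the top of this test file, while the wrapper itself and its parameters are hypothetical and not part of this PR.

import os
from helpers import deprovision_node   # same helper imported at the top of this test file

def run_then_cleanup(test_body, nodes, tmp_files):
  # Hypothetical wrapper: always deprovision nodes and remove temporary files,
  # even when the test body raises.
  try:
    test_body()
  finally:
    for n in nodes:
      deprovision_node(n)
    for f in tmp_files:
      os.remove(f)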

@@ -376,12 +376,14 @@ class Node(object):
def num_auto_backups(self):
'''
Return a tuple of the number of successful, failed, skipped auto-backups, skipped sum auto-backups.
Return a dict of the number of successful, failed, skipped auto-backups, skipped sum auto-backups.
'''
return (int(self.expvar()['uploader']['num_uploads_ok']),
int(self.expvar()['uploader']['num_uploads_fail']),
int(self.expvar()['uploader']['num_uploads_skipped']),
int(self.expvar()['uploader']['num_uploads_skipped_sum']))
return {
'ok': int(self.expvar()['uploader']['num_uploads_ok']),
'fail': int(self.expvar()['uploader']['num_uploads_fail']),
'skip': int(self.expvar()['uploader']['num_uploads_skipped']),
'skip_sum': int(self.expvar()['uploader']['num_uploads_skipped_sum'])
}
def wait_for_upload(self, i, timeout=TIMEOUT):
'''
@@ -389,7 +391,7 @@ class Node(object):
'''
t = 0
while t < timeout:
if self.num_auto_backups()[0] == i:
if self.num_auto_backups()['ok'] == i:
return self.num_auto_backups()
time.sleep(0.1)
t+=1
@@ -402,7 +404,7 @@ class Node(object):
'''
t = 0
while t < timeout:
if self.num_auto_backups()[3] >= i:
if self.num_auto_backups()['skip_sum'] >= i:
return self.num_auto_backups()
time.sleep(0.1)
t+=1
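wait_for_upload and the skipped-sum waiter above are the same polling loop over different counter keys, differing only in whether the exit test is equality or at-least. Purely as an illustration of that shared shape, not code from this PR, a combined helper might look like the following; node is a helpers.Node, and the function name and the timeout-in-seconds convention are mine.

import time

def wait_for_counter(node, key, target, at_least=False, timeout=30):
  # Illustrative only: poll Node.num_auto_backups()[key] until it equals
  # `target` (or reaches it, when at_least=True), or give up after `timeout` seconds.
  waited = 0.0
  while waited < timeout:
    value = node.num_auto_backups()[key]
    done = (value >= target) if at_least else (value == target)
    if done:
      return node.num_auto_backups()
    time.sleep(0.1)
    waited += 0.1
  raise Exception('timed out waiting for %s to reach %d' % (key, target))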
@@ -413,15 +415,27 @@ class Node(object):
'''
Wait until uploads go idle.
'''
i = self.num_auto_backups()[2]
t = 0
while t < timeout:
if self.num_auto_backups()[2] > i:
return self.num_auto_backups()
time.sleep(0.1)
t+=1
backups = self.num_auto_backups()['ok']
skipped = self.num_auto_backups()['skip']
skipped_sum = self.num_auto_backups()['skip_sum']
time.sleep(0.5)
if self.num_auto_backups()['skip'] + self.num_auto_backups()['skip_sum'] == skipped + skipped_sum:
# Skipped uploads are not increasing, so uploads are not idle
t+=1
continue
# OK, skipped uploads are increasing, but has the number of backups stayed the same?
if self.num_auto_backups()['ok'] != backups:
t+=1
continue
# Backups are idle
return self.num_auto_backups()
n = self.num_auto_backups()
raise Exception('rqlite node failed to idle backups within %d seconds (%d, %d, %d, %d)' % (timeout, n[0], n[1], n[2], n[3]))
raise Exception('rqlite node failed to idle backups within %d seconds (%s)' % (timeout, n))
def wait_for_fsm_index(self, index, timeout=TIMEOUT):
'''
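Under the rewritten wait_until_uploads_idle, "idle" means the uploader is still being invoked on its interval (so the skip counters keep climbing) but has found nothing new to upload (so the 'ok' count stays flat). A typical way the tests above combine it with wait_for_upload is sketched here; the node and the foo table are assumed to be the ones those tests create.

def backups_after_one_write(node):
  # Mirrors the pattern in TestAutoBackupS3 above: let the uploader settle,
  # perform one write, then expect exactly one additional successful upload.
  node.wait_until_uploads_idle()
  before = node.num_auto_backups()['ok']
  node.execute('INSERT INTO foo(name) VALUES("fiona")')   # table created earlier in the test
  return node.wait_for_upload(before + 1)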
