
Merge pull request #1599 from rqlite/no-upload-after-leader-change

Test no upload after Leader change
master
Philip O'Toole committed via GitHub
commit 029eda8ce1
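A note on the premise before the diffs: the new end-to-end test further down relies on the auto-backup uploader being able to tell that the copy already in cloud storage is current, so a freshly elected Leader has nothing new to upload. The num_uploads_skipped and num_uploads_skipped_sum counters used throughout the test helpers count exactly those skipped cycles. The sketch below is a conceptual model of that skip decision only; rqlite's real uploader is written in Go, and whether it compares a SHA-256 sum of the data against the last uploaded copy (as assumed here) is not shown in this diff.

import hashlib

class UploaderSketch:
  '''Conceptual model only, not rqlite's implementation: upload periodically,
  but skip when the data's checksum matches the last successful upload.'''
  def __init__(self, storage):
    self.storage = storage              # assumed interface: last_sum() and upload(data, sum)
    self.num_uploads_ok = 0
    self.num_uploads_skipped_sum = 0

  def tick(self, data):
    '''Called once per upload interval while this node is the Leader.'''
    digest = hashlib.sha256(data).hexdigest()
    if digest == self.storage.last_sum():
      # Nothing has changed since the last successful upload. A Leader elected
      # after a failover sees the same digest and skips rather than uploading
      # again, which is the behaviour the new end-to-end test asserts.
      self.num_uploads_skipped_sum += 1
      return False
    self.storage.upload(data, digest)
    self.num_uploads_ok += 1
    return True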

@@ -5,6 +5,7 @@
- [PR #1596](https://github.com/rqlite/rqlite/pull/1596): Track Raft logs which change the database.
- [PR #1597](https://github.com/rqlite/rqlite/pull/1597): Clarify end-to-end testing code.
- [PR #1598](https://github.com/rqlite/rqlite/pull/1598): Refactor Store-level index tracking.
- [PR #1599](https://github.com/rqlite/rqlite/pull/1599): Test no upload after Leader change.
## 8.16.4 (January 12th 2024)
### Implementation changes and bug fixes

@@ -1796,15 +1796,12 @@ func Test_MultiNodeDBAppliedIndex(t *testing.T) {
if err != nil {
t.Fatalf("failed to execute on single node: %s", err.Error())
}
if _, err := s0.WaitForAppliedFSM(5 * time.Second); err != nil {
t.Fatalf("failed to wait for FSM to apply on leader")
}
testPoll(t, func() bool {
qr := queryRequestFromString("SELECT count(*) FROM foo", false, true)
qr.Level = proto.QueryRequest_QUERY_REQUEST_LEVEL_NONE
r, err := s1.Query(qr)
return err == nil && asJSON(r[0].Values) == `[[3]]`
return s0.DBAppliedIndex() == s1.DBAppliedIndex()
}, 250*time.Millisecond, 3*time.Second)
if s0.DBAppliedIndex() != s1.DBAppliedIndex() {
t.Fatalf("applied index mismatch (%d, %d)", s0.DBAppliedIndex(), s1.DBAppliedIndex())
}
// Create a third node, make sure it joins the cluster, and check that the DBAppliedIndex
// is correct.
@@ -1822,14 +1819,9 @@ func Test_MultiNodeDBAppliedIndex(t *testing.T) {
t.Fatalf("failed to wait for leader on follower: %s", err.Error())
}
testPoll(t, func() bool {
qr := queryRequestFromString("SELECT count(*) FROM foo", false, true)
qr.Level = proto.QueryRequest_QUERY_REQUEST_LEVEL_NONE
r, err := s2.Query(qr)
return err == nil && asJSON(r[0].Values) == `[[3]]`
return s0.DBAppliedIndex() == s2.DBAppliedIndex()
}, 250*time.Millisecond, 3*time.Second)
if s0.DBAppliedIndex() != s2.DBAppliedIndex() {
t.Fatalf("applied index mismatch (%d, %d)", s0.DBAppliedIndex(), s2.DBAppliedIndex())
}
// Noop, then snapshot, truncating all logs. Then have another node join the cluster.
if af, err := s0.Noop("don't care"); err != nil || af.Error() != nil {
@@ -1852,14 +1844,8 @@ func Test_MultiNodeDBAppliedIndex(t *testing.T) {
t.Fatalf("failed to wait for leader on follower: %s", err.Error())
}
testPoll(t, func() bool {
qr := queryRequestFromString("SELECT count(*) FROM foo", false, true)
qr.Level = proto.QueryRequest_QUERY_REQUEST_LEVEL_NONE
r, err := s3.Query(qr)
return err == nil && asJSON(r[0].Values) == `[[3]]`
}, 250*time.Millisecond, 3*time.Second)
if s0.DBAppliedIndex() > s2.DBAppliedIndex() {
t.Fatalf("applied index on new node is not correct (%d, %d)", s0.DBAppliedIndex(), s2.DBAppliedIndex())
}
return s0.DBAppliedIndex() <= s3.DBAppliedIndex()
}, 250*time.Millisecond, 5*time.Second)
// Write one last row, and everything should be in sync.
er = executeRequestFromStrings([]string{
@@ -1869,6 +1855,9 @@ func Test_MultiNodeDBAppliedIndex(t *testing.T) {
if err != nil {
t.Fatalf("failed to execute on single node: %s", err.Error())
}
if _, err := s0.WaitForAppliedFSM(5 * time.Second); err != nil {
t.Fatalf("failed to wait for FSM to apply on leader")
}
testPoll(t, func() bool {
i := s0.DBAppliedIndex()

@@ -6,7 +6,7 @@ import unittest
import sqlite3
import time
from helpers import Node, deprovision_node, write_random_file, random_string, env_present, gunzip_file, gzip_compress, temp_file, d_
from helpers import Node, deprovision_node, write_random_file, random_string, env_present, gunzip_file, gzip_compress, temp_file, d_, Cluster
from s3 import download_s3_object, delete_s3_object, upload_s3_object
S3_BUCKET = 'rqlite-testing-circleci'
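The only change to the imports is pulling in Cluster from helpers, which the new test uses to locate the current Leader and to wait for a re-election that excludes the node it just killed. The Cluster implementation itself is not part of this diff; the sketch below shows roughly what such a helper needs to do, assuming each Node can report whether it is currently the Leader (the real helpers.py class may be named and structured differently).

import time

class ClusterSketch:
  '''Illustrative stand-in for helpers.Cluster, which is not shown in this diff.'''
  def __init__(self, nodes):
    self.nodes = nodes

  def wait_for_leader(self, node_exc=None, timeout=30):
    # Poll the nodes (optionally skipping one, e.g. a node that was just
    # killed) until one of them reports itself as Leader.
    deadline = time.time() + timeout
    while time.time() < deadline:
      for n in self.nodes:
        if n is node_exc:
          continue
        if n.is_leader():   # assumed Node method, see the note above
          return n
      time.sleep(0.1)
    raise Exception('timed out waiting for a leader')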
@@ -340,7 +340,7 @@ class TestAutoBackupS3(unittest.TestCase):
node.wait_until_uploads_idle()
# Write one more record, wait for a backup to happen.
i = node.num_auto_backups()[0]
i = node.num_auto_backups()['ok']
node.execute('INSERT INTO foo(name) VALUES("fiona")')
j = node.query('SELECT count(*) FROM foo', level='strong')
self.assertEqual(j, d_("{'results': [{'values': [[100]], 'types': ['integer'], 'columns': ['count(*)']}]}"))
@@ -409,7 +409,7 @@ class TestAutoBackupS3(unittest.TestCase):
# Confirm that the follower has performed no backups.
time.sleep(5)
self.assertEqual(follower.num_auto_backups()[0], 0)
self.assertEqual(follower.num_auto_backups()['ok'], 0)
delete_s3_object(access_key_id, secret_access_key_id, S3_BUCKET, path)
deprovision_node(leader)
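Both call sites above now read the uploader counters by name rather than by tuple position (the helper change itself appears further down in helpers.py). A small illustrative assertion built on the same dict; node here is any started helpers.Node with auto-backups enabled, and the function name is hypothetical.

def assert_backups_healthy(node):
  # Hypothetical helper: the counter keys ('ok', 'fail', 'skip', 'skip_sum')
  # are the ones returned by Node.num_auto_backups() below.
  counts = node.num_auto_backups()
  assert counts['fail'] == 0, 'unexpected failed uploads: %s' % (counts,)
  return counts['ok']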
@@ -470,6 +470,67 @@ class TestAutoBackupS3(unittest.TestCase):
deprovision_node(node)
os.remove(cfg)

  @unittest.skipUnless(env_present('RQLITE_S3_ACCESS_KEY'), "S3 credentials not available")
  def test_no_upload_leader_change(self):
    '''Test that when a cluster changes leader, the new leader doesn't upload again'''
    node = None
    cfg = None
    path = None

    access_key_id = os.environ['RQLITE_S3_ACCESS_KEY']
    secret_access_key_id = os.environ['RQLITE_S3_SECRET_ACCESS_KEY']

    # Create the auto-backup config file
    path = random_string(32)
    auto_backup_cfg = {
      "version": 1,
      "type": "s3",
      "interval": "100ms",
      "no_compress": True,
      "sub" : {
        "access_key_id": access_key_id,
        "secret_access_key": secret_access_key_id,
        "region": S3_BUCKET_REGION,
        "bucket": S3_BUCKET,
        "path": path
      }
    }
    cfg = write_random_file(json.dumps(auto_backup_cfg))

    # Create a cluster with automatic backups enabled. An initial
    # backup will happen because there is no data in the cloud.
    n0 = Node(RQLITED_PATH, '0', auto_backup=cfg)
    n0.start()
    n0.wait_for_leader()
    n0.wait_for_upload(1)

    # Then create a table and insert a row. Wait for another backup to happen.
    n0.execute('CREATE TABLE foo (id INTEGER NOT NULL PRIMARY KEY, name TEXT)')
    n0.wait_for_upload(2)

    # Create a cluster with two more followers
    n1 = Node(RQLITED_PATH, '1', auto_backup=cfg)
    n1.start(join=n0.RaftAddr())
    n1.wait_for_leader()
    n2 = Node(RQLITED_PATH, '2', auto_backup=cfg)
    n2.start(join=n0.RaftAddr())
    n2.wait_for_leader()

    # Kill the leader, and get the new leader
    cluster = Cluster([n0, n1, n2])
    l = cluster.wait_for_leader()
    l.stop(graceful=False)
    new_leader = cluster.wait_for_leader(node_exc=l)

    # Ensure new leader didn't do a backup
    new_leader.wait_until_uploads_idle()
    self.assertEqual(new_leader.num_auto_backups()['ok'], 0)

    delete_s3_object(access_key_id, secret_access_key_id, S3_BUCKET, path)
    deprovision_node(n0)
    deprovision_node(n1)
    deprovision_node(n2)
    os.remove(cfg)

if __name__ == "__main__":
  unittest.main(verbosity=2)
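One design note on the test's teardown: the S3 object, the three nodes, and the temporary config file are cleaned up only after the final assertion, so a failed assertion leaves them behind. A pattern that keeps the cleanup unconditional is sketched below; deprovision_node is the helper already imported at the top of this test file, while the wrapper itself and its parameters are hypothetical and not part of this PR.

import os
from helpers import deprovision_node   # same helper imported at the top of this test file

def run_then_cleanup(test_body, nodes, tmp_files):
  # Hypothetical wrapper: always deprovision nodes and remove temporary files,
  # even when the test body raises.
  try:
    test_body()
  finally:
    for n in nodes:
      deprovision_node(n)
    for f in tmp_files:
      os.remove(f)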

@@ -376,12 +376,14 @@ class Node(object):
def num_auto_backups(self):
'''
Return a tuple of the number of successful, failed, skipped auto-backups, skipped sum auto-backups.
Return a dict of the number of successful, failed, skipped auto-backups, skipped sum auto-backups.
'''
return (int(self.expvar()['uploader']['num_uploads_ok']),
int(self.expvar()['uploader']['num_uploads_fail']),
int(self.expvar()['uploader']['num_uploads_skipped']),
int(self.expvar()['uploader']['num_uploads_skipped_sum']))
return {
'ok': int(self.expvar()['uploader']['num_uploads_ok']),
'fail': int(self.expvar()['uploader']['num_uploads_fail']),
'skip': int(self.expvar()['uploader']['num_uploads_skipped']),
'skip_sum': int(self.expvar()['uploader']['num_uploads_skipped_sum'])
}
def wait_for_upload(self, i, timeout=TIMEOUT):
'''
@@ -389,7 +391,7 @@ class Node(object):
'''
t = 0
while t < timeout:
if self.num_auto_backups()[0] == i:
if self.num_auto_backups()['ok'] == i:
return self.num_auto_backups()
time.sleep(0.1)
t+=1
@@ -402,7 +404,7 @@ class Node(object):
'''
t = 0
while t < timeout:
if self.num_auto_backups()[3] >= i:
if self.num_auto_backups()['skip_sum'] >= i:
return self.num_auto_backups()
time.sleep(0.1)
t+=1
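wait_for_upload and the skipped-sum waiter above are the same polling loop over different counter keys, differing only in whether the exit test is equality or at-least. Purely as an illustration of that shared shape, not code from this PR, a combined helper might look like the following; node is a helpers.Node, and the function name and the timeout-in-seconds convention are mine.

import time

def wait_for_counter(node, key, target, at_least=False, timeout=30):
  # Illustrative only: poll Node.num_auto_backups()[key] until it equals
  # `target` (or reaches it, when at_least=True), or give up after `timeout` seconds.
  waited = 0.0
  while waited < timeout:
    value = node.num_auto_backups()[key]
    done = (value >= target) if at_least else (value == target)
    if done:
      return node.num_auto_backups()
    time.sleep(0.1)
    waited += 0.1
  raise Exception('timed out waiting for %s to reach %d' % (key, target))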
@@ -413,15 +415,27 @@ class Node(object):
'''
Wait until uploads go idle.
'''
i = self.num_auto_backups()[2]
t = 0
while t < timeout:
if self.num_auto_backups()[2] > i:
return self.num_auto_backups()
time.sleep(0.1)
t+=1
backups = self.num_auto_backups()['ok']
skipped = self.num_auto_backups()['skip']
skipped_sum = self.num_auto_backups()['skip_sum']
time.sleep(0.5)
if self.num_auto_backups()['skip'] + self.num_auto_backups()['skip_sum'] == skipped + skipped_sum:
# Skipped uploads are not increasing, so uploads are not idle
t+=1
continue
# OK, skipped uploads are increasing, but has the number of backups stayed the same?
if self.num_auto_backups()['ok'] != backups:
t+=1
continue
# Backups are idle
return self.num_auto_backups()
n = self.num_auto_backups()
raise Exception('rqlite node failed to idle backups within %d seconds (%d, %d, %d, %d)' % (timeout, n[0], n[1], n[2], n[3]))
raise Exception('rqlite node failed to idle backups within %d seconds (%s)' % (timeout, n))
def wait_for_fsm_index(self, index, timeout=TIMEOUT):
'''
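Under the rewritten wait_until_uploads_idle, "idle" means the uploader is still being invoked on its interval (so the skip counters keep climbing) but has found nothing new to upload (so the 'ok' count stays flat). A typical way the tests above combine it with wait_for_upload is sketched here; the node and the foo table are assumed to be the ones those tests create.

def backups_after_one_write(node):
  # Mirrors the pattern in TestAutoBackupS3 above: let the uploader settle,
  # perform one write, then expect exactly one additional successful upload.
  node.wait_until_uploads_idle()
  before = node.num_auto_backups()['ok']
  node.execute('INSERT INTO foo(name) VALUES("fiona")')   # table created earlier in the test
  return node.wait_for_upload(before + 1)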
