Try to auto recover the save operation on termination (#166)

This is very useful because it removes the need for user intervention in
the event that the save on termination fails. Say the save operation fails due to
'some bad daemon' changing the directory's perms. Now skyd reports this
error while trying to save upon termination. Our sysadmin now fixes the
perms issue. The previous design would force the sysadmin to _somehow_
foreground skyd and hit enter. That is silly. The new design just
attempts the save operation every 10 seconds, so as soon as the issue
is fixed, the save operation will recover on its own.

Why not exponential backoff?
That's because the issue might only be fixed a long time later, by which
point we may have reached a large backoff value, so a save that could
now succeed would still have to wait for a long duration before it can
do anything meaningful.

This also fixes a bug that caused BGSAVE errors to be reported as info
class log entries.
next
Sayan 3 years ago committed by GitHub
parent 8df9901740
commit 7349e461e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -35,7 +35,8 @@ use crate::config::PortConfig;
use crate::config::SnapshotConfig;
use libsky::URL;
use libsky::VERSION;
use std::io::{self, prelude::*};
use std::thread;
use std::time;
mod config;
use std::env;
mod actions;
@ -99,26 +100,22 @@ fn main() {
1,
"Maybe the compiler reordered the drop causing more than one instance of CoreDB to live at this point"
);
if let Err(e) = services::bgsave::_bgsave_blocking_section(&db) {
log::error!("Failed to flush data to disk with '{}'", e);
loop {
// Keep looping until we successfully write the in-memory table to disk
log::warn!("Press enter to try again...");
io::stdout().flush().unwrap();
io::stdin().read_exact(&mut [0]).unwrap();
match services::bgsave::_bgsave_blocking_section(&db) {
Ok(_) => {
log::info!("Successfully saved data to disk");
break;
}
Err(e) => {
log::error!("Failed to flush data to disk with '{}'", e);
continue;
}
log::info!("Stopped accepting incoming connections");
loop {
// Keep looping until we successfully write the in-memory table to disk
match services::bgsave::run_bgsave(&db) {
Ok(_) => {
log::info!("Successfully saved data to disk");
break;
}
Err(e) => {
log::error!(
"Failed to write data with error '{}'. Attempting to retry in 10s",
e
);
}
}
} else {
log::info!("Successfully saved data to disk");
thread::sleep(time::Duration::from_secs(10));
}
terminal::write_info("Goodbye :)\n").unwrap();
}

@ -86,7 +86,7 @@ pub async fn bgsave_scheduler(handle: CoreDB, bgsave_cfg: BGSave, mut terminator
/// by using [`fs::rename`]. This provides us with two gurantees:
/// 1. No silly logic is seen if the user deletes the data.bin file and yet BGSAVE doesn't complain
/// 2. If this method crashes midway, we can still be sure that the old file is intact
pub fn _bgsave_blocking_section(handle: &CoreDB) -> TResult<()> {
fn _bgsave_blocking_section(handle: &CoreDB) -> TResult<()> {
// first lock our temporary file
let mut file = flock::FileLock::lock(SKY_TEMP_FILE)?;
// get a read lock on the coretable
@ -106,6 +106,13 @@ pub fn _bgsave_blocking_section(handle: &CoreDB) -> TResult<()> {
Ok(())
}
/// Run bgsave
///
/// Public entry point for a single, synchronous BGSAVE pass: it simply
/// delegates to [`_bgsave_blocking_section`], keeping the blocking
/// implementation detail out of the _public API_. Returns `Ok(())` on a
/// successful flush, or the underlying error (e.g. file lock or I/O
/// failure) otherwise, leaving retry/logging policy to the caller.
pub fn run_bgsave(handle: &CoreDB) -> TResult<()> {
_bgsave_blocking_section(handle)
}
/// This just wraps around [`_bgsave_blocking_section`] and prints nice log messages depending on the outcome
fn bgsave_blocking_section(handle: CoreDB) -> bool {
match _bgsave_blocking_section(&handle) {
@ -115,7 +122,7 @@ fn bgsave_blocking_section(handle: CoreDB) -> bool {
true
}
Err(e) => {
log::info!("BGSAVE failed with error: {}", e);
log::error!("BGSAVE failed with error: {}", e);
handle.poison();
false
}

Loading…
Cancel
Save