From 7349e461e6a1c3e737957b8f0b5ee40bad9b2bb2 Mon Sep 17 00:00:00 2001 From: Sayan Date: Thu, 10 Jun 2021 20:46:12 +0530 Subject: [PATCH] Try to auto recover the save operation on termination (#166) This is very useful because it removes the need for user intervention in the event save on termination fails. Say the save operation fails due to 'some bad daemon' changing the directory's perms. Now skyd reports this error while trying to save upon termination. Our sysadmin now fixes the perms issue. The previous design would force the sysadmin to _somehow_ foreground skyd and hit enter. That is silly. The new design just attempts to do a save operation every 10 seconds. So in case the issue is fixed, the save operation will recover on its own. Why not exponential backoff? That's because the issue can be fixed some long time later and we may have reached a large backoff value so the save that could have succeeded would have to wait for a long duration before it can do anything meaningful. This also fixes a bug that caused BGSAVE errors to be reported as info class log entries. --- server/src/main.rs | 35 ++++++++++++++++------------------- server/src/services/bgsave.rs | 11 +++++++++-- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/server/src/main.rs b/server/src/main.rs index e3fc5d35..2c614a11 100644 --- a/server/src/main.rs +++ b/server/src/main.rs @@ -35,7 +35,8 @@ use crate::config::PortConfig; use crate::config::SnapshotConfig; use libsky::URL; use libsky::VERSION; -use std::io::{self, prelude::*}; +use std::thread; +use std::time; mod config; use std::env; mod actions; @@ -99,26 +100,22 @@ fn main() { 1, "Maybe the compiler reordered the drop causing more than one instance of CoreDB to live at this point" ); - if let Err(e) = services::bgsave::_bgsave_blocking_section(&db) { - log::error!("Failed to flush data to disk with '{}'", e); - loop { - // Keep looping until we successfully write the in-memory table to disk - log::warn!("Press enter to try again..."); - io::stdout().flush().unwrap(); - io::stdin().read_exact(&mut [0]).unwrap(); - match services::bgsave::_bgsave_blocking_section(&db) { - Ok(_) => { - log::info!("Successfully saved data to disk"); - break; - } - Err(e) => { - log::error!("Failed to flush data to disk with '{}'", e); - continue; - } + log::info!("Stopped accepting incoming connections"); + loop { + // Keep looping until we successfully write the in-memory table to disk + match services::bgsave::run_bgsave(&db) { + Ok(_) => { + log::info!("Successfully saved data to disk"); + break; + } + Err(e) => { + log::error!( + "Failed to write data with error '{}'. Attempting to retry in 10s", + e + ); } } - } else { - log::info!("Successfully saved data to disk"); + thread::sleep(time::Duration::from_secs(10)); } terminal::write_info("Goodbye :)\n").unwrap(); } diff --git a/server/src/services/bgsave.rs b/server/src/services/bgsave.rs index 6ec2271b..04907501 100644 --- a/server/src/services/bgsave.rs +++ b/server/src/services/bgsave.rs @@ -86,7 +86,7 @@ pub async fn bgsave_scheduler(handle: CoreDB, bgsave_cfg: BGSave, mut terminator /// by using [`fs::rename`]. This provides us with two gurantees: /// 1. No silly logic is seen if the user deletes the data.bin file and yet BGSAVE doesn't complain /// 2. If this method crashes midway, we can still be sure that the old file is intact -pub fn _bgsave_blocking_section(handle: &CoreDB) -> TResult<()> { +fn _bgsave_blocking_section(handle: &CoreDB) -> TResult<()> { // first lock our temporary file let mut file = flock::FileLock::lock(SKY_TEMP_FILE)?; // get a read lock on the coretable @@ -106,6 +106,13 @@ pub fn _bgsave_blocking_section(handle: &CoreDB) -> TResult<()> { Ok(()) } +/// Run bgsave +/// +/// This function just hides away the BGSAVE blocking section from the _public API_ +pub fn run_bgsave(handle: &CoreDB) -> TResult<()> { + _bgsave_blocking_section(handle) +} + /// This just wraps around [`_bgsave_blocking_section`] and prints nice log messages depending on the outcome fn bgsave_blocking_section(handle: CoreDB) -> bool { match _bgsave_blocking_section(&handle) { @@ -115,7 +122,7 @@ fn bgsave_blocking_section(handle: CoreDB) -> bool { true } Err(e) => { - log::info!("BGSAVE failed with error: {}", e); + log::error!("BGSAVE failed with error: {}", e); handle.poison(); false }