Try to auto recover the save operation on termination (#166)

This is very useful because it removes the need for user intervention in
the event that the save on termination fails. Say the save operation fails due to
'some bad daemon' changing the directory's perms. Now skyd reports this
error while trying to save upon termination. Our sysadmin now fixes the
perms issue. The previous design would force the sysadmin to _somehow_
foreground skyd and hit enter. That is silly. The new design just
attempts the save operation every 10 seconds, so as soon as the issue
is fixed, the save operation will recover on its own.

Why not exponential backoff?
That's because the issue might only be fixed a long time later, by which
point we may have reached a large backoff value, so a save that could
now succeed would still have to wait for a long duration before it can
do anything meaningful.

This also fixes a bug that caused BGSAVE errors to be reported as info
class log entries.
next
Sayan 3 years ago committed by GitHub
parent 8df9901740
commit 7349e461e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -35,7 +35,8 @@ use crate::config::PortConfig;
use crate::config::SnapshotConfig;
use libsky::URL;
use libsky::VERSION;
use std::io::{self, prelude::*};
use std::thread;
use std::time;
mod config;
use std::env;
mod actions;
@ -99,26 +100,22 @@ fn main() {
1,
"Maybe the compiler reordered the drop causing more than one instance of CoreDB to live at this point"
);
if let Err(e) = services::bgsave::_bgsave_blocking_section(&db) {
log::error!("Failed to flush data to disk with '{}'", e);
loop {
// Keep looping until we successfully write the in-memory table to disk
log::warn!("Press enter to try again...");
io::stdout().flush().unwrap();
io::stdin().read_exact(&mut [0]).unwrap();
match services::bgsave::_bgsave_blocking_section(&db) {
Ok(_) => {
log::info!("Successfully saved data to disk");
break;
}
Err(e) => {
log::error!("Failed to flush data to disk with '{}'", e);
continue;
}
log::info!("Stopped accepting incoming connections");
loop {
// Keep looping until we successfully write the in-memory table to disk
match services::bgsave::run_bgsave(&db) {
Ok(_) => {
log::info!("Successfully saved data to disk");
break;
}
Err(e) => {
log::error!(
"Failed to write data with error '{}'. Attempting to retry in 10s",
e
);
}
}
} else {
log::info!("Successfully saved data to disk");
thread::sleep(time::Duration::from_secs(10));
}
terminal::write_info("Goodbye :)\n").unwrap();
}

@ -86,7 +86,7 @@ pub async fn bgsave_scheduler(handle: CoreDB, bgsave_cfg: BGSave, mut terminator
/// by using [`fs::rename`]. This provides us with two gurantees:
/// 1. No silly logic is seen if the user deletes the data.bin file and yet BGSAVE doesn't complain
/// 2. If this method crashes midway, we can still be sure that the old file is intact
pub fn _bgsave_blocking_section(handle: &CoreDB) -> TResult<()> {
fn _bgsave_blocking_section(handle: &CoreDB) -> TResult<()> {
// first lock our temporary file
let mut file = flock::FileLock::lock(SKY_TEMP_FILE)?;
// get a read lock on the coretable
@ -106,6 +106,13 @@ pub fn _bgsave_blocking_section(handle: &CoreDB) -> TResult<()> {
Ok(())
}
/// Run bgsave
///
/// Public entry point for a single, synchronous BGSAVE pass: it simply
/// delegates to [`_bgsave_blocking_section`], keeping the blocking
/// implementation detail out of the _public API_. Returns `Ok(())` on a
/// successful flush, or the underlying error (e.g. file lock or I/O
/// failure) otherwise, leaving retry/logging policy to the caller.
pub fn run_bgsave(handle: &CoreDB) -> TResult<()> {
_bgsave_blocking_section(handle)
}
/// This just wraps around [`_bgsave_blocking_section`] and prints nice log messages depending on the outcome
fn bgsave_blocking_section(handle: CoreDB) -> bool {
match _bgsave_blocking_section(&handle) {
@ -115,7 +122,7 @@ fn bgsave_blocking_section(handle: CoreDB) -> bool {
true
}
Err(e) => {
log::info!("BGSAVE failed with error: {}", e);
log::error!("BGSAVE failed with error: {}", e);
handle.poison();
false
}

Loading…
Cancel
Save