Fix infinite loop on save failure during termination

next
Sayan Nandan 3 years ago
parent e6ae290334
commit 33b0693fcb

@@ -13,6 +13,8 @@ All changes in this project will be noted in this file.
### Fixes
- Fixed infinite wait (loop) when sample space for key generation is not large enough
- Fixed infinite save-on-termination loop: the loop now uses a threshold and quits forcefully after 4
subsequent `SIGINT` signals
- Removed the `upgrade` subcommand from `skyd`: it was removed in 0.7 but was still erroneously accepted
among the CLI parameters
- Restored ability to use `--restore <backupdir>` to restore data from previous snapshots which was silently
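For context, here is a minimal, self-contained sketch of the signal-threshold pattern this fix introduces (constant and message names are illustrative; the real implementation lives in the arbiter hunk below):

```rust
use tokio::signal::ctrl_c;

// Illustrative stand-in for the real TERMSIG_THRESHOLD = 3 shown below.
const EXTRA_SIGINTS: usize = 3;

#[tokio::main]
async fn main() {
    let mut remaining = EXTRA_SIGINTS;
    loop {
        // In the real server, this ctrl_c() races against a background
        // save task inside tokio::select!; here we model only the signals.
        ctrl_c().await.expect("failed to listen for SIGINT");
        if remaining == 0 {
            eprintln!("threshold exceeded, quitting forcefully");
            break;
        }
        eprintln!("save still failing; {remaining} more SIGINT(s) until forced quit");
        remaining -= 1;
    }
}
```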

@@ -29,10 +29,23 @@ use crate::config::SnapshotConfig;
use crate::config::SnapshotPref;
use crate::corestore::Corestore;
use crate::dbnet::{self, Terminator};
use crate::diskstore::flock::FileLock;
use crate::services;
use crate::storage::sengine::SnapshotEngine;
use libsky::util::terminal;
use std::sync::Arc;
use tokio::sync::broadcast;
use std::thread::sleep;
use tokio::{
    signal::ctrl_c,
    sync::{
        broadcast,
        mpsc::{self, Sender},
    },
    task::{self, JoinHandle},
    time::Duration,
};
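// extra SIGINTs tolerated while the final save keeps failing: the fourth
// consecutive `SIGINT` exhausts the threshold and forces a quit (see changelog)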
const TERMSIG_THRESHOLD: usize = 3;
#[cfg(unix)]
use core::{future::Future, pin::Pin, task::Context, task::Poll};
@@ -145,3 +158,69 @@ pub async fn run(
    let _ = bgsave_handle.await;
    Ok(db)
}
fn spawn_task(tx: Sender<bool>, db: Corestore, do_sleep: bool) -> JoinHandle<()> {
    task::spawn_blocking(move || {
        if do_sleep {
            log::info!("Waiting for 10 seconds before retrying ...");
            sleep(Duration::from_secs(10));
        }
        let ret = match crate::services::bgsave::run_bgsave(&db) {
            Ok(()) => true,
            Err(e) => {
                log::error!("Failed to run save on termination: {e}");
                false
            }
        };
        // hand the outcome back to the async shutdown loop
        tx.blocking_send(ret).expect("Receiver dropped");
    })
}
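`spawn_task` bridges the synchronous `run_bgsave` call into the async shutdown loop. A stripped-down sketch of that bridge, assuming a plain `bool` result (the closure body is a stand-in, not the real save routine):

```rust
use tokio::{sync::mpsc, task};

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<bool>(1);
    // spawn_blocking moves the closure onto the blocking thread pool so a
    // synchronous save cannot stall the async runtime; blocking_send is the
    // non-async way to hand the result back over the channel.
    task::spawn_blocking(move || {
        let saved = true; // stand-in for a synchronous save routine
        tx.blocking_send(saved).expect("receiver dropped");
    });
    assert_eq!(rx.recv().await, Some(true));
}
```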
pub fn finalize_shutdown(corestore: Corestore, pid_file: FileLock) {
    let rt = tokio::runtime::Builder::new_multi_thread()
        .thread_name("server-final")
        .enable_all()
        .build()
        .unwrap();
    let dbc = corestore.clone();
    let mut okay: bool = rt.block_on(async move {
        let db = dbc;
        let (tx, mut rx) = mpsc::channel::<bool>(1);
        spawn_task(tx.clone(), db.clone(), false);
        let mut threshold = TERMSIG_THRESHOLD;
        loop {
            tokio::select! {
                ret = rx.recv() => {
                    if ret.unwrap() {
                        // the save went through; we're good to go
                        log::info!("Save before termination successful");
                        break true;
                    } else {
                        let txc = tx.clone();
                        let dbc = db.clone();
                        // the save failed, so sleep and then spawn the task again
                        spawn_task(txc, dbc, true);
                    }
                }
                _ = ctrl_c() => {
                    if threshold == 0 {
                        log::error!("SIGINT received but failed to flush data. Quitting because threshold exceeded");
                        break false;
                    } else {
                        log::error!("SIGINT received but failed to flush data. Threshold is at {threshold}");
                        threshold -= 1;
                        continue;
                    }
                }
            }
        }
    });
    okay &= services::pre_shutdown_cleanup(pid_file, Some(corestore.get_store()));
    if okay {
        terminal::write_success("Goodbye :)").unwrap()
    } else {
        log::error!("Didn't terminate successfully");
        crate::exit_error();
    }
}
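Note that `finalize_shutdown` builds a fresh `server-final` runtime and drives the whole retry loop with `block_on`, presumably because it runs after the main server runtime has wound down; inside it, `tokio::select!` races the save task's channel against `ctrl_c()`, which is what lets the threshold logic interrupt an endlessly failing save.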

@@ -157,9 +157,6 @@ pub enum DdlError {
pub struct Memstore {
    /// the keyspaces
    pub keyspaces: Coremap<ObjectID, Arc<Keyspace>>,
    /// the snapshot configuration
    /// A **virtual lock** on the preload file
    preload_lock: QuickLock<()>,
}
impl Memstore {
@@ -167,14 +164,10 @@ impl Memstore {
    pub fn new_empty() -> Self {
        Self {
            keyspaces: Coremap::new(),
            preload_lock: QuickLock::new(()),
        }
    }
    pub fn init_with_all(keyspaces: Coremap<ObjectID, Arc<Keyspace>>) -> Self {
        Self {
            keyspaces,
            preload_lock: QuickLock::new(()),
        }
        Self { keyspaces }
    }
    /// Create a new in-memory table with the default keyspace and the default
    /// tables. So, whenever you're calling this, this is what you get:
@@ -202,7 +195,6 @@ impl Memstore {
                n.true_if_insert(SYSTEM, Arc::new(Keyspace::empty()));
                n
            },
            preload_lock: QuickLock::new(()),
        }
    }
    /// Get an atomic reference to a keyspace
@@ -308,6 +300,7 @@ pub struct Keyspace {
    /// the tables
    pub tables: Coremap<ObjectID, Arc<Table>>,
    /// the replication strategy for this keyspace
    #[allow(dead_code)] // TODO: Remove this once we're ready with replication
    replication_strategy: cluster::ReplicationStrategy,
    /// A **virtual lock** on the partmap for this keyspace
    partmap_lock: QuickLock<()>,

@@ -34,16 +34,13 @@
//! is the most important part of the project. There are several modules within this crate; see
//! the modules for their respective documentation.
use crate::corestore::memstore::Memstore;
use crate::diskstore::flock::FileLock;
pub use crate::util::exit_error;
use env_logger::Builder;
use libsky::util::terminal;
use libsky::URL;
use libsky::VERSION;
use std::env;
use std::process;
use std::thread;
use std::time;
#[macro_use]
mod util;
mod actions;
@@ -109,48 +106,12 @@ fn main() {
        Err(e) => {
            // uh oh, something happened while starting up
            log::error!("{}", e);
            pre_shutdown_cleanup(pid_file, None);
            services::pre_shutdown_cleanup(pid_file, None);
            process::exit(1);
        }
    };
    assert_eq!(
        db.strong_count(),
        1,
        "Maybe the compiler reordered the drop causing more than one instance of Corestore to live at this point"
    );
    log::info!("Stopped accepting incoming connections");
    loop {
        // Keep looping until we successfully write the in-memory table to disk
        match services::bgsave::run_bgsave(&db) {
            Ok(_) => {
                log::info!("Successfully saved data to disk");
                break;
            }
            Err(e) => {
                log::error!(
                    "Failed to write data with error '{}'. Attempting to retry in 10s",
                    e
                );
            }
        }
        thread::sleep(time::Duration::from_secs(10));
    }
    pre_shutdown_cleanup(pid_file, Some(db.get_store()));
    terminal::write_info("Goodbye :)\n").unwrap();
}
pub fn pre_shutdown_cleanup(mut pid_file: FileLock, mr: Option<&Memstore>) {
    if let Err(e) = pid_file.unlock() {
        log::error!("Shutdown failure: Failed to unlock pid file: {}", e);
        process::exit(0x01);
    }
    if let Some(mr) = mr {
        log::info!("Compacting tree");
        if let Err(e) = storage::interface::cleanup_tree(mr) {
            log::error!("Failed to compact tree: {}", e);
            process::exit(0x01);
        }
    }
    arbiter::finalize_shutdown(db, pid_file);
}
use self::config::ConfigurationSet;

@@ -26,6 +26,9 @@
pub mod bgsave;
pub mod snapshot;
use crate::corestore::memstore::Memstore;
use crate::diskstore::flock::FileLock;
use crate::storage;
use crate::util::os;
use crate::IoResult;
@@ -37,3 +40,18 @@ pub fn restore_data(src: Option<String>) -> IoResult<()> {
    }
    Ok(())
}
pub fn pre_shutdown_cleanup(mut pid_file: FileLock, mr: Option<&Memstore>) -> bool {
    if let Err(e) = pid_file.unlock() {
        log::error!("Shutdown failure: Failed to unlock pid file: {}", e);
        return false;
    }
    if let Some(mr) = mr {
        log::info!("Compacting tree");
        if let Err(e) = storage::interface::cleanup_tree(mr) {
            log::error!("Failed to compact tree: {}", e);
            return false;
        }
    }
    true
}
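Unlike the version deleted from `main.rs` above, `pre_shutdown_cleanup` now reports failure through its `bool` return value instead of calling `process::exit` directly, so `finalize_shutdown` can fold the result into its `okay` flag and keep the exit decision in one place.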

@@ -29,8 +29,8 @@ use super::interface::DIR_SNAPROOT;
use crate::corestore::iarray::IArray;
use crate::corestore::lazy::Lazy;
use crate::corestore::lock::QuickLock;
use crate::corestore::memstore::Memstore;
use crate::storage::interface::DIR_RSNAPROOT;
use crate::Memstore;
use bytes::Bytes;
use chrono::prelude::Utc;
use core::fmt;

@@ -53,6 +53,7 @@ pub const fn cold_err<T>(v: T) -> T {
    v
}
#[inline(always)]
#[allow(unused)]
pub const fn hot<T>(v: T) -> T {
    if false {
        cold()
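The `cold`/`hot` helpers are the usual stable-Rust emulation of likely/unlikely branch hints: calling a `#[cold]` function in one branch nudges the optimizer to lay out the other branch as the fast path. A sketch of the general pattern (not the exact Skytable code):

```rust
#[cold]
#[inline(never)]
fn cold() {}

// Wrap the unlikely result: the #[cold] call marks this branch as slow-path.
fn cold_err<T>(v: T) -> T {
    cold();
    v
}

fn checked_div(a: u32, b: u32) -> Result<u32, &'static str> {
    if b == 0 {
        // the error path is expected to be rare
        return Err(cold_err("division by zero"));
    }
    Ok(a / b)
}

fn main() {
    assert_eq!(checked_div(10, 2), Ok(5));
    assert!(checked_div(1, 0).is_err());
}
```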

@@ -28,6 +28,7 @@
mod macros;
pub mod compiler;
pub mod os;
use std::process;
/// # Unsafe unwrapping
///
@@ -62,3 +63,7 @@ unsafe impl<T> Unwrappable<T> for Option<T> {
        }
    }
}
pub fn exit_error() -> ! {
    process::exit(0x01)
}
