Fix infinite loop on save failure during termination

next
Sayan Nandan 3 years ago
parent e6ae290334
commit 33b0693fcb

@@ -13,6 +13,8 @@ All changes in this project will be noted in this file.
### Fixes
- Fixed infinite wait (loop) when sample space for key generation is not large enough
- Fixed infinite save-on-termination loop: the loop now uses a threshold and quits forcefully after 4
subsequent `SIGINT` signals
- Removed the `upgrade` subcommand from `skyd`: it was removed in 0.7 but was still erroneously accepted
among the CLI parameters
- Restored ability to use `--restore <backupdir>` to restore data from previous snapshots which was silently
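For context, here is a minimal, self-contained sketch of the signal-threshold pattern this fix introduces (constant and message names are illustrative; the real implementation lives in the arbiter hunk below):

```rust
use tokio::signal::ctrl_c;

// Illustrative stand-in for the real TERMSIG_THRESHOLD = 3 shown below.
const EXTRA_SIGINTS: usize = 3;

#[tokio::main]
async fn main() {
    let mut remaining = EXTRA_SIGINTS;
    loop {
        // In the real server, this ctrl_c() races against a background
        // save task inside tokio::select!; here we model only the signals.
        ctrl_c().await.expect("failed to listen for SIGINT");
        if remaining == 0 {
            eprintln!("threshold exceeded, quitting forcefully");
            break;
        }
        eprintln!("save still failing; {remaining} more SIGINT(s) until forced quit");
        remaining -= 1;
    }
}
```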

@@ -29,10 +29,23 @@ use crate::config::SnapshotConfig;
use crate::config::SnapshotPref;
use crate::corestore::Corestore;
use crate::dbnet::{self, Terminator};
use crate::diskstore::flock::FileLock;
use crate::services;
use crate::storage::sengine::SnapshotEngine;
use libsky::util::terminal;
use std::sync::Arc;
use tokio::sync::broadcast;
use std::thread::sleep;
use tokio::{
    signal::ctrl_c,
    sync::{
        broadcast,
        mpsc::{self, Sender},
    },
    task::{self, JoinHandle},
    time::Duration,
};
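// extra SIGINTs tolerated while the final save keeps failing: the fourth
// consecutive `SIGINT` exhausts the threshold and forces a quit (see changelog)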
const TERMSIG_THRESHOLD: usize = 3;
#[cfg(unix)]
use core::{future::Future, pin::Pin, task::Context, task::Poll};
@@ -145,3 +158,69 @@ pub async fn run(
    let _ = bgsave_handle.await;
    Ok(db)
}
fn spawn_task(tx: Sender<bool>, db: Corestore, do_sleep: bool) -> JoinHandle<()> {
    task::spawn_blocking(move || {
        if do_sleep {
            log::info!("Waiting for 10 seconds before retrying ...");
            sleep(Duration::from_secs(10));
        }
        let ret = match crate::services::bgsave::run_bgsave(&db) {
            Ok(()) => true,
            Err(e) => {
                log::error!("Failed to run save on termination: {e}");
                false
            }
        };
        // hand the outcome back to the async shutdown loop
        tx.blocking_send(ret).expect("Receiver dropped");
    })
}
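`spawn_task` bridges the synchronous `run_bgsave` call into the async shutdown loop. A stripped-down sketch of that bridge, assuming a plain `bool` result (the closure body is a stand-in, not the real save routine):

```rust
use tokio::{sync::mpsc, task};

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<bool>(1);
    // spawn_blocking moves the closure onto the blocking thread pool so a
    // synchronous save cannot stall the async runtime; blocking_send is the
    // non-async way to hand the result back over the channel.
    task::spawn_blocking(move || {
        let saved = true; // stand-in for a synchronous save routine
        tx.blocking_send(saved).expect("receiver dropped");
    });
    assert_eq!(rx.recv().await, Some(true));
}
```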
pub fn finalize_shutdown(corestore: Corestore, pid_file: FileLock) {
    let rt = tokio::runtime::Builder::new_multi_thread()
        .thread_name("server-final")
        .enable_all()
        .build()
        .unwrap();
    let dbc = corestore.clone();
    let mut okay: bool = rt.block_on(async move {
        let db = dbc;
        let (tx, mut rx) = mpsc::channel::<bool>(1);
        spawn_task(tx.clone(), db.clone(), false);
        let mut threshold = TERMSIG_THRESHOLD;
        loop {
            tokio::select! {
                ret = rx.recv() => {
                    if ret.unwrap() {
                        // the save went through; we're good to go
                        log::info!("Save before termination successful");
                        break true;
                    } else {
                        let txc = tx.clone();
                        let dbc = db.clone();
                        // the save failed, so sleep and then spawn the task again
                        spawn_task(txc, dbc, true);
                    }
                }
                _ = ctrl_c() => {
                    if threshold == 0 {
                        log::error!("SIGINT received but failed to flush data. Quitting because threshold exceeded");
                        break false;
                    } else {
                        log::error!("SIGINT received but failed to flush data. Threshold is at {threshold}");
                        threshold -= 1;
                        continue;
                    }
                }
            }
        }
    });
    okay &= services::pre_shutdown_cleanup(pid_file, Some(corestore.get_store()));
    if okay {
        terminal::write_success("Goodbye :)").unwrap()
    } else {
        log::error!("Didn't terminate successfully");
        crate::exit_error();
    }
}
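Note that `finalize_shutdown` builds a fresh `server-final` runtime and drives the whole retry loop with `block_on`, presumably because it runs after the main server runtime has wound down; inside it, `tokio::select!` races the save task's channel against `ctrl_c()`, which is what lets the threshold logic interrupt an endlessly failing save.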

@@ -157,9 +157,6 @@ pub enum DdlError {
pub struct Memstore {
    /// the keyspaces
    pub keyspaces: Coremap<ObjectID, Arc<Keyspace>>,
    /// the snapshot configuration
    /// A **virtual lock** on the preload file
    preload_lock: QuickLock<()>,
}
impl Memstore {
@@ -167,14 +164,10 @@ impl Memstore {
    pub fn new_empty() -> Self {
        Self {
            keyspaces: Coremap::new(),
            preload_lock: QuickLock::new(()),
        }
    }
    pub fn init_with_all(keyspaces: Coremap<ObjectID, Arc<Keyspace>>) -> Self {
        Self {
            keyspaces,
            preload_lock: QuickLock::new(()),
        }
        Self { keyspaces }
    }
    /// Create a new in-memory table with the default keyspace and the default
    /// tables. So, whenever you're calling this, this is what you get:
@@ -202,7 +195,6 @@ impl Memstore {
                n.true_if_insert(SYSTEM, Arc::new(Keyspace::empty()));
                n
            },
            preload_lock: QuickLock::new(()),
        }
    }
    /// Get an atomic reference to a keyspace
@@ -308,6 +300,7 @@ pub struct Keyspace {
    /// the tables
    pub tables: Coremap<ObjectID, Arc<Table>>,
    /// the replication strategy for this keyspace
    #[allow(dead_code)] // TODO: Remove this once we're ready with replication
    replication_strategy: cluster::ReplicationStrategy,
    /// A **virtual lock** on the partmap for this keyspace
    partmap_lock: QuickLock<()>,

@@ -34,16 +34,13 @@
//! is the most important part of the project. There are several modules within this crate; see
//! the modules for their respective documentation.
use crate::corestore::memstore::Memstore;
use crate::diskstore::flock::FileLock;
pub use crate::util::exit_error;
use env_logger::Builder;
use libsky::util::terminal;
use libsky::URL;
use libsky::VERSION;
use std::env;
use std::process;
use std::thread;
use std::time;
#[macro_use]
mod util;
mod actions;
@@ -109,48 +106,12 @@ fn main() {
        Err(e) => {
            // uh oh, something happened while starting up
            log::error!("{}", e);
            pre_shutdown_cleanup(pid_file, None);
            services::pre_shutdown_cleanup(pid_file, None);
            process::exit(1);
        }
    };
    assert_eq!(
        db.strong_count(),
        1,
        "Maybe the compiler reordered the drop causing more than one instance of Corestore to live at this point"
    );
    log::info!("Stopped accepting incoming connections");
    loop {
        // Keep looping until we successfully write the in-memory table to disk
        match services::bgsave::run_bgsave(&db) {
            Ok(_) => {
                log::info!("Successfully saved data to disk");
                break;
            }
            Err(e) => {
                log::error!(
                    "Failed to write data with error '{}'. Attempting to retry in 10s",
                    e
                );
            }
        }
        thread::sleep(time::Duration::from_secs(10));
    }
    pre_shutdown_cleanup(pid_file, Some(db.get_store()));
    terminal::write_info("Goodbye :)\n").unwrap();
}
pub fn pre_shutdown_cleanup(mut pid_file: FileLock, mr: Option<&Memstore>) {
    if let Err(e) = pid_file.unlock() {
        log::error!("Shutdown failure: Failed to unlock pid file: {}", e);
        process::exit(0x01);
    }
    if let Some(mr) = mr {
        log::info!("Compacting tree");
        if let Err(e) = storage::interface::cleanup_tree(mr) {
            log::error!("Failed to compact tree: {}", e);
            process::exit(0x01);
        }
    }
    arbiter::finalize_shutdown(db, pid_file);
}
use self::config::ConfigurationSet;

@@ -26,6 +26,9 @@
pub mod bgsave;
pub mod snapshot;
use crate::corestore::memstore::Memstore;
use crate::diskstore::flock::FileLock;
use crate::storage;
use crate::util::os;
use crate::IoResult;
@@ -37,3 +40,18 @@ pub fn restore_data(src: Option<String>) -> IoResult<()> {
    }
    Ok(())
}
pub fn pre_shutdown_cleanup(mut pid_file: FileLock, mr: Option<&Memstore>) -> bool {
    if let Err(e) = pid_file.unlock() {
        log::error!("Shutdown failure: Failed to unlock pid file: {}", e);
        return false;
    }
    if let Some(mr) = mr {
        log::info!("Compacting tree");
        if let Err(e) = storage::interface::cleanup_tree(mr) {
            log::error!("Failed to compact tree: {}", e);
            return false;
        }
    }
    true
}
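Unlike the version deleted from `main.rs` above, `pre_shutdown_cleanup` now reports failure through its `bool` return value instead of calling `process::exit` directly, so `finalize_shutdown` can fold the result into its `okay` flag and keep the exit decision in one place.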

@@ -29,8 +29,8 @@ use super::interface::DIR_SNAPROOT;
use crate::corestore::iarray::IArray;
use crate::corestore::lazy::Lazy;
use crate::corestore::lock::QuickLock;
use crate::corestore::memstore::Memstore;
use crate::storage::interface::DIR_RSNAPROOT;
use crate::Memstore;
use bytes::Bytes;
use chrono::prelude::Utc;
use core::fmt;

@@ -53,6 +53,7 @@ pub const fn cold_err<T>(v: T) -> T {
    v
}
#[inline(always)]
#[allow(unused)]
pub const fn hot<T>(v: T) -> T {
    if false {
        cold()
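The `cold`/`hot` helpers are the usual stable-Rust emulation of likely/unlikely branch hints: calling a `#[cold]` function in one branch nudges the optimizer to lay out the other branch as the fast path. A sketch of the general pattern (not the exact Skytable code):

```rust
#[cold]
#[inline(never)]
fn cold() {}

// Wrap the unlikely result: the #[cold] call marks this branch as slow-path.
fn cold_err<T>(v: T) -> T {
    cold();
    v
}

fn checked_div(a: u32, b: u32) -> Result<u32, &'static str> {
    if b == 0 {
        // the error path is expected to be rare
        return Err(cold_err("division by zero"));
    }
    Ok(a / b)
}

fn main() {
    assert_eq!(checked_div(10, 2), Ok(5));
    assert!(checked_div(1, 0).is_err());
}
```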

@@ -28,6 +28,7 @@
mod macros;
pub mod compiler;
pub mod os;
use std::process;
/// # Unsafe unwrapping
///
@@ -62,3 +63,7 @@ unsafe impl<T> Unwrappable<T> for Option<T> {
        }
    }
}
pub fn exit_error() -> ! {
    process::exit(0x01)
}
