From 10e1f50d78db2bd340a9a3c6f9f6958da674a9ad Mon Sep 17 00:00:00 2001 From: Sayan Nandan Date: Mon, 4 Mar 2024 11:47:14 +0530 Subject: [PATCH] Fix system health reporting --- CHANGELOG.md | 7 ++++ server/src/engine/core/dcl.rs | 8 ++++- server/src/engine/core/space.rs | 8 +++-- server/src/engine/fractal/drivers.rs | 3 +- server/src/engine/fractal/mgr.rs | 2 ++ server/src/engine/fractal/mod.rs | 45 +++++++++++++++++++++++-- server/src/engine/fractal/test_utils.rs | 7 +++- 7 files changed, 73 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a18924bd..708eedfb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ All changes in this project will be noted in this file. +## Version 0.8.1 + +### Fixes + +- Fixed migration from v1 SE (released with v0.8.0-beta) to v2 SE (released in v0.8.0) +- Fixed health reporting + ## Version 0.8.0 > This is the first release of Skytable Octave, and it changes the query API entirely making all previous versions incompatible diff --git a/server/src/engine/core/dcl.rs b/server/src/engine/core/dcl.rs index 31c3d511..bac0c3e7 100644 --- a/server/src/engine/core/dcl.rs +++ b/server/src/engine/core/dcl.rs @@ -46,7 +46,13 @@ pub fn exec( SysctlCommand::CreateUser(new) => create_user(&g, new), SysctlCommand::DropUser(drop) => drop_user(&g, current_user, drop), SysctlCommand::AlterUser(usermod) => alter_user(&g, current_user, usermod), - SysctlCommand::ReportStatus => Ok(()), + SysctlCommand::ReportStatus => { + if g.health().status_okay() { + Ok(()) + } else { + Err(QueryError::SysServerError) + } + } } } diff --git a/server/src/engine/core/space.rs b/server/src/engine/core/space.rs index 4f905379..30c3174e 100644 --- a/server/src/engine/core/space.rs +++ b/server/src/engine/core/space.rs @@ -271,9 +271,13 @@ impl Space { // UNSAFE(@ohsayan): I want to try what the borrow checker has been trying core::mem::transmute(EntityIDRef::new(space_name.as_str(), &model)) }; - models.st_delete(&e); + let mdl = models.st_delete_return(&e).unwrap(); // no need to purge model drive since the dir itself is deleted. our work here is to just - // remove this from the linked models from the model ns + // remove this from the linked models from the model ns. but we should update the global state + if mdl.driver().status().is_iffy() { + // yes this driver had a fault but it's being purged anyway so update global status + global.health().report_removal_of_faulty_source(); + } } let _ = spaces.st_delete(space_name.as_str()); if if_exists { diff --git a/server/src/engine/fractal/drivers.rs b/server/src/engine/fractal/drivers.rs index e22e8881..560d5c8a 100644 --- a/server/src/engine/fractal/drivers.rs +++ b/server/src/engine/fractal/drivers.rs @@ -67,10 +67,11 @@ impl FractalGNSDriver { match f(&mut txn_driver) { Ok(v) => Ok(v), Err(e) => compiler::cold_call(|| { - error!("GNS driver failed with: {e}"); self.status.set_iffy(); + g.health().report_fault(); on_failure(); g.taskmgr_post_high_priority(Task::new(CriticalTask::CheckGNSDriver)); + error!("GNS driver failed with: {e}"); Err(QueryError::SysServerError) }), } diff --git a/server/src/engine/fractal/mgr.rs b/server/src/engine/fractal/mgr.rs index c699d137..71fb33ce 100644 --- a/server/src/engine/fractal/mgr.rs +++ b/server/src/engine/fractal/mgr.rs @@ -333,6 +333,7 @@ impl FractalMgr { Ok(()) => { info!("GNS driver has been successfully auto-recovered"); global.state().gns_driver().status().set_okay(); + global.health().report_recovery(); } Err(e) => { error!("failed to autorecover GNS driver with error `{e}`. will try again"); @@ -357,6 +358,7 @@ impl FractalMgr { match drv.__lwt_heartbeat() { Ok(()) => { mdl.driver().status().set_okay(); + global.health().report_recovery(); info!("model driver for {mdl_id} has been successfully auto-recovered"); } Err(e) => { diff --git a/server/src/engine/fractal/mod.rs b/server/src/engine/fractal/mod.rs index eeb6b0cc..f153ab75 100644 --- a/server/src/engine/fractal/mod.rs +++ b/server/src/engine/fractal/mod.rs @@ -34,7 +34,11 @@ use { }, }, crate::{engine::error::RuntimeResult, util::compiler}, - std::{fmt, mem::MaybeUninit}, + std::{ + fmt, + mem::MaybeUninit, + sync::atomic::{AtomicUsize, Ordering}, + }, tokio::sync::mpsc::unbounded_channel, }; @@ -87,9 +91,34 @@ pub unsafe fn load_and_enable_all(gns: GlobalNS) -> GlobalStateStart { global access */ +pub struct GlobalHealth { + faults: AtomicUsize, +} + +impl GlobalHealth { + pub fn status_okay(&self) -> bool { + self.faults.load(Ordering::Acquire) == 0 + } + const fn new() -> Self { + Self { + faults: AtomicUsize::new(0), + } + } + fn report_fault(&self) { + self.faults.fetch_add(1, Ordering::Release); + } + fn report_recovery(&self) { + self.faults.fetch_sub(1, Ordering::Release); + } + pub fn report_removal_of_faulty_source(&self) { + self.report_recovery() + } +} + /// Something that represents the global state pub trait GlobalInstanceLike { // stat + fn health(&self) -> &GlobalHealth; fn get_max_delta_size(&self) -> usize; // global namespace fn state(&self) -> &GlobalNS; @@ -150,6 +179,13 @@ impl GlobalInstanceLike for Global { fn state(&self) -> &GlobalNS { self._namespace() } + fn health(&self) -> &GlobalHealth { + &unsafe { + // UNSAFE(@ohsayan): we expect the system to be initialized + self.__gref() + } + .health + } // taskmgr fn taskmgr_post_high_priority(&self, task: Task) { self._post_high_priority_task(task) @@ -258,11 +294,16 @@ impl Global { struct GlobalState { gns: GlobalNS, task_mgr: mgr::FractalMgr, + health: GlobalHealth, } impl GlobalState { fn new(gns: GlobalNS, task_mgr: mgr::FractalMgr) -> Self { - Self { gns, task_mgr } + Self { + gns, + task_mgr, + health: GlobalHealth::new(), + } } pub(self) fn fractal_mgr(&self) -> &mgr::FractalMgr { &self.task_mgr diff --git a/server/src/engine/fractal/test_utils.rs b/server/src/engine/fractal/test_utils.rs index 284f7a97..f2572e0b 100644 --- a/server/src/engine/fractal/test_utils.rs +++ b/server/src/engine/fractal/test_utils.rs @@ -26,7 +26,7 @@ use { super::{ - drivers::FractalGNSDriver, CriticalTask, FractalModelDriver, GenericTask, + drivers::FractalGNSDriver, CriticalTask, FractalModelDriver, GenericTask, GlobalHealth, GlobalInstanceLike, Task, }, crate::engine::{ @@ -47,6 +47,7 @@ pub struct TestGlobal { gns: GlobalNS, lp_queue: RwLock>>, max_delta_size: usize, + health: GlobalHealth, } impl TestGlobal { @@ -55,6 +56,7 @@ impl TestGlobal { gns, lp_queue: RwLock::default(), max_delta_size: usize::MAX, + health: GlobalHealth::new(), } } pub fn set_max_data_pressure(&mut self, max_data_pressure: usize) { @@ -110,6 +112,9 @@ impl TestGlobal { } impl GlobalInstanceLike for TestGlobal { + fn health(&self) -> &GlobalHealth { + &self.health + } fn state(&self) -> &GlobalNS { &self.gns }