Fix system health reporting

next
Sayan Nandan 7 months ago
parent 310933099b
commit 10e1f50d78
No known key found for this signature in database
GPG Key ID: 0EBD769024B24F0A

@ -2,6 +2,13 @@
All changes in this project will be noted in this file. All changes in this project will be noted in this file.
## Version 0.8.1
### Fixes
- Fixed migration from v1 SE (released with v0.8.0-beta) to v2 SE (released in v0.8.0)
- Fixed health reporting
## Version 0.8.0 ## Version 0.8.0
> This is the first release of Skytable Octave, and it changes the query API entirely making all previous versions incompatible > This is the first release of Skytable Octave, and it changes the query API entirely making all previous versions incompatible

@ -46,7 +46,13 @@ pub fn exec<G: GlobalInstanceLike>(
SysctlCommand::CreateUser(new) => create_user(&g, new), SysctlCommand::CreateUser(new) => create_user(&g, new),
SysctlCommand::DropUser(drop) => drop_user(&g, current_user, drop), SysctlCommand::DropUser(drop) => drop_user(&g, current_user, drop),
SysctlCommand::AlterUser(usermod) => alter_user(&g, current_user, usermod), SysctlCommand::AlterUser(usermod) => alter_user(&g, current_user, usermod),
SysctlCommand::ReportStatus => Ok(()), SysctlCommand::ReportStatus => {
if g.health().status_okay() {
Ok(())
} else {
Err(QueryError::SysServerError)
}
}
} }
} }

@ -271,9 +271,13 @@ impl Space {
// UNSAFE(@ohsayan): I want to try what the borrow checker has been trying // UNSAFE(@ohsayan): I want to try what the borrow checker has been trying
core::mem::transmute(EntityIDRef::new(space_name.as_str(), &model)) core::mem::transmute(EntityIDRef::new(space_name.as_str(), &model))
}; };
models.st_delete(&e); let mdl = models.st_delete_return(&e).unwrap();
// no need to purge model drive since the dir itself is deleted. our work here is to just // no need to purge model drive since the dir itself is deleted. our work here is to just
// remove this from the linked models from the model ns // remove this from the linked models from the model ns. but we should update the global state
if mdl.driver().status().is_iffy() {
// yes this driver had a fault but it's being purged anyway so update global status
global.health().report_removal_of_faulty_source();
}
} }
let _ = spaces.st_delete(space_name.as_str()); let _ = spaces.st_delete(space_name.as_str());
if if_exists { if if_exists {

@ -67,10 +67,11 @@ impl FractalGNSDriver {
match f(&mut txn_driver) { match f(&mut txn_driver) {
Ok(v) => Ok(v), Ok(v) => Ok(v),
Err(e) => compiler::cold_call(|| { Err(e) => compiler::cold_call(|| {
error!("GNS driver failed with: {e}");
self.status.set_iffy(); self.status.set_iffy();
g.health().report_fault();
on_failure(); on_failure();
g.taskmgr_post_high_priority(Task::new(CriticalTask::CheckGNSDriver)); g.taskmgr_post_high_priority(Task::new(CriticalTask::CheckGNSDriver));
error!("GNS driver failed with: {e}");
Err(QueryError::SysServerError) Err(QueryError::SysServerError)
}), }),
} }

@ -333,6 +333,7 @@ impl FractalMgr {
Ok(()) => { Ok(()) => {
info!("GNS driver has been successfully auto-recovered"); info!("GNS driver has been successfully auto-recovered");
global.state().gns_driver().status().set_okay(); global.state().gns_driver().status().set_okay();
global.health().report_recovery();
} }
Err(e) => { Err(e) => {
error!("failed to autorecover GNS driver with error `{e}`. will try again"); error!("failed to autorecover GNS driver with error `{e}`. will try again");
@ -357,6 +358,7 @@ impl FractalMgr {
match drv.__lwt_heartbeat() { match drv.__lwt_heartbeat() {
Ok(()) => { Ok(()) => {
mdl.driver().status().set_okay(); mdl.driver().status().set_okay();
global.health().report_recovery();
info!("model driver for {mdl_id} has been successfully auto-recovered"); info!("model driver for {mdl_id} has been successfully auto-recovered");
} }
Err(e) => { Err(e) => {

@ -34,7 +34,11 @@ use {
}, },
}, },
crate::{engine::error::RuntimeResult, util::compiler}, crate::{engine::error::RuntimeResult, util::compiler},
std::{fmt, mem::MaybeUninit}, std::{
fmt,
mem::MaybeUninit,
sync::atomic::{AtomicUsize, Ordering},
},
tokio::sync::mpsc::unbounded_channel, tokio::sync::mpsc::unbounded_channel,
}; };
@ -87,9 +91,34 @@ pub unsafe fn load_and_enable_all(gns: GlobalNS) -> GlobalStateStart {
global access global access
*/ */
pub struct GlobalHealth {
faults: AtomicUsize,
}
impl GlobalHealth {
pub fn status_okay(&self) -> bool {
self.faults.load(Ordering::Acquire) == 0
}
const fn new() -> Self {
Self {
faults: AtomicUsize::new(0),
}
}
fn report_fault(&self) {
self.faults.fetch_add(1, Ordering::Release);
}
fn report_recovery(&self) {
self.faults.fetch_sub(1, Ordering::Release);
}
pub fn report_removal_of_faulty_source(&self) {
self.report_recovery()
}
}
/// Something that represents the global state /// Something that represents the global state
pub trait GlobalInstanceLike { pub trait GlobalInstanceLike {
// stat // stat
fn health(&self) -> &GlobalHealth;
fn get_max_delta_size(&self) -> usize; fn get_max_delta_size(&self) -> usize;
// global namespace // global namespace
fn state(&self) -> &GlobalNS; fn state(&self) -> &GlobalNS;
@ -150,6 +179,13 @@ impl GlobalInstanceLike for Global {
fn state(&self) -> &GlobalNS { fn state(&self) -> &GlobalNS {
self._namespace() self._namespace()
} }
fn health(&self) -> &GlobalHealth {
&unsafe {
// UNSAFE(@ohsayan): we expect the system to be initialized
self.__gref()
}
.health
}
// taskmgr // taskmgr
fn taskmgr_post_high_priority(&self, task: Task<CriticalTask>) { fn taskmgr_post_high_priority(&self, task: Task<CriticalTask>) {
self._post_high_priority_task(task) self._post_high_priority_task(task)
@ -258,11 +294,16 @@ impl Global {
struct GlobalState { struct GlobalState {
gns: GlobalNS, gns: GlobalNS,
task_mgr: mgr::FractalMgr, task_mgr: mgr::FractalMgr,
health: GlobalHealth,
} }
impl GlobalState { impl GlobalState {
fn new(gns: GlobalNS, task_mgr: mgr::FractalMgr) -> Self { fn new(gns: GlobalNS, task_mgr: mgr::FractalMgr) -> Self {
Self { gns, task_mgr } Self {
gns,
task_mgr,
health: GlobalHealth::new(),
}
} }
pub(self) fn fractal_mgr(&self) -> &mgr::FractalMgr { pub(self) fn fractal_mgr(&self) -> &mgr::FractalMgr {
&self.task_mgr &self.task_mgr

@ -26,7 +26,7 @@
use { use {
super::{ super::{
drivers::FractalGNSDriver, CriticalTask, FractalModelDriver, GenericTask, drivers::FractalGNSDriver, CriticalTask, FractalModelDriver, GenericTask, GlobalHealth,
GlobalInstanceLike, Task, GlobalInstanceLike, Task,
}, },
crate::engine::{ crate::engine::{
@ -47,6 +47,7 @@ pub struct TestGlobal {
gns: GlobalNS, gns: GlobalNS,
lp_queue: RwLock<Vec<Task<GenericTask>>>, lp_queue: RwLock<Vec<Task<GenericTask>>>,
max_delta_size: usize, max_delta_size: usize,
health: GlobalHealth,
} }
impl TestGlobal { impl TestGlobal {
@ -55,6 +56,7 @@ impl TestGlobal {
gns, gns,
lp_queue: RwLock::default(), lp_queue: RwLock::default(),
max_delta_size: usize::MAX, max_delta_size: usize::MAX,
health: GlobalHealth::new(),
} }
} }
pub fn set_max_data_pressure(&mut self, max_data_pressure: usize) { pub fn set_max_data_pressure(&mut self, max_data_pressure: usize) {
@ -110,6 +112,9 @@ impl TestGlobal {
} }
impl GlobalInstanceLike for TestGlobal { impl GlobalInstanceLike for TestGlobal {
fn health(&self) -> &GlobalHealth {
&self.health
}
fn state(&self) -> &GlobalNS { fn state(&self) -> &GlobalNS {
&self.gns &self.gns
} }

Loading…
Cancel
Save