1
0
Fork 0

Merge pull request #1638 from rqlite/more-snapshot-metrics

More Snapshotting metrics
master
Philip O'Toole 8 months ago committed by GitHub
commit 54a0e9853c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -1,3 +1,7 @@
## 8.18.3 (unreleased)
### Implementation changes and bug fixes
- [PR #1638](https://github.com/rqlite/rqlite/pull/1638): More Snapshotting metrics.
## 8.18.2 (January 27th 2024)
### Implementation changes and bug fixes
- [PR #1635](https://github.com/rqlite/rqlite/pull/1635): Always execute ExecuteRequests locally if possible.

@ -34,7 +34,7 @@ const (
numCheckpointErrors = "checkpoint_errors"
numCheckpointedPages = "checkpointed_pages"
numCheckpointedMoves = "checkpointed_moves"
checkpointDuration = "checkpoint_duration_ns"
checkpointDuration = "checkpoint_duration_ms"
numExecutions = "executions"
numExecutionErrors = "execution_errors"
numQueries = "queries"
@ -353,7 +353,7 @@ func (db *DB) CheckpointWithTimeout(mode CheckpointMode, dur time.Duration) (err
if err != nil {
stats.Add(numCheckpointErrors, 1)
} else {
stats.Get(checkpointDuration).(*expvar.Int).Set(time.Since(start).Nanoseconds())
stats.Get(checkpointDuration).(*expvar.Int).Set(time.Since(start).Milliseconds())
stats.Add(numCheckpoints, 1)
}
}()

@ -108,47 +108,50 @@ const (
)
const (
numSnapshots = "num_snapshots"
numSnapshotsFailed = "num_snapshots_failed"
numUserSnapshots = "num_user_snapshots"
numUserSnapshotsFailed = "num_user_snapshots_failed"
numWALSnapshots = "num_wal_snapshots"
numWALSnapshotsFailed = "num_wal_snapshots_failed"
numSnapshotsFull = "num_snapshots_full"
numSnapshotsIncremental = "num_snapshots_incremental"
numFullCheckpointFailed = "num_full_checkpoint_failed"
numWALCheckpointRestartFailed = "num_wal_checkpoint_restart_failed"
numWALCheckpointTruncateFailed = "num_wal_checkpoint_truncate_failed"
numAutoVacuums = "num_auto_vacuums"
numAutoVacuumsFailed = "num_auto_vacuums_failed"
autoVacuumDuration = "auto_vacuum_duration"
numBoots = "num_boots"
numBackups = "num_backups"
numLoads = "num_loads"
numRestores = "num_restores"
numRestoresFailed = "num_restores_failed"
numAutoRestores = "num_auto_restores"
numAutoRestoresSkipped = "num_auto_restores_skipped"
numAutoRestoresFailed = "num_auto_restores_failed"
numRecoveries = "num_recoveries"
numProviderChecks = "num_provider_checks"
numProviderProvides = "num_provider_provides"
numProviderProvidesFail = "num_provider_provides_fail"
numUncompressedCommands = "num_uncompressed_commands"
numCompressedCommands = "num_compressed_commands"
numJoins = "num_joins"
numIgnoredJoins = "num_ignored_joins"
numRemovedBeforeJoins = "num_removed_before_joins"
numDBStatsErrors = "num_db_stats_errors"
snapshotCreateDuration = "snapshot_create_duration"
snapshotPersistDuration = "snapshot_persist_duration"
snapshotPrecompactWALSize = "snapshot_precompact_wal_size"
snapshotWALSize = "snapshot_wal_size"
leaderChangesObserved = "leader_changes_observed"
leaderChangesDropped = "leader_changes_dropped"
failedHeartbeatObserved = "failed_heartbeat_observed"
nodesReapedOK = "nodes_reaped_ok"
nodesReapedFailed = "nodes_reaped_failed"
numSnapshots = "num_snapshots"
numSnapshotsFailed = "num_snapshots_failed"
numUserSnapshots = "num_user_snapshots"
numUserSnapshotsFailed = "num_user_snapshots_failed"
numWALSnapshots = "num_wal_snapshots"
numWALSnapshotsFailed = "num_wal_snapshots_failed"
numSnapshotsFull = "num_snapshots_full"
numSnapshotsIncremental = "num_snapshots_incremental"
numFullCheckpointFailed = "num_full_checkpoint_failed"
numWALCheckpointRestartFailed = "num_wal_checkpoint_restart_failed"
numWALCheckpointTruncateFailed = "num_wal_checkpoint_truncate_failed"
numAutoVacuums = "num_auto_vacuums"
numAutoVacuumsFailed = "num_auto_vacuums_failed"
autoVacuumDuration = "auto_vacuum_duration"
numBoots = "num_boots"
numBackups = "num_backups"
numLoads = "num_loads"
numRestores = "num_restores"
numRestoresFailed = "num_restores_failed"
numAutoRestores = "num_auto_restores"
numAutoRestoresSkipped = "num_auto_restores_skipped"
numAutoRestoresFailed = "num_auto_restores_failed"
numRecoveries = "num_recoveries"
numProviderChecks = "num_provider_checks"
numProviderProvides = "num_provider_provides"
numProviderProvidesFail = "num_provider_provides_fail"
numUncompressedCommands = "num_uncompressed_commands"
numCompressedCommands = "num_compressed_commands"
numJoins = "num_joins"
numIgnoredJoins = "num_ignored_joins"
numRemovedBeforeJoins = "num_removed_before_joins"
numDBStatsErrors = "num_db_stats_errors"
snapshotCreateDuration = "snapshot_create_duration"
snapshotCreateChkRestartDuration = "snapshot_create_chk_restart_duration"
snapshotCreateChkTruncateDuration = "snapshot_create_chk_truncate_duration"
snapshotCreateWALCompactDuration = "snapshot_create_wal_compact_duration"
snapshotPersistDuration = "snapshot_persist_duration"
snapshotPrecompactWALSize = "snapshot_precompact_wal_size"
snapshotWALSize = "snapshot_wal_size"
leaderChangesObserved = "leader_changes_observed"
leaderChangesDropped = "leader_changes_dropped"
failedHeartbeatObserved = "failed_heartbeat_observed"
nodesReapedOK = "nodes_reaped_ok"
nodesReapedFailed = "nodes_reaped_failed"
)
// stats captures stats for the Store.
@ -195,6 +198,9 @@ func ResetStats() {
stats.Add(numRemovedBeforeJoins, 0)
stats.Add(numDBStatsErrors, 0)
stats.Add(snapshotCreateDuration, 0)
stats.Add(snapshotCreateChkRestartDuration, 0)
stats.Add(snapshotCreateChkTruncateDuration, 0)
stats.Add(snapshotCreateWALCompactDuration, 0)
stats.Add(snapshotPersistDuration, 0)
stats.Add(snapshotPrecompactWALSize, 0)
stats.Add(snapshotWALSize, 0)
@ -1919,10 +1925,12 @@ func (s *Store) fsmSnapshot() (fSnap raft.FSMSnapshot, retErr error) {
var fsmSnapshot raft.FSMSnapshot
if fullNeeded {
chkStartTime := time.Now()
if err := s.db.Checkpoint(sql.CheckpointTruncate); err != nil {
stats.Add(numFullCheckpointFailed, 1)
return nil, err
}
stats.Get(snapshotCreateChkTruncateDuration).(*expvar.Int).Set(time.Since(chkStartTime).Milliseconds())
dbFD, err := os.Open(s.db.Path())
if err != nil {
return nil, err
@ -1938,11 +1946,14 @@ func (s *Store) fsmSnapshot() (fSnap raft.FSMSnapshot, retErr error) {
// it fails if some query is in progress. If it fails, return an error
// and Raft will retry later. But if it succeeds it means that all readers
// are reading from the main database file.
chkRStartTime := time.Now()
if err := s.db.Checkpoint(sql.CheckpointRestart); err != nil {
stats.Add(numWALCheckpointRestartFailed, 1)
return nil, err
}
stats.Get(snapshotCreateChkRestartDuration).(*expvar.Int).Set(time.Since(chkRStartTime).Milliseconds())
compactStartTime := time.Now()
// Read a compacted version of the WAL into memory, and write it
// to the Snapshot store.
walFD, err := os.Open(s.walPath)
@ -1962,6 +1973,7 @@ func (s *Store) fsmSnapshot() (fSnap raft.FSMSnapshot, retErr error) {
return nil, err
}
walFD.Close() // We need it closed for the next step.
stats.Get(snapshotCreateWALCompactDuration).(*expvar.Int).Set(time.Since(compactStartTime).Milliseconds())
// Clean-up by truncating the WAL. This should be fast because all the pages
// have been checkpointed into the main database file, and writes are
@ -1971,10 +1983,12 @@ func (s *Store) fsmSnapshot() (fSnap raft.FSMSnapshot, retErr error) {
if err != nil {
return nil, err
}
chkTStartTime := time.Now()
if err := s.db.Checkpoint(sql.CheckpointTruncate); err != nil {
stats.Add(numWALCheckpointTruncateFailed, 1)
return nil, fmt.Errorf("failed to truncate WAL: %s", err.Error())
}
stats.Get(snapshotCreateChkTruncateDuration).(*expvar.Int).Set(time.Since(chkTStartTime).Milliseconds())
stats.Get(snapshotWALSize).(*expvar.Int).Set(int64(compactedBuf.Len()))
stats.Get(snapshotPrecompactWALSize).(*expvar.Int).Set(walSz)
}

Loading…
Cancel
Save