Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[stormond] Added new dynamic field 'last_sync_time' to STATE_DB #535

Merged
merged 7 commits into from
Nov 27, 2024
Merged
77 changes: 45 additions & 32 deletions sonic-stormond/scripts/stormond
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import shutil
import json
import time

from datetime import datetime
from sonic_py_common import daemon_base, device_info, syslogger
from swsscommon import swsscommon
from sonic_platform_base.sonic_storage.storage_devices import StorageDevices, BLKDEV_BASE_PATH
Expand Down Expand Up @@ -49,6 +50,8 @@ class DaemonStorage(daemon_base.DaemonBase):
self.log = syslogger.SysLogger(SYSLOG_IDENTIFIER)
super(DaemonStorage, self).__init__(log_identifier)

self.log.log_notice("Starting Storage Monitoring Daemon")

self.timeout = STORMOND_PERIODIC_STATEDB_SYNC_SECS
self.fsstats_sync_interval = STORMOND_SYNC_TO_DISK_SECS
self.stop_event = threading.Event()
Expand All @@ -68,6 +71,9 @@ class DaemonStorage(daemon_base.DaemonBase):
self.fsio_rw_json = {disk:{} for disk in self.storage.devices}
self.fsio_rw_statedb = {disk:{} for disk in self.storage.devices}

# This is the time format string
self.time_format_string = "%Y-%m-%d %H:%M:%S"

# This time is set at init and then subsequently after each FSIO JSON file sync
self.fsio_sync_time = time.time()

Expand All @@ -82,7 +88,8 @@ class DaemonStorage(daemon_base.DaemonBase):
"total_fsio_writes", \
"disk_io_reads", \
"disk_io_writes", \
"reserved_blocks"]
"reserved_blocks", \
"last_sync_time"]

# These are the fields that we are interested in saving to disk to protect against
# reboots or crashes
Expand All @@ -97,21 +104,26 @@ class DaemonStorage(daemon_base.DaemonBase):
self._load_fsio_rw_json()
self._determine_sot()

# Render an epoch timestamp as a human-friendly string.
#
# time_since_epoch: seconds since the Unix epoch (int or float)
# Returns the corresponding local time formatted per self.time_format_string.
def get_formatted_time(self, time_since_epoch):
    local_dt = datetime.fromtimestamp(time_since_epoch)
    return local_dt.strftime(self.time_format_string)

# Read the daemon's polling and FSIO-JSON sync intervals from CONFIG_DB.
#
# Falls back to the compiled-in defaults (STORMOND_PERIODIC_STATEDB_SYNC_SECS /
# STORMOND_SYNC_TO_DISK_SECS) when the STORMOND_CONFIG|INTERVALS key or either
# field is absent.
#
# Fix: the pasted diff kept both the removed `self.log_info(...)` lines and
# their replacements `self.log.log_notice(...)`; only the merged (added)
# logging lines are retained so each interval is logged exactly once.
def get_configdb_intervals(self):
    self.config_db = daemon_base.db_connect("CONFIG_DB")
    config_info = dict(self.config_db.hgetall('STORMOND_CONFIG|INTERVALS'))
    self.timeout = int(config_info.get('daemon_polling_interval', STORMOND_PERIODIC_STATEDB_SYNC_SECS))
    self.fsstats_sync_interval = int(config_info.get('fsstats_sync_interval', STORMOND_SYNC_TO_DISK_SECS))

    self.log.log_notice("Polling Interval set to {} seconds".format(self.timeout))
    self.log.log_notice("FSIO JSON file Interval set to {} seconds".format(self.fsstats_sync_interval))


# Get the total and latest FSIO reads and writes from JSON file
def _load_fsio_rw_json(self):
try:
if not os.path.exists(FSIO_RW_JSON_FILE):
self.log_info("{} not present.".format(FSIO_RW_JSON_FILE))
self.log.log_notice("{} not present.".format(FSIO_RW_JSON_FILE))
return

# Load JSON file
Expand All @@ -123,21 +135,21 @@ class DaemonStorage(daemon_base.DaemonBase):
for field in self.statedb_json_sync_fields:

if self.fsio_rw_json[storage_device][field] == None:
self.log_warning("{}:{} value = None in JSON file".format(storage_device, field))
self.log.log_warning("{}:{} value = None in JSON file".format(storage_device, field))
return

self.fsio_json_file_loaded = True

except Exception as e:
self.log_error("JSON file could not be loaded: {}".format(str(e)))
self.log.log_error("JSON file could not be loaded: {}".format(str(e)))

return


# Sync the total and latest procfs reads and writes from STATE_DB to JSON file on disk
def sync_fsio_rw_json(self):

self.log_info("Syncing total and latest procfs reads and writes from STATE_DB to JSON file")
self.log.log_notice("Syncing total and latest procfs reads and writes from STATE_DB to JSON file")

json_file_dict = {disk:{} for disk in self.storage.devices}
try:
Expand All @@ -146,21 +158,21 @@ class DaemonStorage(daemon_base.DaemonBase):
json_file_dict[device][field] = self.state_db.hget('STORAGE_INFO|{}'.format(device), field)

self.fsio_sync_time = time.time()
json_file_dict["successful_sync_time"] = str(self.fsio_sync_time)
json_file_dict["successful_sync_time"] = str(self.get_formatted_time(self.fsio_sync_time))

with open(FSIO_RW_JSON_FILE, 'w+') as f:
json.dump(json_file_dict, f)

return True

except Exception as ex:
self.log_error("Unable to sync state_db to disk: {}".format(str(ex)))
self.log.log_error("Unable to sync state_db to disk: {}".format(str(ex)))
return False


# Record the time of the last successful JSON-file sync in STATE_DB under
# STORAGE_DEVICE_TABLE|FSSTATS_SYNC_TIME_KEY, formatted via
# self.get_formatted_time() rather than raw epoch seconds.
#
# Fix: the pasted diff contained both the removed line (raw
# `self.fsio_sync_time`) and its replacement; only the merged (added) line
# writing the formatted timestamp is retained.
def write_sync_time_statedb(self):
    self.state_db.hset("{}|{}".format(STORAGE_DEVICE_TABLE, FSSTATS_SYNC_TIME_KEY), "successful_sync_time", str(self.get_formatted_time(self.fsio_sync_time)))

# Run a sanity check on the state_db. If successful, get total, latest
# FSIO reads and writes for each storage device from STATE_DB
Expand All @@ -185,12 +197,12 @@ class DaemonStorage(daemon_base.DaemonBase):
self.fsio_rw_statedb[storage_device][field] = "0" if value is None else value

if value is None:
self.log_warning("{}:{} value = None in StateDB".format(storage_device, field))
self.log.log_warning("{}:{} value = None in StateDB".format(storage_device, field))
return

self.statedb_storage_info_loaded = True
except Exception as e:
self.log_error("Reading STATE_DB failed with: {}".format(str(e)))
self.log.log_error("Reading STATE_DB failed with: {}".format(str(e)))


def _determine_sot(self):
Expand Down Expand Up @@ -269,21 +281,21 @@ class DaemonStorage(daemon_base.DaemonBase):
try:
# Unlikely scenario
if storage_object is None:
self.log_info("{} does not have an instantiated object. Static Information cannot be gathered.".format(storage_device))
self.log.log_notice("{} does not have an instantiated object. Static Information cannot be gathered.".format(storage_device))
continue

static_kvp_dict = {}

static_kvp_dict["device_model"] = storage_object.get_model()
static_kvp_dict["serial"] = storage_object.get_serial()

self.log_info("Storage Device: {}, Device Model: {}, Serial: {}".format(storage_device, static_kvp_dict["device_model"], static_kvp_dict["serial"]))
self.log.log_notice("Storage Device: {}, Device Model: {}, Serial: {}".format(storage_device, static_kvp_dict["device_model"], static_kvp_dict["serial"]))

# update Storage Device Status to DB
self.update_storage_info_status_db(storage_device, static_kvp_dict)

except Exception as ex:
self.log_error("get_static_fields_update_state_db() failed with: {}".format(str(ex)))
self.log.log_error("get_static_fields_update_state_db() failed with: {}".format(str(ex)))

# Get Dynamic attributes and update the State DB
def get_dynamic_fields_update_state_db(self):
Expand All @@ -292,7 +304,7 @@ class DaemonStorage(daemon_base.DaemonBase):
for storage_device, storage_object in self.storage.devices.items():
try:
if storage_object is None:
self.log_info("Storage device '{}' does not have an instantiated object. Dynamic Information cannot be gathered.".format(storage_device))
self.log.log_notice("Storage device '{}' does not have an instantiated object. Dynamic Information cannot be gathered.".format(storage_device))
continue

# Fetch the latest dynamic info
Expand All @@ -309,20 +321,23 @@ class DaemonStorage(daemon_base.DaemonBase):
dynamic_kvp_dict["disk_io_reads"] = storage_object.get_disk_io_reads()
dynamic_kvp_dict["disk_io_writes"] = storage_object.get_disk_io_writes()
dynamic_kvp_dict["reserved_blocks"] = storage_object.get_reserved_blocks()
dynamic_kvp_dict["last_sync_time"] = self.get_formatted_time(time.time())

dynamic_kvp_dict["total_fsio_reads"], dynamic_kvp_dict["total_fsio_writes"] = self._reconcile_fsio_rw_values(dynamic_kvp_dict, storage_device)

self.log_info("Storage Device: {}, Firmware: {}, health: {}%, Temp: {}C, FS IO Reads: {}, FS IO Writes: {}".format(\
storage_device, dynamic_kvp_dict["firmware"], dynamic_kvp_dict["health"], dynamic_kvp_dict["temperature"], dynamic_kvp_dict["total_fsio_reads"],dynamic_kvp_dict["total_fsio_writes"]))
self.log_info("Latest FSIO Reads: {}, Latest FSIO Writes: {}".format(dynamic_kvp_dict["latest_fsio_reads"], dynamic_kvp_dict["latest_fsio_writes"]))
self.log_info("Disk IO Reads: {}, Disk IO Writes: {}, Reserved Blocks: {}".format(dynamic_kvp_dict["disk_io_reads"], dynamic_kvp_dict["disk_io_writes"], \
dynamic_kvp_dict["reserved_blocks"]))

# Update storage device statistics to STATE_DB
self.update_storage_info_status_db(storage_device, dynamic_kvp_dict)

# Log to syslog
self.log.log_notice("Storage Device: {}, Firmware: {}, health: {}%, Temp: {}C, FS IO Reads: {}, FS IO Writes: {}".format(\
storage_device, dynamic_kvp_dict["firmware"], dynamic_kvp_dict["health"], dynamic_kvp_dict["temperature"], dynamic_kvp_dict["total_fsio_reads"],dynamic_kvp_dict["total_fsio_writes"]))
self.log.log_notice("Latest FSIO Reads: {}, Latest FSIO Writes: {}".format(dynamic_kvp_dict["latest_fsio_reads"], dynamic_kvp_dict["latest_fsio_writes"]))
self.log.log_notice("Disk IO Reads: {}, Disk IO Writes: {}, Reserved Blocks: {}".format(dynamic_kvp_dict["disk_io_reads"], dynamic_kvp_dict["disk_io_writes"], \
dynamic_kvp_dict["reserved_blocks"]))
self.log.log_notice("Last successful sync time to STATE_DB: {}".format(dynamic_kvp_dict["last_sync_time"]))

except Exception as ex:
self.log_info("get_dynamic_fields_update_state_db() failed with: {}".format(str(ex)))
self.log.log_notice("get_dynamic_fields_update_state_db() failed with: {}".format(str(ex)))


# Override signal handler from DaemonBase
Expand All @@ -333,22 +348,22 @@ class DaemonStorage(daemon_base.DaemonBase):
global exit_code

if sig in FATAL_SIGNALS:
self.log_info("Caught signal '{}'".format(signal.Signals(sig).name))
self.log.log_notice("Caught signal '{}'".format(signal.Signals(sig).name))

if self.sync_fsio_rw_json():
self.write_sync_time_statedb()
else:
self.log_warning("Unable to sync latest and total procfs RW to disk")
self.log.log_warning("Unable to sync latest and total procfs RW to disk")

self.log_info("Exiting with {}".format(signal.Signals(sig).name))
self.log.log_notice("Exiting with {}".format(signal.Signals(sig).name))

# Make sure we exit with a non-zero code so that supervisor will try to restart us
exit_code = 128 + sig
self.stop_event.set()
elif sig in NONFATAL_SIGNALS:
self.log_info("Caught signal '{}' - ignoring...".format(signal.Signals(sig).name))
self.log.log_notice("Caught signal '{}' - ignoring...".format(signal.Signals(sig).name))
else:
self.log_warning("Caught unhandled signal '{}' - ignoring...".format(signal.Signals(sig).name))
self.log.log_warning("Caught unhandled signal '{}' - ignoring...".format(signal.Signals(sig).name))

# Main daemon logic
def run(self):
Expand All @@ -374,7 +389,7 @@ class DaemonStorage(daemon_base.DaemonBase):
if self.sync_fsio_rw_json():
self.write_sync_time_statedb()
else:
self.log_warning("Unable to sync latest and total procfs RW to disk")
self.log.log_warning("Unable to sync latest and total procfs RW to disk")

return True
#
Expand All @@ -385,15 +400,13 @@ class DaemonStorage(daemon_base.DaemonBase):
# Daemon entry point: instantiate the storage monitoring daemon, publish the
# static device attributes to STATE_DB once, then run the periodic poll loop
# until run() returns falsy (fatal signal requested shutdown).
#
# Fix: the pasted diff retained removed lines — the old
# `stormon.log_info("Starting ...")` (the start-up log moved into
# DaemonStorage.__init__ as log_notice) and the old shutdown `log_info` next
# to its `log.log_notice` replacement; only the merged version is retained.
def main():
    stormon = DaemonStorage(SYSLOG_IDENTIFIER)

    # Read and update Static Fields to the StateDB once
    stormon.get_static_fields_update_state_db()

    while stormon.run():
        pass

    stormon.log.log_notice("Shutting down Storage Monitoring Daemon")

    return exit_code

Expand Down
Loading
Loading