Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SSD Health API and generic implementation #47

Merged
merged 2 commits into from
Sep 18, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
'sonic_platform_base',
'sonic_platform_base.sonic_eeprom',
'sonic_platform_base.sonic_sfp',
'sonic_platform_base.sonic_ssd',
'sonic_psu',
'sonic_sfp',
],
Expand Down
Empty file.
75 changes: 75 additions & 0 deletions sonic_platform_base/sonic_ssd/ssd_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#
# ssd_base.py
#
# Base class for implementing common SSD health features
#


class SsdBase(object):
    """
    Abstract interface describing the SSD health API.

    Platform or vendor specific classes derive from this one and override
    the getters below; the versions defined here only raise
    NotImplementedError.
    """

    def __init__(self, diskdev):
        """
        Create the object for a given disk.

        The base implementation accepts the argument but does nothing
        with it.

        Args:
            diskdev: Linux device name to get parameters for
        """

    def get_health(self):
        """
        Report the current disk health as a percentage.

        Returns:
            A float number of current ssd health
            e.g. 83.5

        Raises:
            NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError()

    def get_temperature(self):
        """
        Report the current disk temperature in Celsius.

        Returns:
            A float number of current temperature in Celsius
            e.g. 40.1

        Raises:
            NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError()

    def get_model(self):
        """
        Report the model of the disk device.

        Returns:
            A string holding disk model as provided by the manufacturer

        Raises:
            NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError()

    def get_firmware(self):
        """
        Report the firmware version of the disk device.

        Returns:
            A string holding disk firmware version as provided by the
            manufacturer

        Raises:
            NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError()

    def get_serial(self):
        """
        Report the serial number of the disk device.

        Returns:
            A string holding disk serial number as provided by the
            manufacturer

        Raises:
            NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError()

    def get_vendor_output(self):
        """
        Report vendor specific data for the disk device.

        Returns:
            A string holding some vendor specific disk information

        Raises:
            NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError()
157 changes: 157 additions & 0 deletions sonic_platform_base/sonic_ssd/ssd_generic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#
# ssd_generic.py
#
# Generic implementation of the SSD health API
# SSD models supported:
# - InnoDisk
# - StorFly
# - Virtium

try:
import exceptions # Python 2
except ImportError:
import builtins as exceptions # Python 3
try:
import re
import subprocess
from .ssd_base import SsdBase
except ImportError as e:
raise ImportError (str(e) + "- required module not found")

# Command-line templates for the disk utilities. The "{}" placeholder is
# filled in with the disk device name via str.format() before the command
# is executed.
SMARTCTL = "smartctl {} -a"
INNODISK = "iSmart -d {}"
VIRTIUM = "SmartCmd -m {}"

# Placeholder kept in a field when its value could not be parsed from the
# utility output.
NOT_AVAILABLE = "N/A"


class SsdUtil(SsdBase):
    """
    Generic implementation of the SSD health API

    Model, serial number and firmware version are parsed from the output of
    the generic utility (SMARTCTL). When the first word of the model matches
    a key of the vendor table, the vendor utility is run as well and health
    and temperature are parsed from its output. Any value that cannot be
    parsed stays NOT_AVAILABLE.
    """
    # Class-level defaults; instance attributes override them once the
    # corresponding value has been parsed.
    model = NOT_AVAILABLE
    serial = NOT_AVAILABLE
    firmware = NOT_AVAILABLE
    temperature = NOT_AVAILABLE
    health = NOT_AVAILABLE
    ssd_info = NOT_AVAILABLE
    vendor_ssd_info = NOT_AVAILABLE

    def __init__(self, diskdev):
        """
        Constructor: queries the disk and caches the parsed values

        Args:
            diskdev: Linux device name to get parameters for

        Raises:
            OSError: propagated from subprocess.Popen when a utility
                cannot be executed
        """
        # Maps the first word of the disk model to the command template
        # used to query the disk and the method that parses its output.
        self.vendor_ssd_utility = {
            "Generic": {"utility": SMARTCTL, "parser": self.parse_generic_ssd_info},
            "InnoDisk": {"utility": INNODISK, "parser": self.parse_innodisk_info},
            "M.2": {"utility": INNODISK, "parser": self.parse_innodisk_info},
            "StorFly": {"utility": VIRTIUM, "parser": self.parse_virtium_info},
            "Virtium": {"utility": VIRTIUM, "parser": self.parse_virtium_info},
        }

        self.dev = diskdev

        # Generic part
        self.fetch_generic_ssd_info(diskdev)
        self.parse_generic_ssd_info()

        # Known vendor part
        if self.model == NOT_AVAILABLE:
            # Failed to get disk model. NOT_AVAILABLE is a non-empty string,
            # so a plain truthiness test would not detect this case.
            return

        model_words = self.model.split()
        if not model_words:
            # Model consists of whitespace only; nothing to look up
            return

        model_short = model_words[0]
        # "in" instead of dict.has_key(), which was removed in Python 3
        if model_short in self.vendor_ssd_utility:
            self.fetch_vendor_ssd_info(diskdev, model_short)
            self.parse_vendor_ssd_info(model_short)
        # else: no handler registered for this disk model

    def _execute_shell(self, cmd):
        """
        Runs a command (without a shell) and returns its standard output

        Args:
            cmd: command line as a single string; split on whitespace

        Returns:
            A text string holding everything the command wrote to stdout
        """
        # universal_newlines=True makes communicate() return text rather
        # than bytes on Python 3, so the str regex patterns used by the
        # parsers can be applied to the output.
        process = subprocess.Popen(cmd.split(),
                                   universal_newlines=True,
                                   stdout=subprocess.PIPE)
        output, _ = process.communicate()
        return output

    def _parse_re(self, pattern, buffer):
        """
        Returns the first match of pattern in buffer

        Args:
            pattern: regular expression passed to re.findall
            buffer: text to search

        Returns:
            The first element returned by re.findall, or NOT_AVAILABLE when
            there is no match
        """
        res_list = re.findall(pattern, buffer)
        return res_list[0] if res_list else NOT_AVAILABLE

    def fetch_generic_ssd_info(self, diskdev):
        """
        Runs the generic utility for diskdev and stores its output in
        self.ssd_info
        """
        generic_cmd = self.vendor_ssd_utility["Generic"]["utility"]
        self.ssd_info = self._execute_shell(generic_cmd.format(diskdev))

    def parse_generic_ssd_info(self):
        """
        Parses model, serial number and firmware version from self.ssd_info
        """
        self.model = self._parse_re(r'Device Model:\s*(.+?)\n', self.ssd_info)
        self.serial = self._parse_re(r'Serial Number:\s*(.+?)\n', self.ssd_info)
        self.firmware = self._parse_re(r'Firmware Version:\s*(.+?)\n', self.ssd_info)

    def parse_innodisk_info(self):
        """
        Parses health and temperature from self.vendor_ssd_info; both are
        stored as the captured strings
        """
        self.health = self._parse_re(r'Health:\s*(.+?)%', self.vendor_ssd_info)
        self.temperature = self._parse_re(r'Temperature\s*\[\s*(.+?)\]', self.vendor_ssd_info)

    def parse_virtium_info(self):
        """
        Parses temperature from self.vendor_ssd_info and computes health as
        100 - (Average_Erase_Count * 100 / NAND_Endurance)

        Health is left unchanged when either value is missing or the
        endurance value is zero.
        """
        self.temperature = self._parse_re(r'Temperature_Celsius\s*\d*\s*(\d+?)\s+', self.vendor_ssd_info)
        nand_endurance = self._parse_re(r'NAND_Endurance\s*\d*\s*(\d+?)\s+', self.vendor_ssd_info)
        avg_erase_count = self._parse_re(r'Average_Erase_Count\s*\d*\s*(\d+?)\s+', self.vendor_ssd_info)
        try:
            self.health = 100 - (float(avg_erase_count) * 100 / float(nand_endurance))
        except (ValueError, ZeroDivisionError):
            # ValueError: one of the values is NOT_AVAILABLE
            # ZeroDivisionError: endurance is reported as 0
            pass

    def fetch_vendor_ssd_info(self, diskdev, model):
        """
        Runs the utility registered for model against diskdev and stores
        its output in self.vendor_ssd_info

        Args:
            diskdev: Linux device name to query
            model: key of self.vendor_ssd_utility
        """
        vendor_cmd = self.vendor_ssd_utility[model]["utility"]
        self.vendor_ssd_info = self._execute_shell(vendor_cmd.format(diskdev))

    def parse_vendor_ssd_info(self, model):
        """
        Calls the parser registered for model

        Args:
            model: key of self.vendor_ssd_utility
        """
        self.vendor_ssd_utility[model]["parser"]()

    def get_health(self):
        """
        Retrieves current disk health in percentages

        Returns:
            The health value as parsed: a float when computed by
            parse_virtium_info, the captured string when parsed by
            parse_innodisk_info, or NOT_AVAILABLE when unknown
        """
        return self.health

    def get_temperature(self):
        """
        Retrieves current disk temperature

        Returns:
            The temperature string captured from the vendor utility output,
            or NOT_AVAILABLE when unknown
        """
        return self.temperature

    def get_model(self):
        """
        Retrieves model for the given disk device

        Returns:
            A string holding disk model as reported by the generic utility,
            or NOT_AVAILABLE when unknown
        """
        return self.model

    def get_firmware(self):
        """
        Retrieves firmware version for the given disk device

        Returns:
            A string holding disk firmware version as reported by the
            generic utility, or NOT_AVAILABLE when unknown
        """
        return self.firmware

    def get_serial(self):
        """
        Retrieves serial number for the given disk device

        Returns:
            A string holding disk serial number as reported by the generic
            utility, or NOT_AVAILABLE when unknown
        """
        return self.serial

    def get_vendor_output(self):
        """
        Retrieves vendor specific data for the given disk device

        Returns:
            A string holding the raw output of the vendor utility, or
            NOT_AVAILABLE when no vendor utility was run
        """
        return self.vendor_ssd_info

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Besides the attributes you list, it would be better to add "capacity", "P/E cycle", "Bad block" and "Remaining time".

Copy link
Contributor Author

@andriymoroz-mlnx andriymoroz-mlnx Aug 19, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The attributes you suggest are probably specific to InnoDisk SSDs.
For example, StorFly disks do not have them but provide attribute #168 (NAND Endurance), whose initial value is 20000. Compared to the P/E value from InnoDisk, which is 3000, I think StorFly disks are not 6 times more reliable but rather use different units. That's why I would prefer to show such info with the "--vendor" option.
The "Bad block" value is also ambiguous. Depending on the SSD NAND type (SLC, TLC, MLC), the endurance of flash cells can be different. Of course the manufacturer knows about this and compensates for worse endurance with a greater number of reserved cells. That's why the absolute number of bad (reallocated) cells does not represent the disk health state. Sometimes it is used to calculate disk health as ((<total number of reserved cells> - <number of reallocated cells>) / <total number of reserved cells>) * 100.
"Remaining time" (the InnoDisk utility calls this parameter Lifespan) is also not provided by all vendors and is a very rough estimate. It is highly dependent on disk usage patterns.

Someday we can add a daemon to pmon which will periodically query the current disk health and raise an alarm once it reaches some threshold.