Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add OC Event Counter #597

Open
wants to merge 6 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 55 additions & 2 deletions jtop/core/power.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from .common import cat, check_file
import glob
import os
# Logging
import logging
Expand Down Expand Up @@ -105,6 +106,46 @@ def find_all_i2c_power_monitor(i2c_path):
return power_sensor


def find_all_oc_event_counters():
    """
    Discover the overcurrent (OC) event counter files exposed under hwmon.

    Every file matching ``/sys/class/hwmon/hwmon*/oc*_event_cnt`` becomes a key of the
    returned dictionary. Each value is seeded with -1 and then refreshed once through
    ``update_oc_event_counts``. When no file matches, a warning is logged and an empty
    dictionary is returned.
    """
    counter_paths = glob.glob('/sys/class/hwmon/hwmon*/oc*_event_cnt')
    if not counter_paths:
        logger.warning("No OC event counters found")
        return {}

    # Seed with -1 so that any non-negative count read by the first refresh replaces it
    counts = dict.fromkeys(counter_paths, -1)
    update_oc_event_counts(counts)
    return counts


def update_oc_event_counts(event_counts):
    """
    Function to update overcurrent event counts.

    Each key of event_counts is the path of an oc*_event_cnt file and each value is the last
    count stored for it. Every file is re-read, and when the value read is greater than the
    stored one the stored value is replaced in place.

    A counter that cannot be opened or parsed is logged and skipped (its stored value is left
    untouched), so one bad file does not stop the remaining counters from being refreshed.

    Return True if any of the counts have increased, False otherwise.
    """
    # We can report more granular information about the throttling events if we really want to, but there
    # is no direct mapping from oc*_event_cnt to which power rail/system is being measured, we
    # would need to hard code a mapping from board type to oc*_event_cnt->power rail mappings,
    # this is fragile, and most users will probably only care about throttling or not throttling,
    # and can use the existing power panel to see currents and current limits if they want to dig deeper.
    # https://docs.nvidia.com/jetson/archives/r36.4/DeveloperGuide/SD/PlatformPowerAndPerformance/JetsonOrinNanoSeriesJetsonOrinNxSeriesAndJetsonAgxOrinSeries.html#jetson-agx-orin-series
    throttling = False
    for filename in event_counts:
        # Keep the try body limited to the read/parse that can actually fail
        try:
            with open(filename, 'r') as f:
                count = int(f.read())
        except (IOError, OSError, ValueError) as e:
            logger.error("Error reading OC event counter from {filename}: {e}".format(filename=filename, e=e))
            # Skip only this counter; the others must still be refreshed
            continue
        if count > event_counts[filename]:
            event_counts[filename] = count
            throttling = True
    return throttling


def read_power_status(data):
values = {}
power_type = data['type']
Expand Down Expand Up @@ -237,6 +278,7 @@ class PowerService(object):
def __init__(self):
self._power_sensor = {}
self._power_avg = {}
self._oc_event_counts = {}
# Find all I2C sensors on board
i2c_path = "/sys/bus/i2c/devices"
system_monitor = "/sys/class/power_supply"
Expand All @@ -248,6 +290,7 @@ def __init__(self):
# Load all power sensors
self._power_sensor = find_all_i2c_power_monitor(i2c_path)
self._power_sensor.update(find_all_system_monitor(system_monitor))
self._oc_event_counts = find_all_oc_event_counters()
if not self._power_sensor:
logger.warning("Power sensors not found!")
# Sort all power sensors
Expand Down Expand Up @@ -287,5 +330,15 @@ def get_status(self):
rails[name] = values
# Measure total power
total, rails = total_power(rails)
return {'rail': rails, 'tot': total}
# EOF
ret_dict = {'rail': rails, 'tot': total}

# Only include OC events if counters exist
if self._oc_event_counts:
oc_events = {
'is_throttling': update_oc_event_counts(self._oc_event_counts),
'count': sum(self._oc_event_counts.values())
}
ret_dict['oc_events'] = oc_events

return ret_dict
# EOF
35 changes: 34 additions & 1 deletion jtop/gui/pcontrol.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,15 @@ def compact_temperatures(stdscr, pos_y, pos_x, width, height, jetson):
return counter


def display_oc_event(stdscr, oc_events, pos_y, pos_x):
    """
    Draw the "OC EVENT COUNT: <n>" line at (pos_y, pos_x) on stdscr.

    The number is drawn in red when oc_events['is_throttling'] is true, in yellow when it is
    not but oc_events['count'] is above zero, and in green otherwise.
    """
    total = oc_events['count']
    throttling_now = oc_events['is_throttling']
    # Pick the color of the number from the throttling state
    if throttling_now:
        value_color = NColors.red()
    elif total > 0:
        value_color = NColors.yellow()
    else:
        value_color = NColors.green()
    label = "OC EVENT COUNT: "
    stdscr.addstr(pos_y, pos_x, label, curses.A_BOLD)
    # The number starts right after the label
    stdscr.addstr(pos_y, pos_x + len(label), str(total), curses.A_BOLD | value_color)


def compact_power(stdscr, pos_y, pos_x, width, height, jetson):
LIMIT = 25
# center_x = pos_x + width // 2 if width > LIMIT else pos_x + width // 2 + 4
Expand Down Expand Up @@ -105,7 +114,20 @@ def compact_power(stdscr, pos_y, pos_x, width, height, jetson):
if width > LIMIT:
unit_avg = unit_to_string(total['avg'], 'm', 'W')
stdscr.addstr(pos_y + len_power + 1, center_x + column_power - 3, unit_avg, curses.A_BOLD)
return len(power) + 1

# If there is no more space, return
if len_power + 3 >= height:
return len(power) + 1

# if there are no OC events, return
if not jetson.power['oc_events']:
return len(power) + 1

display_oc_event(stdscr,
jetson.power['oc_events'],
pos_y=pos_y + len_power + 3,
pos_x=center_x - column_power - 5)
return len(power) + 3


class CTRL(Page):
Expand Down Expand Up @@ -374,6 +396,17 @@ def control_power(self, pos_y, pos_x, key, mouse):
except curses.error:
pass

# if there are no OC events, return
if not self.jetson.power['oc_events']:
return

# Plot OC_EVENT_CNT
display_oc_event(
self.stdscr,
self.jetson.power['oc_events'],
pos_y=pos_y_table + len_power + 2,
pos_x=pos_x)

def draw(self, key, mouse):
# Screen size
height, width, first = self.size_page()
Expand Down