-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
REVIEW ONLY - Track jid in proc file for runners #5
Changes from all commits
c466103
d46cc7c
d356667
441afea
90a99b8
8dae34e
d8c744f
b0a45e9
ebaf0bb
d50569c
5570f0f
e189120
0e4e664
c17c672
38a8498
365c607
a4a257d
2ed902e
70d4488
68880b9
63cdd3a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,13 @@ | |
from salt.utils.cache import CacheCli as cache_cli | ||
from salt.utils.process import MultiprocessingProcess | ||
|
||
# pylint: disable=import-error | ||
try: | ||
import salt.utils.psutil_compat as psutil | ||
HAS_PSUTIL = True | ||
except ImportError: | ||
HAS_PSUTIL = False | ||
|
||
# Import third party libs | ||
from salt.ext import six | ||
from salt.utils.zeromq import zmq | ||
|
@@ -40,7 +47,7 @@ | |
|
||
def get_running_jobs(opts): | ||
''' | ||
Return the running jobs on this minion | ||
Return the running jobs on the master | ||
''' | ||
|
||
ret = [] | ||
|
@@ -49,84 +56,76 @@ def get_running_jobs(opts): | |
return ret | ||
for fn_ in os.listdir(proc_dir): | ||
path = os.path.join(proc_dir, fn_) | ||
try: | ||
data = _read_proc_file(path, opts) | ||
if data is not None: | ||
ret.append(data) | ||
except (IOError, OSError): | ||
# proc files may be removed at any time during this process by | ||
# the master process that is executing the JID in question, so | ||
# we must ignore ENOENT during this process | ||
log.trace('%s removed during processing by master process', path) | ||
data = read_proc_file(path, opts) | ||
if not data: | ||
continue | ||
if not is_pid_healthy(data['pid']): | ||
continue | ||
ret.append(data) | ||
return ret | ||
|
||
|
||
def read_proc_file(path, opts):
    '''
    Load the serialized job metadata stored in the proc file at ``path``.

    Returns the deserialized dict, or None when the file cannot be
    deserialized, does not hold a dict, or carries no (truthy) ``pid`` entry.
    Each rejection is logged at warning level.
    '''
    loader = salt.payload.Serial(opts)
    with salt.utils.files.fopen(path, 'rb') as handle:
        try:
            job_data = loader.load(handle)
        except Exception as exc:
            # No dedicated serializer exception is caught here yet, so any
            # failure to load is treated as an unreadable proc file.
            log.warning("Issue deserializing data: %s", exc)
            return None

    # Anything other than a dict is not a valid serialized job record.
    if not isinstance(job_data, dict):
        log.warning("Data is not a dict: %s", job_data)
        return None

    # A record without a pid is not a salt proc file.
    if not job_data.get('pid'):
        log.warning("No PID found in data")
        return None

    return job_data
|
||
|
||
def is_pid_healthy(pid):
    '''
    Health check confirming that ``pid`` is running and was executed by salt.

    If psutil is available:
        * all architectures are checked

    If psutil is not available:
        * Linux/Solaris/etc: archs with ``/proc/<pid>/cmdline`` available
          are checked
        * AIX/Windows: assume PID is healthy and return True

    Returns True when the process looks like a live salt process (or cannot
    be checked on this platform), False otherwise.
    '''
    if HAS_PSUTIL:
        try:
            # Both the lookup and the cmdline read can raise NoSuchProcess:
            # the process may exit between the two calls.
            cmdline = psutil.Process(pid).cmdline()
        except psutil.NoSuchProcess:
            # NOTE(review): a reviewer asked whether a warning is needed
            # here, since a finished PID may not be an error for every
            # caller. It stays a warning (not an error) because proc files
            # are expected to be cleaned up when their process ends.
            log.warning("PID %s is no longer running.", pid)
            return False
        return any('salt' in cmd for cmd in cmdline)

    if salt.utils.platform.is_aix() or salt.utils.platform.is_windows():
        return True

    if not salt.utils.process.os_is_running(pid):
        log.warning("PID %s is no longer running.", pid)
        return False

    # The path must be absolute: a bare 'proc' component would be resolved
    # against the current working directory instead of the proc filesystem.
    cmdline_file = os.path.join('/proc', str(pid), 'cmdline')
    try:
        with salt.utils.files.fopen(cmdline_file, 'rb') as fp_:
            return b'salt' in fp_.read()
    except (OSError, IOError) as err:
        log.error("There was a problem reading proc file: %s", err)
        return False
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# -*- coding: utf-8 -*- | ||
''' | ||
Test master code from utils | ||
''' | ||
from __future__ import absolute_import | ||
|
||
import os | ||
import time | ||
|
||
import setproctitle # pylint: disable=W8410 | ||
|
||
import salt.config | ||
import salt.utils.master as master | ||
|
||
from tests.support.case import ShellTestCase | ||
from tests.support.paths import TMP_ROOT_DIR | ||
from tests.support.helpers import flaky | ||
|
||
# Master options handed to master.get_running_jobs() by the test case below:
# the result of salt.config.master_config(None), with 'cachedir' pointed at a
# 'cache' directory under the test suite's TMP_ROOT_DIR.
DEFAULT_CONFIG = salt.config.master_config(None) | ||
DEFAULT_CONFIG['cachedir'] = os.path.join(TMP_ROOT_DIR, 'cache') | ||
|
||
|
||
class MasterUtilJobsTestCase(ShellTestCase):
    '''
    Tests for the running-job helpers in ``salt.utils.master``.
    '''

    def setUp(self):
        '''
        Necessary so that the master pid health check
        passes as it looks for salt in cmdline
        '''
        setproctitle.setproctitle('salt')

    @flaky
    def test_get_running_jobs(self):
        '''
        Test get running jobs
        '''
        ret = self.run_run_plus("test.sleep", '90', asynchronous=True)
        jid = ret['jid']

        # The asynchronous job is not always visible right away (its proc
        # file may not exist yet), so poll up to 10 times, 2s apart, for a
        # total wait of 20s before giving up.
        for _ in range(10):
            jobs = master.get_running_jobs(DEFAULT_CONFIG)
            if jobs:
                jids = [job['jid'] for job in jobs]
                assert jids.count(jid) == 1
                break
            time.sleep(2)
        else:
            # Without this the test would pass without checking anything
            # when the job never shows up.
            self.fail('No running jobs were reported within 20s')
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You catch IOError below but only OSError here, why?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, I'm not actually reading it, so there is no IOError possible.