-
Notifications
You must be signed in to change notification settings - Fork 0
/
job-pressure
executable file
·140 lines (122 loc) · 6.04 KB
/
job-pressure
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
# -*- mode: Python;-*-
import classad
import htcondor
import csv
import io
import sys
# Constants

# Collector hostnames handed to get_schedds() by the main script.
OSPOOL_HOSTS = [
    'cm-1.ospool.osg-htc.org',
    'cm-2.ospool.osg-htc.org',
]

# Schedd names that get_job_ads() skips outright (exact-match only).
SCHEDD_BLACKLIST = {
    'login.ci-connect.uchicago.edu',
    'os-ce1.osgdev.chtc.io',
    'osg-vo.isi.edu',
    'osg1.research.cs.kent.edu',
    'sub-2.icecube.wisc.edu',
    'testbed',
}
def configure_htcondor():
    """Apply the client-side security settings to ``htcondor.param``.

    The authentication methods are set to ``SSL FS``; authentication,
    integrity and encryption are each set to ``OPTIONAL``.  Returns nothing.
    """
    client_security = (
        # See https://opensciencegrid.atlassian.net/browse/INF-760
        ('SEC_CLIENT_AUTHENTICATION_METHODS', 'SSL FS'),
        # Suggested by Brian L., I think, at some point
        ('SEC_CLIENT_AUTHENTICATION', 'OPTIONAL'),
        ('SEC_CLIENT_INTEGRITY', 'OPTIONAL'),
        ('SEC_CLIENT_ENCRYPTION', 'OPTIONAL'),
    )
    for knob, value in client_security:
        htcondor.param[knob] = value

    # Debugging
    # htcondor.param['TOOL_DEBUG'] = 'D_SECURITY:2 D_CAT D_ALWAYS:2'
    # htcondor.enable_debug()
def get_schedds(ospool_hosts):
    """Collect the schedd ads advertised by every collector in *ospool_hosts*.

    Returns a dict mapping each schedd name to a two-item list
    ``[count, schedd classad]``: ``count`` is how many times the name was seen
    across all collectors, and the classad is the first one seen for that name.
    """
    # Per Brian B.: Best to go through both and take the union
    seen = {}  # schedd name => [count, schedd classad]
    for host in ospool_hosts:
        collector = htcondor.Collector(host)
        for schedd_ad in collector.locateAll(htcondor.DaemonTypes.Schedd):
            name = schedd_ad['Name']
            entry = seen.get(name)
            if entry is None:
                # First sighting: remember this ad and start the count at one.
                seen[name] = [1, schedd_ad]
            else:
                # Already known (e.g. from another collector): just count it.
                entry[0] += 1
    return seen
def extract_attributes(ad):
    """Derive the reporting fields for a single job ad.

    Returns a dict with these keys, in this order:

    * ``request_memory`` -- evaluated ``RequestMemory`` divided by 1024,
      formatted with two decimals.
    * ``request_disk`` -- evaluated ``RequestDisk`` divided by 1024 * 1024,
      formatted with two decimals.
    * ``project_name`` -- the ``ProjectName`` attribute, ``'<none>'`` when the
      ad has none, or the stringified evaluation when it is an expression.
    * ``job_state_category`` -- ``'idle'``, ``'held'``, ``'active'``,
      ``'done'``, or ``'ERR'`` for any status not listed below.
    """
    # The conversion factors assume RequestMemory is in MB and RequestDisk in
    # KB, so both formatted values are in GB.
    memory_text = f"{ad.eval('RequestMemory') / 1024.0:.2f}"
    disk_text = f"{ad.eval('RequestDisk') / (1024.0 * 1024.0):.2f}"

    raw_project = ad.get('ProjectName')
    if isinstance(raw_project, classad.ExprTree):
        # Unevaluated expression: evaluate it and keep its string form.
        project = str(raw_project.eval())
    elif raw_project is None:
        project = '<none>'
    else:
        project = raw_project

    # Ordered (category, statuses) table; the first row containing the job's
    # status wins, and anything unmatched is reported as 'ERR'.
    states = htcondor.JobStatus
    category_table = (
        ('idle', (states.IDLE,)),
        ('held', (states.HELD,)),
        ('active', (states.RUNNING, states.TRANSFERRING_OUTPUT, states.SUSPENDED)),
        ('done', (states.REMOVED, states.COMPLETED)),
    )
    status = ad['JobStatus']
    category = next(
        (label for label, members in category_table if status in members),
        'ERR',
    )

    return {
        'request_memory': memory_text,
        'request_disk': disk_text,
        'project_name': project,
        'job_state_category': category,
    }
def get_job_ads(ospool_schedds):
    """Query each eligible schedd and tally its jobs by resource request.

    Args:
        ospool_schedds: Mapping of schedd name to ``[count, schedd classad]``
            (the shape produced by ``get_schedds``).  Only the classad at
            index 1 is used here.

    Returns:
        A tuple ``(schedds_contacted, schedds_failed, job_data)``:

        * ``schedds_contacted`` -- names of schedds whose query completed.
        * ``schedds_failed`` -- names of schedds whose query raised
          ``htcondor.HTCondorIOError``.
        * ``job_data`` -- dict keyed by the tuple ``(JobUniverse, RequestCpus,
          request_memory, request_disk, RequestGpus)``.  Each value is a dict
          with ``'schedds'``, ``'projects'`` and ``'owners'`` (dicts used as
          ordered sets, every value ``True``) plus integer counters ``'idle'``,
          ``'active'``, ``'held'``, ``'done'`` and ``'ERR'``.

    Schedds are skipped when their name is in ``SCHEDD_BLACKLIST`` or contains
    ``'jupyter'`` or ``'chtc.wisc.edu'``.  A schedd name that starts with a dot
    terminates the program via ``sys.exit(1)``.
    """
    # Prepare to go through schedds
    projection = ['ProjectName', 'Owner', 'JobUniverse', 'RequestCpus', 'RequestDisk', 'RequestMemory', 'RequestGpus', 'JobStatus']
    job_data = {}
    schedds_contacted = []
    schedds_failed = []

    # Iterating a dict yields its keys; sorting keeps the visiting order stable.
    for ospool_schedd_name in sorted(ospool_schedds):
        if (ospool_schedd_name in SCHEDD_BLACKLIST) or ('jupyter' in ospool_schedd_name) or ('chtc.wisc.edu' in ospool_schedd_name):
            continue

        ospool_schedd = htcondor.Schedd(ospool_schedds[ospool_schedd_name][1])

        # The short name is everything before the first dot (or the whole name
        # when there is no dot).  A leading dot would leave it empty.
        if ospool_schedd_name.startswith('.'):
            print(f'Bad schedd name? "{ospool_schedd_name}"', file=sys.stderr)
            sys.exit(1)
        schedd_short_name = ospool_schedd_name.partition('.')[0]

        print(f'Trying schedd "{ospool_schedd_name}"', file=sys.stderr)
        try:
            for ospool_job_ad in ospool_schedd.query(projection=projection):
                attributes = extract_attributes(ospool_job_ad)
                group = (ospool_job_ad['JobUniverse'], ospool_job_ad['RequestCpus'],
                         attributes['request_memory'], attributes['request_disk'],
                         ospool_job_ad.get('RequestGpus', 0))
                tally = job_data.get(group)
                if tally is None:
                    tally = job_data[group] = {'schedds': {}, 'projects': {}, 'owners': {}, 'idle': 0, 'active': 0, 'held': 0, 'done': 0, 'ERR': 0}
                tally['schedds'][schedd_short_name] = True
                tally['projects'][attributes['project_name']] = True
                tally['owners'][ospool_job_ad['Owner']] = True
                tally[attributes['job_state_category']] += 1
        except htcondor.HTCondorIOError as e:
            # Best effort: record the failure and move on to the next schedd.
            # Jobs already tallied from this schedd before the error are kept.
            schedds_failed.append(ospool_schedd_name)
            print(f'Could not access schedd "{ospool_schedd_name}": {e}', file=sys.stderr)
            continue
        schedds_contacted.append(ospool_schedd_name)

    return (schedds_contacted, schedds_failed, job_data)
# --------------------------------------------------------------
# Main script: gather job tallies from the schedds and print them as CSV.
configure_htcondor()
ospool_schedds = get_schedds(OSPOOL_HOSTS)
contacted, failed, jobs = get_job_ads(ospool_schedds)

# The CSV is assembled in memory and emitted with a single print at the end.
csv_output = io.StringIO()
csv_writer = csv.writer(csv_output)
csv_header = [
    'Universe', 'Request CPUs', 'Request Memory', 'Request Disk', 'Request GPUs',
    'APs', 'Projects', 'Owners', '# Idle', '# Active', '# Held', '# Done', 'Errs',
]
csv_writer.writerow(csv_header)

for group, tallies in jobs.items():
    # The group tuple supplies the first five columns; the name collections
    # are flattened into comma-separated cells, followed by the counters.
    csv_row = [
        *group,
        ', '.join(tallies['schedds']),
        ', '.join(tallies['projects']),
        ', '.join(tallies['owners']),
        tallies['idle'],
        tallies['active'],
        tallies['held'],
        tallies['done'],
        tallies['ERR'],
    ]
    csv_writer.writerow(csv_row)

# csv.writer already terminates each row, so suppress print's own newline.
print(csv_output.getvalue(), end='')