Документ взят из кэша поисковой машины. Адрес оригинального документа : http://www.stsci.edu/spst/UnixTransition/doc/check_grid.py
Дата изменения: Fri Apr 8 12:46:10 2016
Дата индексирования: Mon Apr 11 00:46:43 2016
Кодировка:

Поисковые слова: enceladus
#
#MODULE check_grid
#
#************************************************************************
"""
**PURPOSE** --
To check the requested GridEngine queues are usable and not in a state
that prevents their use. If problems are found, then send email to
the provided list of names. Also, optionally, use a persistent
problem file so that future runs only send email about new problems.

**DEVELOPER** --
Merle Reinhart

**MODIFICATION HISTORY** --

o Initial implementation - 2/17/16
o Add identification of which GE Cluster - mdr 2/18/16
o Fix bug when there is an error condition - mdr 2/18/16
"""
#************************************************************************

# Module version string; it appears to encode the date of the last
# modification as YY.MM.DD (the history above ends on 2/18/16).
__version__ = "16.02.18"

def _parse_name_list(spec):
    """Normalize a string / file name / iterable into a list of names.

    Inputs:
       spec - a comma separated string, the name of an existing file holding
              names separated by commas and/or newlines, or any iterable

    Returns:
       A list of names.  For string and file input every name is stripped
       of surrounding whitespace and empty entries are dropped.
    """

    import os

    if (not isinstance(spec, str)):
        # Already a collection, just make it into a list object
        return list(spec)
    # end if

    if (os.path.isfile(spec)):
        # One or more names per line, lines may themselves be comma separated
        with open(spec, 'r') as handle:
            raw = handle.read().replace('\n', ',')
        # end with
    else:
        # Nope, just a regular string
        raw = spec.replace(' ', '')
    # end if

    # Blank lines, trailing commas and padded entries would otherwise turn
    # into bogus queue names / email addresses.
    return [name.strip() for name in raw.split(',') if name.strip()]
# end def _parse_name_list

def run (queues, email_list, persistent_file=None):
    """This tool will check that the requested GridEngine queues are in a
    functional state or not.  If not, then an email will be sent to
    the addresses contained in the input email list.

    Inputs:
       queues          - queues to check - string, list or file
       email_list      - email address to notify of problems - string, list or file
       persistent_file - optional persistent file that holds any problem queue names
                         from the previous run (used to suppress extra email between
                         multiple runs)
    """

    import os
    import time
    import spss_sys_util
    import file_util

    # Figure out what queues and email_list are (string, file or list)
    queue_list = _parse_name_list(queues)
    email_addr_list = _parse_name_list(email_list)

    # Get any problems in the previous run
    if (persistent_file and os.path.exists(persistent_file)):
        prev_probs = get_previous_problems(persistent_file)
    else:
        prev_probs = []
    # end if

    curr_probs = []
    problem_found = False
    GE_Cell = spss_sys_util.get_environ_variable('SGE_CELL')[0]
    GE_Root = spss_sys_util.get_environ_variable('SGE_ROOT')[0]
    GE_Cluster = spss_sys_util.get_environ_variable('SGE_CLUSTER_NAME')[0]
    report_time = time.asctime()
    print("GridEngine Cluster: %-25s Time: %s" % (GE_Cluster,report_time))

    tmpfile = file_util.tempfile('txt')
    try:
        # The report is built in a temporary file which becomes the mail body
        with open(tmpfile,'w') as wf:
            wf.write("Time: %s\n" % report_time)
            wf.write("Grid Engine Cluster: %s\n" % GE_Cluster)
            wf.write("Grid Engine Root: %s\n" % GE_Root)
            wf.write("Grid Engine Cell: %s\n\n" % GE_Cell)

            for q in queue_list:
                print("Checking %s" % q)
                qresult = check_a_queue(q)
                if (qresult is None):
                    # The query itself failed, always report that
                    problem_found = True
                    wf.write("%s: Error querying GridEngine\n\n" % q)
                    print("%s: Error querying GridEngine" % q)
                    continue
                # end if
                if (not qresult['problem']):
                    continue
                # end if

                # Found a problem
                curr_probs.append(qresult['queue'])
                # If it has already been reported, then skip
                if (qresult['queue'] in prev_probs):
                    print(" Problem already reported")
                else:
                    problem_found = True
                    print(" Problem found")
                    wf.write("%s: %s\n" % (qresult['queue'], qresult['problem_cause']))
                    wf.write(qresult['details'] + "\n\n")
                # end if
            # end for q
        # end with

        if (problem_found):
            print("Problems found...")
            # email the problems
            msubj = "Grid Engine Problems with Cluster: %s Cell: %s" % (GE_Cluster,GE_Cell)
            try:
                spss_sys_util.mail(email_addr_list, msubj, tmpfile, 1)
            except Exception as mail_error:
                # Report the failure itself; the result of mail() does not
                # exist when the call raised.
                print("Problems sending the email...")
                print(mail_error)
            else:
                print("Email sent")
            # end try
        else:
            if (len(curr_probs) != 0):
                print("All problems previously reported\n")
            else:
                print("No problems found\n")
            # end if
        # end if

        # Write out current problems to the persistent file
        if (persistent_file):
            write_current_problems(persistent_file, curr_probs)
        # end if
    finally:
        # Never leave the temporary report behind, even on an error
        if (os.path.exists(tmpfile)):
            os.remove(tmpfile)
        # end if
    # end try

    return
# end def run

def check_a_queue(queue):
    """This function will check the state of the specified queue.

    The cluster queue summary is taken from "qstat -g c -ext" and the
    third line of its output is parsed into the per-state slot counts.

    Inputs:
       queue - name of the queue to check - string

    Returns:
       A dictionary of the state of the requested queue with the keys
       'queue', 'qload', the 'slots_*' counters, 'problem' (bool),
       'problem_cause' (string) and 'details' (output of the running
       jobs query).  None is returned when the query fails or its output
       cannot be parsed.
    """

    import spss_sys_util

    # Names of the integer columns, in output order, following the
    # queue name (column 0) and the load (column 1)
    slot_columns = ('slots_used',
                    'slots_reserved',
                    'slots_available',
                    'slots_total',
                    'slots_aoACDS',
                    'slots_cdsuE',
                    'slots_suspended',
                    'slots_Alarmed_suspend_threshold',
                    'slots_Subordinate_suspended',
                    'slots_Calendar_suspended',
                    'slots_unknown',
                    'slots_alarmed_load_threshold',
                    'slots_disabled',
                    'slots_Disabled_calendar',
                    'slots_configuration_ambiguous',
                    'slots_orphaned',
                    'slots_Errored')

    # Do the initial query for the queue
    qc1 = "qstat -g c -ext -q %s" % queue
    qc1_status, qc1_results = spss_sys_util.command(qc1)
    if (qc1_status is not None):
        return None
    # end if

    # Now parse the result, the data is on the third line of the output
    lines = qc1_results.split("\n")
    if (len(lines) < 3):
        return None
    # end if
    tstatus = lines[2].split()
    if (len(tstatus) < len(slot_columns) + 2):
        # Not the table we expected (e.g. unknown queue), treat as an error
        return None
    # end if

    queue_status = {}
    queue_status['queue'] = tstatus[0]
    try:
        queue_status['qload'] = float(tstatus[1])
    except ValueError:
        # NOTE(review): qstat is believed to print a placeholder such as
        # "-NA-" when no load value is available - confirm.  The load is
        # informational only, so do not fail the whole check on it.
        queue_status['qload'] = None
    # end try
    try:
        for offset, column in enumerate(slot_columns):
            queue_status[column] = int(tstatus[offset + 2])
        # end for
    except ValueError:
        return None
    # end try
    queue_status['problem'] = False
    queue_status['problem_cause'] = 'No Problems Found'

    # The aoACDS states will clear themselves autonomously
    # The cdsuE states will require human interaction to clear
    # so only pay attention to the cdsuE value
    # specifically we want the d - disabled, s - suspended,
    # E - Error values.
    # Basically, if the number of available slots plus the
    # number of used slots is not greater than zero, we have an issue
    usable = queue_status['slots_used'] + queue_status['slots_available']
    errored = queue_status['slots_Errored']
    if ((usable <= 0) or (errored != 0)):
        total = queue_status['slots_total']
        disabled = queue_status['slots_disabled'] + queue_status['slots_Disabled_calendar']
        suspended = queue_status['slots_suspended'] + queue_status['slots_Calendar_suspended']

        # Fallback for the case where none of the specific causes below
        # applies (e.g. every slot is in an unknown or orphaned state)
        cause = 'No slots are used or available'
        # Later matches intentionally override earlier ones
        if (errored != 0):
            cause = 'Queue in Error State'
        # end if
        if (disabled >= total):
            cause = 'All slots have been disabled'
        # end if
        if (suspended >= total):
            cause = 'All slots have been suspended'
        # end if
        queue_status['problem'] = True
        queue_status['problem_cause'] = cause
    # end if

    # Get details of jobs currently running in the queue by host
    qc2 = 'qstat -f -q %s -s r -u "*"' % queue
    qc2_status, qc2_results = spss_sys_util.command(qc2)
    if (qc2_status is None):
        queue_status['details'] = qc2_results
    else:
        queue_status['details'] = "Error: %s\n%s" % (qc2_status, qc2_results)
    # end if

    return queue_status
# end def check_a_queue

def get_previous_problems(pers_file):
    """This function will read a file that contains the names that had
    problems during the previous run.

    Inputs:
       pers_file - the persistent file to read in (one name per line)

    Returns:
       List of names that had prior problems, one entry per line of the
       file with surrounding whitespace removed

    Raises:
       IOError if pers_file does not exist
    """

    import os.path

    if (not os.path.exists(pers_file)):
        # Raise an exception instance; raising a (class, message) tuple is
        # a TypeError on Python 3 and drops the message on Python 2.
        raise IOError("File %s was not found" % pers_file)
    # end if

    with open(pers_file,'r') as handle:
        prev_probs = [line.strip() for line in handle]
    # end with

    return prev_probs
# end def get_previous_problems

def write_current_problems(pers_file, problems):
    """Record the names that currently have problems in a file.

    The file is overwritten; each name is written on its own line.

    Inputs:
       pers_file - the persistent file to write to
       problems  - list of the names that have problems

    Returns:
       Nothing
    """

    with open(pers_file,'w') as handle:
        handle.writelines("%s\n" % name for name in problems)
    # end with

    return
# end def write_current_problems




if __name__ == '__main__':
    # Command line use: check_grid.py queues email_list [persistent_file]
    import sys
    cli_args = sys.argv[1:]
    # Two required arguments plus one optional argument
    if len(cli_args) in (2, 3):
        print(sys.argv)
        run(*cli_args)
    else:
        print(">>> Wrong number of input parameters provided <<<\n")
        print(run.__doc__)