import multiprocessing as mp from multiprocessing.managers import SyncManager import queue import copy import signal import pickle import traceback import socket import os import sys import time import datetime import math import psutil import fcntl import termios import struct import collections import numpy as np """jobmanager module Richard Hartmann 2014 This module provides an easy way to implement distributed computing based on the python class SyncManager for remote communication and the python module multiprocessing for local parallelism. class SIG_handler_Loop The class Loop provides as mechanism to spawn a process repeating to call a certain function as well as a StatusBar class for the terminal. class StatusBar The class JobManager_Server will provide a server process handling the following tasks: - providing a list (queue) of arguments to be processed by client processes (see put_arg and args_from_list) - handling the results of the calculations done by the client processes (see process_new_result) - when finished (all provided arguments have been processed and returned its result) process the obtained results (see process_final_result) The class JobManager_Client """ # a mapping from the numeric values of the signals to their names used in the # standard python module signals signal_dict = {} for s in dir(signal): if s.startswith('SIG') and s[3] != '_': n = getattr(signal, s) if n in signal_dict: signal_dict[n] += ('/'+s) else: signal_dict[n] = s # a list of all names of the implemented python signals all_signals = [s for s in dir(signal) if (s.startswith('SIG') and s[3] != '_')] def getDateForFileName(includePID = False): """returns the current date-time and optionally the process id in the format YYYY_MM_DD_hh_mm_ss_pid """ date = time.localtime() name = '{:d}_{:02d}_{:02d}_{:02d}_{:02d}_{:02d}'.format(date.tm_year, date.tm_mon, date.tm_mday, date.tm_hour, date.tm_min, date.tm_sec) if includePID: name += "_{}".format(os.getpid()) return name def humanize_time(secs): """convert second in to hh:mm:ss format """ mins, secs = divmod(secs, 60) hours, mins = divmod(mins, 60) return '{:02d}:{:02d}:{:02d}'.format(int(hours), int(mins), int(secs)) def humanize_speed(c_per_sec): """convert a speed in counts per second to counts per [s, min, h, d], choosing the smallest value greater zero. """ scales = [60, 60, 24] units = ['c/s', 'c/min', 'c/h', 'c/d'] speed = c_per_sec i = 0 if speed > 0: while (speed < 1) and (i < len(scales)): speed *= scales[i] i += 1 return "{:.1f}{}".format(speed, units[i]) def copyQueueToList(q): res_list = [] res_q = mp.Queue() try: while True: res_list.append(q.get_nowait()) res_q.put(res_list[-1]) except queue.Empty: pass return res_q, res_list class hashDict(dict): def __hash__(self): return hash(tuple(sorted(self.items()))) class hashableCopyOfNumpyArray(np.ndarray): def __new__(self, other): return np.ndarray.__new__(self, shape=other.shape, dtype=other.dtype) def __init__(self, other): self[:] = other[:] def __hash__(self): return hash(self.shape + tuple(self.flat)) def __eq__(self, other): return np.all(np.equal(self, other)) class SIG_handler_Loop(object): """class to setup signal handling for the Loop class Note: each subprocess receives the default signal handling from it's parent. If the signal function from the module signal is evoked within the subprocess this default behavior can be overwritten. The init function receives a shared memory boolean object which will be set false in case of signal detection. Since the Loop class will check the state of this boolean object before each repetition, the loop will stop when a signal was receives. """ def __init__(self, shared_mem_run, sigint, sigterm, identifier, verbose=0): self.shared_mem_run = shared_mem_run self.set_signal(signal.SIGINT, sigint) self.set_signal(signal.SIGTERM, sigterm) self.verbose=verbose self.identifier = identifier if self.verbose > 1: print("{}: setup signal handler for loop (SIGINT:{}, SIGTERM:{})".format(self.identifier, sigint, sigterm)) def set_signal(self, sig, handler_str): if handler_str == 'ign': signal.signal(sig, self._ignore_signal) elif handler_str == 'stop': signal.signal(sig, self._stop_on_signal) else: raise TypeError("unknown signal hander string '{}'".format(handler_str)) def _ignore_signal(self, signal, frame): pass def _stop_on_signal(self, signal, frame): if self.verbose > 0: print("{}: received sig {} -> set run false".format(self.identifier, signal_dict[signal])) self.shared_mem_run.value = False def get_identifier(name=None, pid=None): if pid == None: pid = os.getpid() if name == None: return "PID {}".format(pid) else: return "{} ({})".format(name, pid) def check_process_termination(proc, identifier, timeout, verbose=0, auto_kill_on_last_resort = False): if verbose > 1: print("{}: give running loop at most {}s to finish ... ".format(identifier, timeout), end='', flush=True) proc.join(timeout) if not proc.is_alive(): if verbose > 1: print("done") return True # process still runs -> send SIGTERM -> see what happens if verbose > 1: print("failed!") if verbose > 0: print("{}: found running loop still alive -> terminate via SIGTERM ...".format(identifier), end='', flush=True) proc.terminate() proc.join(3*timeout) if not proc.is_alive(): if verbose > 0: print("done!") return True if verbose > 0: print("failed!") answer = 'y' if auto_kill_on_last_resort else '_' while True: if answer == 'y': print("{}: send SIGKILL to".format(identifier)) os.kill(proc.pid, signal.SIGKILL) time.sleep(0.1) answer = '_' if not proc.is_alive(): print("{}: has stopped running!".format(identifier)) return True else: print("{}: still running!".format(identifier)) while not answer in 'yn': print("Do you want to send SIGKILL to '{}'? [y/n]: ".format(identifier), end='', flush=True) answer = sys.stdin.readline()[:-1] if answer == 'n': while not answer in 'yn': print("Do you want let the process '{}' running? [y/n]: ".format(identifier), end='', flush=True) answer = sys.stdin.readline()[:-1] if answer == 'y': print("{}: keeps running".format(self._identifier)) return False class Loop(object): """ class to run a function periodically an seperate process. In case the called function returns True, the loop will stop. Otherwise a time interval given by interval will be slept before another execution is triggered. The shared memory variable _run (accessible via the class property run) also determines if the function if executed another time. If set to False the execution stops. For safe cleanup (and in order to catch any Errors) it is advisable to instantiate this class using 'with' statement as follows: with Loop(**kwargs) as my_loop: my_loop.start() ... this will guarantee you that the spawned loop process is down when exiting the 'with' scope. The only circumstance where the process is still running is when you set auto_kill_on_last_resort to False and answer the question to send SIGKILL with no. """ def __init__(self, func, args=(), interval = 1, verbose=0, sigint='stop', sigterm='stop', name=None, auto_kill_on_last_resort=False): """ func [callable] - function to be called periodically args [tuple] - arguments passed to func when calling intervall [pos number] - time to "sleep" between each call verbose [pos integer] - specifies the level of verbosity [0--silent, 1--important information, 2--some debug info] sigint [string] - signal handler string to set SIGINT behavior (see below) sigterm [string] - signal handler string to set SIGTERM behavior (see below) name [string] - use this name in messages instead of the PID auto_kill_on_last_resort [bool] - If set False (default), ask user to send SIGKILL to loop process in case normal stop and SIGTERM failed. If set True, send SIDKILL without asking. the signal handler string may be one of the following ing: ignore the incoming signal stop: set the shared memory boolean to false -> prevents the loop from repeating -> subprocess terminates when func returns and sleep time interval has passed. """ self.func = func self.args = args self.interval = interval self._run = mp.Value('b', False) self.verbose = verbose self._proc = None self._sigint = sigint self._sigterm = sigterm self._name = name self._auto_kill_on_last_resort = auto_kill_on_last_resort self._identifier = None def __enter__(self): return self def __exit__(self, *exc_args): # normal exit if not self.is_alive(): if self.verbose > 1: print("{}: has stopped on context exit".format(self._identifier)) return # loop still runs on context exit -> __cleanup if self.verbose > 1: print("{}: is still running on context exit".format(self._identifier)) self.__cleanup() def __cleanup(self): """ Wait at most twice as long as the given repetition interval for the _wrapper_function to terminate. If after that time the _wrapper_function has not terminated, send SIGTERM to and the process. Wait at most five times as long as the given repetition interval for the _wrapper_function to terminate. If the process still running send SIGKILL automatically if auto_kill_on_last_resort was set True or ask the user to confirm sending SIGKILL """ # set run to False and wait some time -> see what happens self.run = False if check_process_termination(proc=self._proc, identifier=self._identifier, timeout=2*self.interval, verbose=self.verbose, auto_kill_on_last_resort=self._auto_kill_on_last_resort): if self.verbose > 1: print("{}: cleanup successful".format(self._identifier)) self._proc = None @staticmethod def _wrapper_func(func, args, shared_mem_run, interval, verbose, sigint, sigterm, name): """to be executed as a seperate process (that's why this functions is declared static) """ # implement the process specific signal handler identifier = get_identifier(name) SIG_handler_Loop(shared_mem_run, sigint, sigterm, identifier, verbose) while shared_mem_run.value: try: quit_loop = func(*args) except: err, val, trb = sys.exc_info() if verbose > 0: print("{}: error {} occurred in Loop class calling 'func(*args)'".format(identifier, err)) if verbose > 0: traceback.print_tb(trb) return if quit_loop: return time.sleep(interval) if verbose > 1: print("{}: _wrapper_func terminates gracefully".format(identifier)) def start(self): """ uses multiprocess Process to call _wrapper_func in subprocess """ if self.is_alive(): if self.verbose > 0: print("{}: is already running".format(self._identifier)) return self.run = True self._proc = mp.Process(target = Loop._wrapper_func, args = (self.func, self.args, self._run, self.interval, self.verbose, self._sigint, self._sigterm, self._name), name=self._name) self._proc.start() self._identifier = get_identifier(self._name, self.getpid()) if self.verbose > 1: print("{}: started as new loop process".format(self._identifier)) def stop(self): """ stops the process triggered by start Setting the shared memory boolean run to false, which should prevent the loop from repeating. Call __cleanup to make sure the process stopped. After that we could trigger start() again. """ self.run = False if not self.is_alive(): if self.verbose > 0: print("PID None: there is no running loop to stop") return self.__cleanup() def join(self, timeout): """ calls join for the spawned process with given timeout """ if self.is_alive(): return psutil.Process(self._pid).wait(timeout) def getpid(self): """ return the process id of the spawned process """ return self._proc.pid def is_alive(self): if self._proc == None: return False else: return self._proc.is_alive() @property def run(self): """ makes the shared memory boolean accessible as class attribute Set run False, the loop will stop repeating. Calling start, will set run True and start the loop again as a new process. """ return self._run.value @run.setter def run(self, run): self._run.value = run def UnsignedIntValue(val=0): return mp.Value('I', val, lock=True) class StatusBar(Loop): """ status bar in ascii art Uses Loop class to implement repeating function which shows the process based of the two shared memory values max_count and count. max_count is assumed to be the final state, whereas count represents the current state. The estimates time of arrival (ETA) will be calculated from a speed measurement given by the average over the last spee_calc_cycles calls of the looping function show_stat. """ def __init__(self, count, max_count, interval=1, speed_calc_cycles=10, width='auto', verbose=0, sigint='stop', sigterm='stop', name='statusbar'): """ The init will also start to display the status bar if run was set True. Otherwise use the inherited method start() to start the show_stat loop. stop() will stop to show the status bar. count [mp.Value] - shared memory to hold the current state max_count [mp.Value] - shared memory holding the final state, may change by external process without having to explicitly tell this class interval [int] - seconds to wait between progress print speed_calc_cycles [int] - use the current (time, count) as well as the (old_time, old_count) read by the show_stat function speed_calc_cycles calls before to calculate the speed as follows: s = count - old_count / (time - old_time) width [int/'auto'] - the number of characters used to show the status bar, use 'auto' to determine width from terminal information -> see __set_width verbose, sigint, sigterm -> see loop class """ assert isinstance(count, mp.sharedctypes.Synchronized) assert isinstance(max_count, mp.sharedctypes.Synchronized) self.start_time = mp.Value('d', time.time()) self.speed_calc_cycles = speed_calc_cycles self.q = mp.Queue() # queue to save the last speed_calc_cycles # (time, count) information to calculate speed self.max_count = max_count # multiprocessing value type self.count = count # multiprocessing value type self.interval = interval self.verbose = verbose self.name = name self.__set_width(width) # setup loop class super().__init__(func=StatusBar.show_stat, args=(self.count, self.start_time, self.max_count, self.width, self.speed_calc_cycles, self.q), interval=interval, verbose=verbose, sigint=sigint, sigterm=sigterm, name=name, auto_kill_on_last_resort=True) def __exit__(self, *exc_args): super().__exit__(*exc_args) StatusBar.show_stat(count=self.max_count, start_time=self.start_time, max_count=self.max_count, width=self.width, speed_calc_cycles=self.speed_calc_cycles, q=self.q) print() def __set_width(self, width): """ set the number of characters to be used to disply the status bar is set to 'auto' try to determine the width of the terminal used (experimental, depends on the terminal used, os dependens) use a width of 80 as fall back. """ if width == 'auto': try: hw = struct.unpack('hh', fcntl.ioctl(sys.stdin, termios.TIOCGWINSZ, '1234')) self.width = hw[1] except: if self.verbose > 0: print("{}: failed to determine the width of the terminal".format(get_identifier(name=self.name))) self.width = 80 else: self.width = width def set_args(self, interval=1, speed_calc_cycles=10, width='auto'): """ change some of the init arguments This will stop the loop, set changes and start the loop again. """ self.stop() self.interval = interval self.speed_calc_cycles = speed_calc_cycles self.__set_width(width) self.args = (self.count, self.start_time, self.max_count, self.width, self.speed_calc_cycles, self.q) self.start() @staticmethod def show_stat(count, start_time, max_count, width, speed_calc_cycles, q): """ the actual routine to bring the status to the screen """ count_value = count.value max_count_value = max_count.value if count_value == 0: start_time.value = time.time() print("\rwait for first count ...", end='', flush=True) return False else: current_time = time.time() start_time_value = start_time.value q.put((count_value, current_time)) if q.qsize() > speed_calc_cycles: old_count_value, old_time = q.get() else: old_count_value, old_time = 0, start_time_value tet = (current_time - start_time_value) speed = (count_value - old_count_value) / (current_time - old_time) if speed == 0: s3 = "] ETA --" else: eta = math.ceil((max_count_value - count_value) / speed) s3 = "] ETA {}".format(humanize_time(eta)) s1 = "\r{} [{}] [".format(humanize_time(tet), humanize_speed(speed)) l = len(s1) + len(s3) l2 = width - l - 1 a = int(l2 * count_value / max_count_value) b = l2 - a s2 = "="*a + ">" + " "*b print(s1+s2+s3, end='', flush=True) return count_value >= max_count_value def stop(self): print() super().stop() def setup_SIG_handler_manager(): """ When a process calls this functions, it's signal handler will be set to ignore the signals given by the list signals. This functions is passed to the SyncManager start routine (used by JobManager_Server) to enable graceful termination when received SIGINT or SIGTERM. The problem is, that the SyncManager start routine triggers a new process to provide shared memory object even over network communication. Since the SIGINT signal will be passed to all child processes, the default handling would make the SyncManger halt on KeyboardInterrupt Exception. As we want to shout down the SyncManager at the last stage of cleanup we have to prevent this default signal handling by passing this functions to the SyncManager start routine. """ Signal_to_SIG_IGN(signals=[signal.SIGINT, signal.SIGTERM], verbose=0) class Signal_to_SIG_IGN(object): def __init__(self, signals=[signal.SIGINT, signal.SIGTERM], verbose=0): self.verbose = verbose for s in signals: signal.signal(s, self._handler) def _handler(self, sig, frame): if self.verbose > 0: print("PID {}: received signal {} -> will be ignored".format(os.getpid(), signal_dict[sig])) class Signal_to_sys_exit(object): def __init__(self, signals=[signal.SIGINT, signal.SIGTERM], verbose=0): self.verbose = verbose for s in signals: signal.signal(s, self._handler) def _handler(self, signal, frame): if self.verbose > 0: print("PID {}: received signal {} -> call sys.exit -> raise SystemExit".format(os.getpid(), signal_dict[signal])) sys.exit('exit due to signal {}'.format(signal_dict[signal])) class Signal_to_terminate_process_list(object): """ SIGINT and SIGTERM will call terminate for process given in process_list """ def __init__(self, process_list, signals = [signal.SIGINT, signal.SIGTERM], verbose=0): self.process_list = process_list self.verbose = verbose for s in signals: signal.signal(s, self._handler) def _handler(self, signal, frame): if self.verbose > 0: print("PID {}: received sig {} -> terminate all given subprocesses".format(os.getpid(), signal_dict[signal])) for p in self.process_list: p.terminate() class JobManager_Server(object): """general usage: - init the JobManager_Server, start SyncManager server process - pass the arguments to be processed to the JobManager_Server (put_arg, args_from_list) - start the JobManager_Server (start), which means to wait for incoming results and to process them. Afterwards process all obtained data. The default behavior of handling each incoming new result is to simply add the pair (arg, result) to the final_result list. When finished the default final processing is to dump the final_result list to fname_for_final_result_dump To change this behavior you may subclass the JobManager_Server and implement - an extended __init__ to change the type of the final_result attribute - process_new_result - process_final_result(self) In case of any exceptions the JobManager_Server will call process_final_result and dump the unprocessed job_q as a list to fname_for_job_q_dump. Also the signal SIGTERM is caught. In such a case it will raise SystemExit exception will will then be handle in the try except clause. SystemExit and KeyboardInterrupt exceptions are not considered as failure. They are rather methods to shout down the Server gracefully. Therefore in such cases no traceback will be printed. All other exceptions are probably due to some failure in function. A traceback it printed to stderr. notes: - when the JobManager_Server gets killed (SIGKILL) and the SyncManager still lives, the port used will occupied. considere sudo natstat -pna | grep 42524 to find the process still using the port - also the SyncManager ignores SIGTERM and SIGINT signals so you have to send a SIGKILL. """ def __init__(self, authkey, const_arg=None, port=42524, verbose=1, msg_interval=1, fname_dump='auto', speed_calc_cycles=50): """ authkey [string] - authentication key used by the SyncManager. Server and Client must have the same authkey. const_arg [dict] - some constant keyword arguments additionally passed to the worker function (see JobManager_Client). port [int] - network port to use verbose [int] - 0: quiet, 1: status only, 2: debug messages msg_interval [int] - number of second for the status bar to update fname_for_final_result_dump [string/None] - sets the file name used to dump the the final_result. (None: do not dump, 'auto' choose filename 'YYYY_MM_DD_hh_mm_ss_final_result.dump') fname_for_args_dump [string/None] - sets the file name used to dump the list of unprocessed arguments, if there are any. (None: do not dump at all, 'auto' choose filename 'YYYY_MM_DD_hh_mm_ss_args.dump') fname_for_fail_dump [string/None] - sets the file name used to dump the list of not successfully processed arguments, if there are any. (None: do not dump, 'auto' choose filename 'YYYY_MM_DD_hh_mm_ss_fail.dump') This init actually starts the SyncManager as a new process. As a next step the job_q has to be filled, see put_arg(). """ self.verbose = verbose self._pid = os.getpid() self._pid_start = None self._identifier = get_identifier(name=self.__class__.__name__, pid=self._pid) if self.verbose > 1: print("{}: I'm the JobManager_Server main process".format(self._identifier)) self.__wait_before_stop = 2 self.port = port self.authkey = bytearray(authkey, encoding='utf8') self.const_arg = copy.copy(const_arg) self.fname_dump = fname_dump self.msg_interval = msg_interval self.speed_calc_cycles = speed_calc_cycles # to do some redundant checking, might be removed # the args_set holds all arguments to be processed # in contrast to the job_q, an argument will only be removed # from the set if it was caught by the result_q # so iff all results have been processed successfully, # the args_set will be empty self.args_set = set() # thread safe integer values self._numresults = mp.Value('i', 0) # count the successfully processed jobs self._numjobs = mp.Value('i', 0) # overall number of jobs # final result as list, other types can be achieved by subclassing self.final_result = [] # NOTE: it only works using multiprocessing.Queue() # the Queue class from the module queue does NOT work self.job_q = mp.Queue() # queue holding args to process self.result_q = mp.Queue() # queue holding returned results self.fail_q = mp.Queue() # queue holding args where processing failed self.manager = None self.__start_SyncManager() def __stop_SyncManager(self): if self.manager == None: return manager_proc = self.manager._process manager_identifier = get_identifier(name='SyncManager') # stop SyncManager self.manager.shutdown() check_process_termination(proc=manager_proc, identifier=manager_identifier, timeout=2, verbose=self.verbose, auto_kill_on_last_resort=True) def __start_SyncManager(self): class JobQueueManager(SyncManager): pass # make job_q, result_q, fail_q, const_arg available via network JobQueueManager.register('get_job_q', callable=lambda: self.job_q) JobQueueManager.register('get_result_q', callable=lambda: self.result_q) JobQueueManager.register('get_fail_q', callable=lambda: self.fail_q) JobQueueManager.register('get_const_arg', callable=lambda: self.const_arg, exposed=["__iter__"]) address=('', self.port) #ip='' means local authkey=self.authkey self.manager = JobQueueManager(address, authkey) # start manager with non default signal handling given by # the additional init function setup_SIG_handler_manager self.manager.start(setup_SIG_handler_manager) self.hostname = socket.gethostname() if self.verbose > 1: print("{}: started on {}:{} with authkey '{}'".format(get_identifier('SyncManager', self.manager._process.pid), self.hostname, self.port, str(authkey, encoding='utf8'))) def __restart_SyncManager(self): self.__stop_SyncManager() self.__start_SyncManager() def __enter__(self): return self def __exit__(self, err, val, trb): # KeyboardInterrupt via SIGINT will be mapped to SystemExit # SystemExit is considered non erroneous if err == SystemExit: if self.verbose > 0: print("{}: normal shutdown caused by SystemExit".format(self._identifier)) # no exception traceback will be printed elif err != None: # causes exception traceback to be printed traceback.print_exception(err, val, trb) # bring everything down, dump status to file self.shoutdown() return True @property def numjobs(self): return self._numjobs.value @numjobs.setter def numjobs(self, numjobs): self._numjobs.value = numjobs @property def numresults(self): return self._numresults.value @numresults.setter def numresults(self, numresults): self._numresults.value = numresults def shoutdown(self): """"stop all spawned processes and clean up - call process_final_result to handle all collected result - if job_q is not empty dump remaining job_q """ # will only be False when _shutdown was started in subprocess # start also makes sure that it was not started as subprocess # so at default behavior this assertion will allays be True assert self._pid == os.getpid() self.__stop_SyncManager() # do user defined final processing self.process_final_result() if self.fname_dump != None: if self.verbose > 0: print("{}: dump current state ... ".format(self._identifier), end='', flush=True) if self.fname_dump == 'auto': fname = "{}_{}.dump".format(self.authkey.decode('utf8'), getDateForFileName(includePID=False)) else: fname = self.fname_dump with open(fname, 'wb') as f: self.__dump(f) if self.verbose > 0: print("done!") else: if self.verbose > 0: print("{}: fname_dump == None, ignore dumping current state!".format(self._identifier)) print("{}: JobManager_Server was successfully shout down".format(self._identifier)) @staticmethod def static_load(f): data = {} data['numjobs'] = pickle.load(f) data['numresults'] = pickle.load(f) data['final_result'] = pickle.load(f) data['args_set'] = pickle.load(f) fail_list = pickle.load(f) data['fail_set'] = {fail_item[0] for fail_item in fail_list} data['fail_q'] = mp.Queue() data['job_q'] = mp.Queue() for fail_item in fail_list: data['fail_q'].put_nowait(fail_item) for arg in (data['args_set'] - data['fail_set']): data['job_q'].put_nowait(arg) return data def __load(self, f): data = JobManager_Server.static_load(f) for key in ['numjobs', 'numresults', 'final_result', 'args_set', 'fail_q','job_q']: self.__setattr__(key, data[key]) def __dump(self, f): pickle.dump(self.numjobs, f, protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(self.numresults, f, protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(self.final_result, f, protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(self.args_set, f, protocol=pickle.HIGHEST_PROTOCOL) fail_list = [] try: while True: fail_list.append(self.fail_q.get_nowait()) except queue.Empty: pass pickle.dump(fail_list, f, protocol=pickle.HIGHEST_PROTOCOL) def read_old_state(self, fname_dump=None): if fname_dump == None: fname_dump = self.fname_dump if fname_dump == 'auto': raise RuntimeError("fname_dump must not be 'auto' when reading old state") if not os.path.isfile(fname_dump): raise RuntimeError("file '{}' to read old state from not found".format(fname_dump)) with open(fname_dump, 'rb') as f: self.__load(f) self.__restart_SyncManager() def put_arg(self, a): """add argument a to the job_q """ if (not hasattr(a, '__hash__')) or (a.__hash__ == None): # try to add hashability if isinstance(a, dict): a = hashDict(a) else: raise AttributeError("'{}' is not hashable".format(type(a))) self.args_set.add(copy.copy(a)) self.job_q.put(copy.copy(a)) with self._numjobs.get_lock(): self._numjobs.value += 1 def args_from_list(self, args): """serialize a list of arguments to the job_q """ for a in args: self.put_arg(a) def process_new_result(self, arg, result): """Will be called when the result_q has data available. result is the computed result to the argument arg. Should be overwritten by subclassing! """ self.final_result.append((arg, result)) def process_final_result(self): """to implement user defined final processing""" pass def start(self): """ starts to loop over incoming results When finished, or on exception call stop() afterwards to shout down gracefully. """ if self._pid != os.getpid(): raise RuntimeError("do not run JobManager_Server.start() in a subprocess") if (self.numjobs - self.numresults) != len(self.args_set): raise RuntimeError("inconsistency detected! use JobManager_Server.put_arg to put arguments to the job_q") if self.numjobs == 0: raise RuntimeError("no jobs to process! use JobManager_Server.put_arg to put arguments to the job_q") Signal_to_sys_exit(signals=[signal.SIGTERM, signal.SIGINT], verbose = self.verbose) pid = os.getpid() if self.verbose > 1: print("{}: start processing incoming results".format(self._identifier)) with StatusBar(count = self._numresults, max_count = self._numjobs, interval=self.msg_interval, speed_calc_cycles=self.speed_calc_cycles, verbose = self.verbose, sigint='ign', sigterm='ign') as stat: stat.start() while (len(self.args_set) - self.fail_q.qsize()) > 0: try: arg, result = self.result_q.get(timeout=1) self.args_set.remove(arg) self.process_new_result(arg, result) self.numresults = self.numjobs - (len(self.args_set) - self.fail_q.qsize()) except queue.Empty: pass if self.verbose > 1: print("{}: wait {}s before trigger clean up".format(self._identifier, self.__wait_before_stop)) time.sleep(self.__wait_before_stop) class JobManager_Client(object): """ Calls the functions self.func with arguments fetched from the job_q. You should subclass this class and overwrite func to handle your own function. The job_q is provided by the SycnManager who connects to a SyncManager setup by the JobManager_Server. Spawns nproc subprocesses (__worker_func) to process arguments. Each subprocess gets an argument from the job_q, processes it and puts the result to the result_q. If the job_q is empty, terminate the subprocess. In case of any failure detected within the try except clause the argument, which just failed to process, the error and the hostname are put to the fail_q so the JobManager_Server can take care of that. After that the traceback is written to a file with name traceback_args__err____
____.trb. Then the process will terminate. """ def __init__(self, server, authkey, port = 42524, nproc = 0, nice=19, no_warings=False, verbose=1): """ server [string] - ip address or hostname where the JobManager_Server is running authkey [string] - authentication key used by the SyncManager. Server and Client must have the same authkey. port [int] - network port to use nproc [integer] - number of subprocesses to start positive integer: number of processes to spawn zero: number of spawned processes == number cpu cores negative integer: number of spawned processes == number cpu cores - |nproc| nice [integer] - niceness of the subprocesses no_warnings [bool] - call warnings.filterwarnings("ignore") -> all warnings are ignored verbose [int] - 0: quiet, 1: status only, 2: debug messages """ self.verbose = verbose self._pid = os.getpid() self._identifier = get_identifier(name=self.__class__.__name__, pid=self._pid) if self.verbose > 1: print("{}: init".format(self._identifier)) if no_warings: import warnings warnings.filterwarnings("ignore") if self.verbose > 1: print("{}: ignore all warnings".format(self._identifier)) self.server = server self.authkey = bytearray(authkey, encoding='utf8') self.port = port self.nice = nice if nproc > 0: self.nproc = nproc else: self.nproc = mp.cpu_count() + nproc assert self.nproc > 0 self.procs = [] self.manager_objects = self.get_manager_objects() def get_manager_objects(self): return JobManager_Client._get_manager_objects(self.server, self.port, self.authkey, self._identifier, self.verbose) @staticmethod def _get_manager_objects(server, port, authkey, identifier, verbose=0): """ connects to the server and get registered shared objects such as job_q, result_q, fail_q, const_arg """ class ServerQueueManager(SyncManager): pass ServerQueueManager.register('get_job_q') ServerQueueManager.register('get_result_q') ServerQueueManager.register('get_fail_q') ServerQueueManager.register('get_const_arg', exposed="__iter__") manager = ServerQueueManager(address=(server, port), authkey=authkey) if verbose > 0: print('{}: connecting to {}:{} authkey {}... '.format(identifier, server, port, authkey.decode('utf8')), end='', flush=True) try: manager.connect() except: if verbose > 0: print('failed!') err, val, trb = sys.exc_info() print("caught exception {}: {}".format(err.__name__, val)) if err == ConnectionRefusedError: print("check if server is up!") if verbose > 1: traceback.print_exception(err, val, trb) return None else: if verbose > 0: print('done!') job_q = manager.get_job_q() if verbose > 1: print("{}: found job_q with {} jobs".format(identifier, job_q.qsize())) result_q = manager.get_result_q() fail_q = manager.get_fail_q() const_arg = manager.get_const_arg() return job_q, result_q, fail_q, const_arg @staticmethod def func(arg, const_arg): """ function to be called by the worker processes arg - provided by the job_q of the JobManager_Server const_arg - tuple of constant argruments also provided by the JobManager_Server NOTE: This is just some dummy implementation to be used for test reasons only! Subclass and overwrite this function to implement your own function. """ time.sleep(0.1) return os.getpid() @staticmethod def __worker_func(func, nice, verbose, server, port, authkey, i, manager_objects=None): """ the wrapper spawned nproc trimes calling and handling self.func """ identifier = get_identifier(name='worker{}'.format(i+1)) Signal_to_sys_exit(signals=[signal.SIGTERM, signal.SIGINT]) if manager_objects is None: manager_objects = JobManager_Client._get_manager_object(server, port, authkey, identifier, verbose) if res == None: if verbose > 1: print("{}: no shared object recieved, terminate!".format(identifier)) sys.exit(1) job_q, result_q, fail_q, const_arg = manager_objects n = os.nice(0) n = os.nice(nice - n) c = 0 if verbose > 1: print("{}: now alive, niceness {}".format(identifier, n)) time_queue = 0 time_calc = 0 while True: try: t0 = time.clock() arg = job_q.get(block = True, timeout = 0.1) t1 = time.clock() res = func(arg, const_arg) t2 = time.clock() result_q.put((arg, res)) t3 = time.clock() c += 1 time_queue += (t1-t0 + t3-t2) time_calc += (t2-t1) # regular case, just stop woring when empty job_q was found except queue.Empty: if verbose > 0: print("{}: finds empty job queue, processed {} jobs".format(identifier, c)) break # in this context usually raised if the communication to the server fails except EOFError: if verbose > 0: print("{}: EOFError, I guess the server went down, can't do anything, terminate now!".format(identifier)) break # considered as normal exit caused by some user interaction, SIGINT, SIGTERM # note SIGINT, SIGTERM -> SystemExit is achieved by overwriting the # default signal handlers except SystemExit: if verbose > 0: print("{}: SystemExit, quit processing, reinsert current argument".format(identifier)) try: if verbose > 1: print("{}: put argument back to job_q ... ".format(identifier), end='', flush=True) job_q.put(arg, timeout=10) except queue.Full: if verbose > 0: print("{}: failed to reinsert argument, Server down? I quit!".format(identifier)) else: if verbose > 1: print("done!") break # some unexpected Exception # write argument, exception name and hostname to fail_q, save traceback # continue workung except: err, val, trb = sys.exc_info() if verbose > 0: print("{}: caught exception '{}', report failure of current argument to server ... ".format(identifier, err.__name__), end='', flush=True) hostname = socket.gethostname() fname = 'traceback_err_{}_{}.trb'.format(err.__name__, getDateForFileName(includePID=True)) fail_q.put((arg, err.__name__, hostname), timeout=10) if verbose > 0: print("done") print(" write exception to file {} ... ".format(fname), end='', flush=True) with open(fname, 'w') as f: traceback.print_exception(etype=err, value=val, tb=trb, file=f) if verbose > 0: print("done") print(" continue processing next argument.") if verbose > 0: try: print("{}: calculation:{:.2%} communication:{:.2%}".format(identifier, time_calc/(time_calc+time_queue), time_queue/(time_calc+time_queue))) except: pass if verbose > 1: print("{}: JobManager_Client.__worker_func terminates".format(identifier)) def start(self): """ starts a number of nproc subprocess to work on the job_q SIGTERM and SIGINT are managed to terminate all subprocesses retruns when all subprocesses have terminated """ if self.verbose > 1: print("{}: start {} processes to work on the remote queue".format(self._identifier, self.nproc)) for i in range(self.nproc): p = mp.Process(target=self.__worker_func, args=(self.func, self.nice, self.verbose, self.server, self.port, self.authkey, i, self.manager_objects)) self.procs.append(p) p.start() time.sleep(0.3) Signal_to_terminate_process_list(process_list = self.procs, verbose=self.verbose) for p in self.procs: p.join()