import multiprocessing as mp
from multiprocessing.managers import SyncManager
import queue
import copy
import signal
import pickle
import traceback
import socket
import os
import sys
import time
import datetime
import math
import psutil
import fcntl
import termios
import struct
import collections
import numpy as np

"""jobmanager module
|
|
|
|
Richard Hartmann 2014
|
|
|
|
|
|
This module provides an easy way to implement distributed computing
|
|
based on the python class SyncManager for remote communication
|
|
and the python module multiprocessing for local parallelism.
|
|
|
|
class SIG_handler_Loop
|
|
|
|
The class Loop provides as mechanism to spawn a process repeating to
|
|
call a certain function as well as a StatusBar class for the terminal.
|
|
|
|
class StatusBar
|
|
|
|
The class JobManager_Server will provide a server process handling the
|
|
following tasks:
|
|
- providing a list (queue) of arguments to be processed by client processes
|
|
(see put_arg and args_from_list)
|
|
- handling the results of the calculations done by the client processes
|
|
(see process_new_result)
|
|
- when finished (all provided arguments have been processed and returned its result)
|
|
process the obtained results (see process_final_result)
|
|
|
|
The class JobManager_Client
|
|
|
|
"""
|
|
|
|
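# Illustrative overview (a sketch, not a fixed recipe): a server process is
# filled with arguments and then waits for results, while one or more client
# processes (possibly on other machines) fetch arguments and return results.
#
#     # server side
#     with JobManager_Server(authkey='simple_demo') as jm_server:
#         jm_server.args_from_list(range(100))
#         jm_server.start()
#
#     # client side (subclass JobManager_Client and overwrite func for real work)
#     jm_client = JobManager_Client(server='localhost', authkey='simple_demo')
#     jm_client.start()
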
# a mapping from the numeric values of the signals to their names used in the
# standard python module signal
signal_dict = {}
for s in dir(signal):
    if s.startswith('SIG') and s[3] != '_':
        n = getattr(signal, s)
        if n in signal_dict:
            signal_dict[n] += ('/'+s)
        else:
            signal_dict[n] = s

# a list of all names of the implemented python signals
all_signals = [s for s in dir(signal) if (s.startswith('SIG') and s[3] != '_')]

def getDateForFileName(includePID = False):
    """returns the current date-time and optionally the process id in the format
    YYYY_MM_DD_hh_mm_ss_pid
    """
    date = time.localtime()
    name = '{:d}_{:02d}_{:02d}_{:02d}_{:02d}_{:02d}'.format(date.tm_year, date.tm_mon, date.tm_mday, date.tm_hour, date.tm_min, date.tm_sec)
    if includePID:
        name += "_{}".format(os.getpid())
    return name

def humanize_time(secs):
    """convert seconds into hh:mm:ss format
    """
    mins, secs = divmod(secs, 60)
    hours, mins = divmod(mins, 60)
    return '{:02d}:{:02d}:{:02d}'.format(int(hours), int(mins), int(secs))

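# Example (illustrative): humanize_time(3661) returns '01:01:01',
# humanize_time(75.3) returns '00:01:15'.
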
def humanize_speed(c_per_sec):
    """convert a speed in counts per second to counts per [s, min, h, d],
    choosing the smallest unit for which the value is at least one.
    """
    scales = [60, 60, 24]
    units = ['c/s', 'c/min', 'c/h', 'c/d']
    speed = c_per_sec
    i = 0
    if speed > 0:
        while (speed < 1) and (i < len(scales)):
            speed *= scales[i]
            i += 1

    return "{:.1f}{}".format(speed, units[i])

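# Example (illustrative): humanize_speed(2.0) returns '2.0c/s', while
# humanize_speed(0.003) scales up twice (0.003 c/s -> 0.18 c/min -> 10.8 c/h)
# and returns '10.8c/h'.
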
def copyQueueToList(q):
    res_list = []
    res_q = mp.Queue()

    try:
        while True:
            res_list.append(q.get_nowait())
            res_q.put(res_list[-1])
    except queue.Empty:
        pass

    return res_q, res_list

class hashDict(dict):
    def __hash__(self):
        return hash(tuple(sorted(self.items())))

class hashableCopyOfNumpyArray(np.ndarray):
    def __new__(cls, other):
        return np.ndarray.__new__(cls, shape=other.shape, dtype=other.dtype)

    def __init__(self, other):
        self[:] = other[:]

    def __hash__(self):
        return hash(self.shape + tuple(self.flat))

    def __eq__(self, other):
        return np.all(np.equal(self, other))

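# Example (illustrative): plain numpy arrays are not hashable, so they cannot
# be used as job arguments directly; wrapping them makes them usable with
# JobManager_Server.put_arg.
#
#     a = hashableCopyOfNumpyArray(np.array([1, 2, 3]))
#     hash(a)                     # works, based on shape and content
#     a == np.array([1, 2, 3])    # True (element-wise equality reduced by np.all)
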
class SIG_handler_Loop(object):
    """class to setup signal handling for the Loop class

    Note: each subprocess receives the default signal handling from its parent.
    If the signal function from the module signal is invoked within the subprocess
    this default behavior can be overwritten.

    The init function receives a shared memory boolean object which will be set
    false in case of signal detection. Since the Loop class will check the state
    of this boolean object before each repetition, the loop will stop when
    a signal was received.
    """
    def __init__(self, shared_mem_run, sigint, sigterm, identifier, verbose=0):
        self.shared_mem_run = shared_mem_run
        self.set_signal(signal.SIGINT, sigint)
        self.set_signal(signal.SIGTERM, sigterm)
        self.verbose=verbose
        self.identifier = identifier
        if self.verbose > 1:
            print("{}: setup signal handler for loop (SIGINT:{}, SIGTERM:{})".format(self.identifier, sigint, sigterm))

    def set_signal(self, sig, handler_str):
        if handler_str == 'ign':
            signal.signal(sig, self._ignore_signal)
        elif handler_str == 'stop':
            signal.signal(sig, self._stop_on_signal)
        else:
            raise TypeError("unknown signal handler string '{}'".format(handler_str))

    def _ignore_signal(self, signal, frame):
        pass

    def _stop_on_signal(self, signal, frame):
        if self.verbose > 0:
            print("{}: received sig {} -> set run false".format(self.identifier, signal_dict[signal]))
        self.shared_mem_run.value = False

def get_identifier(name=None, pid=None):
    if pid is None:
        pid = os.getpid()

    if name is None:
        return "PID {}".format(pid)
    else:
        return "{} ({})".format(name, pid)

def check_process_termination(proc, identifier, timeout, verbose=0, auto_kill_on_last_resort = False):
    if verbose > 1:
        print("{}: give running loop at most {}s to finish ... ".format(identifier, timeout), end='', flush=True)

    proc.join(timeout)

    if not proc.is_alive():
        if verbose > 1:
            print("done")
        return True

    # process still runs -> send SIGTERM -> see what happens
    if verbose > 1:
        print("failed!")
    if verbose > 0:
        print("{}: found running loop still alive -> terminate via SIGTERM ...".format(identifier), end='', flush=True)

    proc.terminate()

    proc.join(3*timeout)
    if not proc.is_alive():
        if verbose > 0:
            print("done!")
        return True

    if verbose > 0:
        print("failed!")

    answer = 'y' if auto_kill_on_last_resort else '_'
    while True:
        if answer == 'y':
            print("{}: sending SIGKILL".format(identifier))
            os.kill(proc.pid, signal.SIGKILL)
            time.sleep(0.1)
            answer = '_'

        if not proc.is_alive():
            print("{}: has stopped running!".format(identifier))
            return True
        else:
            print("{}: still running!".format(identifier))

        while answer not in ('y', 'n'):
            print("Do you want to send SIGKILL to '{}'? [y/n]: ".format(identifier), end='', flush=True)
            answer = sys.stdin.readline()[:-1]

        if answer == 'n':
            answer = '_'
            while answer not in ('y', 'n'):
                print("Do you want to let the process '{}' keep running? [y/n]: ".format(identifier), end='', flush=True)
                answer = sys.stdin.readline()[:-1]
            if answer == 'y':
                print("{}: keeps running".format(identifier))
                return False

class Loop(object):
    """
    class to run a function periodically in a separate process.

    In case the called function returns True, the loop will stop.
    Otherwise the time interval given by interval will be slept before
    another execution is triggered.

    The shared memory variable _run (accessible via the class property run)
    also determines if the function is executed another time. If set to False
    the execution stops.

    For safe cleanup (and in order to catch any Errors)
    it is advisable to instantiate this class
    using the 'with' statement as follows:

        with Loop(**kwargs) as my_loop:
            my_loop.start()
            ...

    this will guarantee that the spawned loop process is
    down when exiting the 'with' scope.

    The only circumstance where the process is still running is
    when you set auto_kill_on_last_resort to False and answer the
    question to send SIGKILL with no.
    """

    def __init__(self,
                 func,
                 args=(),
                 interval = 1,
                 verbose=0,
                 sigint='stop',
                 sigterm='stop',
                 name=None,
                 auto_kill_on_last_resort=False):
        """
        func [callable] - function to be called periodically

        args [tuple] - arguments passed to func when calling

        interval [pos number] - time to "sleep" between each call

        verbose [pos integer] - specifies the level of verbosity
          [0--silent, 1--important information, 2--some debug info]

        sigint [string] - signal handler string to set SIGINT behavior (see below)

        sigterm [string] - signal handler string to set SIGTERM behavior (see below)

        name [string] - use this name in messages instead of the PID

        auto_kill_on_last_resort [bool] - If set False (default), ask user to send SIGKILL
          to the loop process in case normal stop and SIGTERM failed. If set True, send
          SIGKILL without asking.

        the signal handler string may be one of the following
            ign: ignore the incoming signal
            stop: set the shared memory boolean to false -> prevents the loop from
              repeating -> subprocess terminates when func returns and the sleep
              time interval has passed.
        """
        self.func = func
        self.args = args
        self.interval = interval
        self._run = mp.Value('b', False)
        self.verbose = verbose
        self._proc = None
        self._sigint = sigint
        self._sigterm = sigterm
        self._name = name
        self._auto_kill_on_last_resort = auto_kill_on_last_resort
        self._identifier = None

    def __enter__(self):
        return self

    def __exit__(self, *exc_args):
        # normal exit
        if not self.is_alive():
            if self.verbose > 1:
                print("{}: has stopped on context exit".format(self._identifier))
            return

        # loop still runs on context exit -> __cleanup
        if self.verbose > 1:
            print("{}: is still running on context exit".format(self._identifier))
        self.__cleanup()

    def __cleanup(self):
        """
        Wait at most twice as long as the given repetition interval
        for the _wrapper_function to terminate.

        If after that time the _wrapper_function has not terminated,
        send SIGTERM to the process.

        Wait at most five times as long as the given repetition interval
        for the _wrapper_function to terminate.

        If the process is still running, send SIGKILL automatically if
        auto_kill_on_last_resort was set True or ask the
        user to confirm sending SIGKILL.
        """
        # set run to False and wait some time -> see what happens
        self.run = False
        if check_process_termination(proc=self._proc,
                                     identifier=self._identifier,
                                     timeout=2*self.interval,
                                     verbose=self.verbose,
                                     auto_kill_on_last_resort=self._auto_kill_on_last_resort):
            if self.verbose > 1:
                print("{}: cleanup successful".format(self._identifier))
            self._proc = None

    @staticmethod
    def _wrapper_func(func, args, shared_mem_run, interval, verbose, sigint, sigterm, name):
        """to be executed as a separate process (that's why this function is declared static)
        """
        # implement the process specific signal handler
        identifier = get_identifier(name)
        SIG_handler_Loop(shared_mem_run, sigint, sigterm, identifier, verbose)

        while shared_mem_run.value:
            try:
                quit_loop = func(*args)
            except:
                err, val, trb = sys.exc_info()
                if verbose > 0:
                    print("{}: error {} occurred in Loop class calling 'func(*args)'".format(identifier, err))
                    traceback.print_tb(trb)
                return

            if quit_loop:
                return
            time.sleep(interval)

        if verbose > 1:
            print("{}: _wrapper_func terminates gracefully".format(identifier))

    def start(self):
        """
        uses multiprocessing.Process to call _wrapper_func in a subprocess
        """

        if self.is_alive():
            if self.verbose > 0:
                print("{}: is already running".format(self._identifier))
            return

        self.run = True
        self._proc = mp.Process(target = Loop._wrapper_func,
                                args = (self.func, self.args, self._run, self.interval,
                                        self.verbose, self._sigint, self._sigterm, self._name),
                                name=self._name)
        self._proc.start()
        self._identifier = get_identifier(self._name, self.getpid())
        if self.verbose > 1:
            print("{}: started as new loop process".format(self._identifier))

    def stop(self):
        """
        stops the process triggered by start

        Sets the shared memory boolean run to False, which should prevent
        the loop from repeating. Calls __cleanup to make sure the process
        stopped. After that start() may be triggered again.
        """
        self.run = False
        if not self.is_alive():
            if self.verbose > 0:
                print("PID None: there is no running loop to stop")
            return

        self.__cleanup()

    def join(self, timeout):
        """
        calls join for the spawned process with given timeout
        """
        if self.is_alive():
            return psutil.Process(self._proc.pid).wait(timeout)

    def getpid(self):
        """
        return the process id of the spawned process
        """
        return self._proc.pid

    def is_alive(self):
        if self._proc is None:
            return False
        else:
            return self._proc.is_alive()

    @property
    def run(self):
        """
        makes the shared memory boolean accessible as class attribute

        Setting run to False stops the loop from repeating.
        Calling start will set run True and start the loop again as a new process.
        """
        return self._run.value
    @run.setter
    def run(self, run):
        self._run.value = run

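# Minimal usage sketch for Loop (illustrative; 'heartbeat' is a hypothetical
# function, any callable returning False to keep looping works the same way):
#
#     def heartbeat():
#         print("still alive")
#         return False                    # returning True would end the loop
#
#     with Loop(func=heartbeat, interval=1, verbose=1) as loop:
#         loop.start()
#         time.sleep(5)                   # heartbeat prints about 5 times
#     # leaving the 'with' block guarantees the loop process is shut down
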
def UnsignedIntValue(val=0):
    return mp.Value('I', val, lock=True)

class StatusBar(Loop):
    """
    status bar in ascii art

    Uses the Loop class to implement a repeating function that shows the progress
    based on the two shared memory values max_count and count. max_count is
    assumed to be the final state, whereas count represents the current state.

    The estimated time of arrival (ETA) will be calculated from a speed measurement
    given by the average over the last speed_calc_cycles calls of the looping
    function show_stat.
    """

    def __init__(self,
                 count,
                 max_count,
                 interval=1,
                 speed_calc_cycles=10,
                 width='auto',
                 verbose=0,
                 sigint='stop',
                 sigterm='stop',
                 name='statusbar'):
        """
        Use the inherited method start() to start the show_stat loop;
        stop() will stop showing the status bar.

        count [mp.Value] - shared memory to hold the current state

        max_count [mp.Value] - shared memory holding the final state, may be changed
          by an external process without having to explicitly tell this class

        interval [int] - seconds to wait between progress prints

        speed_calc_cycles [int] - use the current (time, count) as
          well as the (old_time, old_count) read by the show_stat function
          speed_calc_cycles calls before to calculate the speed as follows:
          s = (count - old_count) / (time - old_time)

        width [int/'auto'] - the number of characters used to show the status bar,
          use 'auto' to determine the width from terminal information -> see __set_width

        verbose, sigint, sigterm -> see Loop class
        """

        assert isinstance(count, mp.sharedctypes.Synchronized)
        assert isinstance(max_count, mp.sharedctypes.Synchronized)

        self.start_time = mp.Value('d', time.time())
        self.speed_calc_cycles = speed_calc_cycles
        self.q = mp.Queue()         # queue to save the last speed_calc_cycles
                                    # (time, count) information to calculate speed
        self.max_count = max_count  # multiprocessing value type
        self.count = count          # multiprocessing value type
        self.interval = interval
        self.verbose = verbose
        self.name = name
        self.__set_width(width)

        # setup loop class
        super().__init__(func=StatusBar.show_stat,
                         args=(self.count,
                               self.start_time,
                               self.max_count,
                               self.width,
                               self.speed_calc_cycles,
                               self.q),
                         interval=interval,
                         verbose=verbose,
                         sigint=sigint,
                         sigterm=sigterm,
                         name=name,
                         auto_kill_on_last_resort=True)

    def __exit__(self, *exc_args):
        super().__exit__(*exc_args)
        StatusBar.show_stat(count=self.max_count,
                            start_time=self.start_time,
                            max_count=self.max_count,
                            width=self.width,
                            speed_calc_cycles=self.speed_calc_cycles,
                            q=self.q)
        print()

    def __set_width(self, width):
        """
        set the number of characters to be used to display the status bar

        if set to 'auto' try to determine the width of the terminal used
        (experimental, depends on the terminal used, OS dependent),
        use a width of 80 as fallback.
        """
        if width == 'auto':
            try:
                hw = struct.unpack('hh', fcntl.ioctl(sys.stdin, termios.TIOCGWINSZ, '1234'))
                self.width = hw[1]
            except:
                if self.verbose > 0:
                    print("{}: failed to determine the width of the terminal".format(get_identifier(name=self.name)))
                self.width = 80
        else:
            self.width = width

    def set_args(self, interval=1, speed_calc_cycles=10, width='auto'):
        """
        change some of the init arguments

        This will stop the loop, set changes and start the loop again.
        """
        self.stop()
        self.interval = interval
        self.speed_calc_cycles = speed_calc_cycles
        self.__set_width(width)

        self.args = (self.count, self.start_time, self.max_count,
                     self.width, self.speed_calc_cycles, self.q)
        self.start()

    @staticmethod
    def show_stat(count, start_time, max_count, width, speed_calc_cycles, q):
        """
        the actual routine to bring the status to the screen
        """
        count_value = count.value
        max_count_value = max_count.value
        if count_value == 0:
            start_time.value = time.time()
            print("\rwait for first count ...", end='', flush=True)
            return False
        else:
            current_time = time.time()
            start_time_value = start_time.value
            q.put((count_value, current_time))

            if q.qsize() > speed_calc_cycles:
                old_count_value, old_time = q.get()
            else:
                old_count_value, old_time = 0, start_time_value

            tet = (current_time - start_time_value)
            speed = (count_value - old_count_value) / (current_time - old_time)
            if speed == 0:
                s3 = "] ETA --"
            else:
                eta = math.ceil((max_count_value - count_value) / speed)
                s3 = "] ETA {}".format(humanize_time(eta))

            s1 = "\r{} [{}] [".format(humanize_time(tet), humanize_speed(speed))

            l = len(s1) + len(s3)
            l2 = width - l - 1

            a = int(l2 * count_value / max_count_value)
            b = l2 - a
            s2 = "="*a + ">" + " "*b
            print(s1+s2+s3, end='', flush=True)
            return count_value >= max_count_value

    def stop(self):
        print()
        super().stop()

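# Minimal usage sketch for StatusBar (illustrative): count and max_count are
# shared memory values, so any process may increment count while the bar runs.
#
#     count = UnsignedIntValue(0)
#     max_count = UnsignedIntValue(100)
#     with StatusBar(count=count, max_count=max_count, interval=1) as sb:
#         sb.start()
#         for i in range(100):
#             time.sleep(0.05)            # stand-in for real work
#             with count.get_lock():
#                 count.value += 1
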
def setup_SIG_handler_manager():
    """
    When a process calls this function, its signal handler
    will be set to ignore the signals given by the list signals.

    This function is passed to the SyncManager start routine (used
    by JobManager_Server) to enable graceful termination when receiving
    SIGINT or SIGTERM.

    The problem is that the SyncManager start routine triggers a new
    process to provide shared memory objects even over network communication.
    Since the SIGINT signal will be passed to all child processes, the default
    handling would make the SyncManager halt on a KeyboardInterrupt Exception.

    As we want to shut down the SyncManager at the last stage of cleanup
    we have to prevent this default signal handling by passing this function
    to the SyncManager start routine.
    """
    Signal_to_SIG_IGN(signals=[signal.SIGINT, signal.SIGTERM], verbose=0)

class Signal_to_SIG_IGN(object):
    def __init__(self, signals=[signal.SIGINT, signal.SIGTERM], verbose=0):
        self.verbose = verbose
        for s in signals:
            signal.signal(s, self._handler)

    def _handler(self, sig, frame):
        if self.verbose > 0:
            print("PID {}: received signal {} -> will be ignored".format(os.getpid(), signal_dict[sig]))

class Signal_to_sys_exit(object):
    def __init__(self, signals=[signal.SIGINT, signal.SIGTERM], verbose=0):
        self.verbose = verbose
        for s in signals:
            signal.signal(s, self._handler)

    def _handler(self, signal, frame):
        if self.verbose > 0:
            print("PID {}: received signal {} -> call sys.exit -> raise SystemExit".format(os.getpid(), signal_dict[signal]))
        sys.exit('exit due to signal {}'.format(signal_dict[signal]))

class Signal_to_terminate_process_list(object):
    """
    SIGINT and SIGTERM will call terminate for each process given in process_list
    """
    def __init__(self, process_list, signals = [signal.SIGINT, signal.SIGTERM], verbose=0):
        self.process_list = process_list
        self.verbose = verbose
        for s in signals:
            signal.signal(s, self._handler)

    def _handler(self, signal, frame):
        if self.verbose > 0:
            print("PID {}: received sig {} -> terminate all given subprocesses".format(os.getpid(), signal_dict[signal]))
        for p in self.process_list:
            p.terminate()

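# Illustrative note: these three helper classes install their handler on
# construction, so simply instantiating them changes the signal behavior of
# the calling process, e.g.
#
#     Signal_to_sys_exit(signals=[signal.SIGTERM, signal.SIGINT])
#
# maps SIGINT/SIGTERM to a SystemExit exception, which a surrounding
# try/except (as in JobManager_Server.start) can treat as a graceful stop.
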
class JobManager_Server(object):
    """general usage:

    - init the JobManager_Server, start the SyncManager server process

    - pass the arguments to be processed to the JobManager_Server
      (put_arg, args_from_list)

    - start the JobManager_Server (start), which means to wait for incoming
      results and to process them. Afterwards process all obtained data.

    The default behavior of handling each incoming new result is to simply
    add the pair (arg, result) to the final_result list.

    When finished the default final processing is to dump the
    final_result list to the file given by fname_dump.

    To change this behavior you may subclass the JobManager_Server
    and implement
        - an extended __init__ to change the type of the final_result attribute
        - process_new_result
        - process_final_result(self)

    In case of any exceptions the JobManager_Server will call process_final_result
    and dump the unprocessed job_q as a list to the file given by fname_dump.

    Also the signal SIGTERM is caught. In such a case it will raise a SystemExit
    exception which will then be handled in the try except clause.

    SystemExit and KeyboardInterrupt exceptions are not considered as failure. They are
    rather methods to shut down the Server gracefully. Therefore in such cases no
    traceback will be printed.

    All other exceptions are probably due to some failure in the function. A traceback
    is printed to stderr.

    notes:
        - when the JobManager_Server gets killed (SIGKILL) and the SyncManager still
          lives, the port used will remain occupied. Consider 'sudo netstat -pna | grep 42524'
          to find the process still using the port.

        - also the SyncManager ignores SIGTERM and SIGINT signals so you have to send
          a SIGKILL.
    """

    def __init__(self,
                 authkey,
                 const_arg=None,
                 port=42524,
                 verbose=1,
                 msg_interval=1,
                 fname_dump='auto',
                 speed_calc_cycles=50):
        """
        authkey [string] - authentication key used by the SyncManager.
          Server and Client must have the same authkey.

        const_arg [dict] - some constant keyword arguments additionally passed to the
          worker function (see JobManager_Client).

        port [int] - network port to use

        verbose [int] - 0: quiet, 1: status only, 2: debug messages

        msg_interval [int] - number of seconds for the status bar to update

        fname_dump [string/None] - sets the file name used to dump the current
          state (final_result, unprocessed and failed arguments) on shutdown.
          (None: do not dump, 'auto': choose the filename
          '<authkey>_YYYY_MM_DD_hh_mm_ss.dump')

        speed_calc_cycles [int] - number of status bar cycles used to average
          the speed measurement (see StatusBar)

        This init actually starts the SyncManager as a new process. As a next step
        the job_q has to be filled, see put_arg().
        """
        self.verbose = verbose
        self._pid = os.getpid()
        self._pid_start = None
        self._identifier = get_identifier(name=self.__class__.__name__, pid=self._pid)
        if self.verbose > 1:
            print("{}: I'm the JobManager_Server main process".format(self._identifier))

        self.__wait_before_stop = 2

        self.port = port
        self.authkey = bytearray(authkey, encoding='utf8')
        self.const_arg = copy.copy(const_arg)
        self.fname_dump = fname_dump
        self.msg_interval = msg_interval
        self.speed_calc_cycles = speed_calc_cycles

        # to do some redundant checking, might be removed
        # the args_set holds all arguments to be processed
        # in contrast to the job_q, an argument will only be removed
        # from the set if it was caught by the result_q
        # so if and only if all results have been processed successfully,
        # the args_set will be empty
        self.args_set = set()

        # thread safe integer values
        self._numresults = mp.Value('i', 0)  # count the successfully processed jobs
        self._numjobs = mp.Value('i', 0)     # overall number of jobs

        # final result as list, other types can be achieved by subclassing
        self.final_result = []

        # NOTE: it only works using multiprocessing.Queue()
        # the Queue class from the module queue does NOT work
        self.job_q = mp.Queue()     # queue holding args to process
        self.result_q = mp.Queue()  # queue holding returned results
        self.fail_q = mp.Queue()    # queue holding args where processing failed
        self.manager = None

        self.__start_SyncManager()

    def __stop_SyncManager(self):
        if self.manager is None:
            return

        manager_proc = self.manager._process
        manager_identifier = get_identifier(name='SyncManager')

        # stop SyncManager
        self.manager.shutdown()

        check_process_termination(proc=manager_proc,
                                  identifier=manager_identifier,
                                  timeout=2,
                                  verbose=self.verbose,
                                  auto_kill_on_last_resort=True)

    def __start_SyncManager(self):
        class JobQueueManager(SyncManager):
            pass

        # make job_q, result_q, fail_q, const_arg available via network
        JobQueueManager.register('get_job_q', callable=lambda: self.job_q)
        JobQueueManager.register('get_result_q', callable=lambda: self.result_q)
        JobQueueManager.register('get_fail_q', callable=lambda: self.fail_q)
        JobQueueManager.register('get_const_arg', callable=lambda: self.const_arg, exposed=["__iter__"])

        address=('', self.port)   # ip='' means listen on all interfaces
        authkey=self.authkey

        self.manager = JobQueueManager(address, authkey)

        # start manager with non default signal handling given by
        # the additional init function setup_SIG_handler_manager
        self.manager.start(setup_SIG_handler_manager)
        self.hostname = socket.gethostname()

        if self.verbose > 1:
            print("{}: started on {}:{} with authkey '{}'".format(get_identifier('SyncManager', self.manager._process.pid),
                                                                  self.hostname,
                                                                  self.port,
                                                                  str(authkey, encoding='utf8')))

    def __restart_SyncManager(self):
        self.__stop_SyncManager()
        self.__start_SyncManager()

    def __enter__(self):
        return self

    def __exit__(self, err, val, trb):
        # KeyboardInterrupt via SIGINT will be mapped to SystemExit
        # SystemExit is considered non erroneous
        if err == SystemExit:
            if self.verbose > 0:
                print("{}: normal shutdown caused by SystemExit".format(self._identifier))
            # no exception traceback will be printed
        elif err is not None:
            # causes exception traceback to be printed
            traceback.print_exception(err, val, trb)

        # bring everything down, dump status to file
        self.shutdown()
        return True

    @property
    def numjobs(self):
        return self._numjobs.value
    @numjobs.setter
    def numjobs(self, numjobs):
        self._numjobs.value = numjobs

    @property
    def numresults(self):
        return self._numresults.value
    @numresults.setter
    def numresults(self, numresults):
        self._numresults.value = numresults

    def shutdown(self):
        """stop all spawned processes and clean up

        - call process_final_result to handle all collected results
        - if job_q is not empty dump the remaining job_q
        """
        # start also makes sure that it was not called from a subprocess,
        # so with the default behavior this assertion will always be True
        assert self._pid == os.getpid()

        self.__stop_SyncManager()

        # do user defined final processing
        self.process_final_result()

        if self.fname_dump is not None:

            if self.verbose > 0:
                print("{}: dump current state ... ".format(self._identifier), end='', flush=True)

            if self.fname_dump == 'auto':
                fname = "{}_{}.dump".format(self.authkey.decode('utf8'), getDateForFileName(includePID=False))
            else:
                fname = self.fname_dump

            with open(fname, 'wb') as f:
                self.__dump(f)

            if self.verbose > 0:
                print("done!")
        else:
            if self.verbose > 0:
                print("{}: fname_dump == None, ignore dumping current state!".format(self._identifier))

        print("{}: JobManager_Server was successfully shut down".format(self._identifier))

    @staticmethod
    def static_load(f):
        data = {}
        data['numjobs'] = pickle.load(f)
        data['numresults'] = pickle.load(f)
        data['final_result'] = pickle.load(f)
        data['args_set'] = pickle.load(f)

        fail_list = pickle.load(f)
        data['fail_set'] = {fail_item[0] for fail_item in fail_list}

        data['fail_q'] = mp.Queue()
        data['job_q'] = mp.Queue()

        for fail_item in fail_list:
            data['fail_q'].put_nowait(fail_item)
        for arg in (data['args_set'] - data['fail_set']):
            data['job_q'].put_nowait(arg)

        return data

    def __load(self, f):
        data = JobManager_Server.static_load(f)
        for key in ['numjobs', 'numresults', 'final_result',
                    'args_set', 'fail_q', 'job_q']:
            self.__setattr__(key, data[key])

    def __dump(self, f):
        pickle.dump(self.numjobs, f, protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.numresults, f, protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.final_result, f, protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.args_set, f, protocol=pickle.HIGHEST_PROTOCOL)
        fail_list = []
        try:
            while True:
                fail_list.append(self.fail_q.get_nowait())
        except queue.Empty:
            pass
        pickle.dump(fail_list, f, protocol=pickle.HIGHEST_PROTOCOL)

    def read_old_state(self, fname_dump=None):

        if fname_dump is None:
            fname_dump = self.fname_dump
        if fname_dump == 'auto':
            raise RuntimeError("fname_dump must not be 'auto' when reading old state")

        if not os.path.isfile(fname_dump):
            raise RuntimeError("file '{}' to read old state from not found".format(fname_dump))

        with open(fname_dump, 'rb') as f:
            self.__load(f)
        self.__restart_SyncManager()

    def put_arg(self, a):
        """add argument a to the job_q
        """
        if (not hasattr(a, '__hash__')) or (a.__hash__ is None):
            # try to add hashability
            if isinstance(a, dict):
                a = hashDict(a)
            else:
                raise AttributeError("'{}' is not hashable".format(type(a)))

        self.args_set.add(copy.copy(a))
        self.job_q.put(copy.copy(a))

        with self._numjobs.get_lock():
            self._numjobs.value += 1

    def args_from_list(self, args):
        """add all arguments from the list args to the job_q
        """
        for a in args:
            self.put_arg(a)

    def process_new_result(self, arg, result):
        """Will be called when the result_q has data available.
        result is the computed result for the argument arg.

        Should be overridden when subclassing!
        """
        self.final_result.append((arg, result))

    def process_final_result(self):
        """to implement user defined final processing"""
        pass

    def start(self):
        """
        starts to loop over incoming results

        When finished, or on exception, call shutdown() afterwards to shut down gracefully.
        """
        if self._pid != os.getpid():
            raise RuntimeError("do not run JobManager_Server.start() in a subprocess")

        if (self.numjobs - self.numresults) != len(self.args_set):
            raise RuntimeError("inconsistency detected! use JobManager_Server.put_arg to put arguments to the job_q")

        if self.numjobs == 0:
            raise RuntimeError("no jobs to process! use JobManager_Server.put_arg to put arguments to the job_q")

        Signal_to_sys_exit(signals=[signal.SIGTERM, signal.SIGINT], verbose = self.verbose)
        pid = os.getpid()

        if self.verbose > 1:
            print("{}: start processing incoming results".format(self._identifier))

        with StatusBar(count = self._numresults, max_count = self._numjobs,
                       interval=self.msg_interval, speed_calc_cycles=self.speed_calc_cycles,
                       verbose = self.verbose, sigint='ign', sigterm='ign') as stat:
            stat.start()

            while (len(self.args_set) - self.fail_q.qsize()) > 0:
                try:
                    arg, result = self.result_q.get(timeout=1)
                    self.args_set.remove(arg)
                    self.process_new_result(arg, result)
                    self.numresults = self.numjobs - (len(self.args_set) - self.fail_q.qsize())
                except queue.Empty:
                    pass

        if self.verbose > 1:
            print("{}: wait {}s before triggering clean up".format(self._identifier, self.__wait_before_stop))
        time.sleep(self.__wait_before_stop)

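# Sketch of a subclassed server (illustrative; 'Square_Server' and its result
# handling are hypothetical, not part of this module):
#
#     class Square_Server(JobManager_Server):
#         def process_new_result(self, arg, result):
#             # collect bare results instead of (arg, result) pairs
#             self.final_result.append(result)
#
#         def process_final_result(self):
#             print("collected {} results".format(len(self.final_result)))
#
#     with Square_Server(authkey='square') as server:
#         server.args_from_list(range(1000))
#         server.start()
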
class JobManager_Client(object):
    """
    Calls the function self.func with arguments fetched from the job_q. You should
    subclass this class and overwrite func to handle your own function.

    The job_q is provided by a SyncManager which connects to the SyncManager set up
    by the JobManager_Server.

    Spawns nproc subprocesses (__worker_func) to process arguments.
    Each subprocess gets an argument from the job_q, processes it
    and puts the result to the result_q.

    If the job_q is empty, the subprocess terminates.

    In case of any failure detected within the try except clause,
    the argument which just failed to process, the error and the
    hostname are put to the fail_q so the JobManager_Server can take care of that.

    After that the traceback is written to a file with name
    traceback_err_<err>_<YYYY>_<MM>_<DD>_<hh>_<mm>_<ss>_<PID>.trb.

    Then the process will terminate.
    """

    def __init__(self,
                 server,
                 authkey,
                 port = 42524,
                 nproc = 0,
                 nice=19,
                 no_warnings=False,
                 verbose=1):
        """
        server [string] - ip address or hostname where the JobManager_Server is running

        authkey [string] - authentication key used by the SyncManager.
          Server and Client must have the same authkey.

        port [int] - network port to use

        nproc [integer] - number of subprocesses to start

            positive integer: number of processes to spawn

            zero: number of spawned processes == number of cpu cores

            negative integer: number of spawned processes == number of cpu cores - |nproc|

        nice [integer] - niceness of the subprocesses

        no_warnings [bool] - call warnings.filterwarnings("ignore") -> all warnings are ignored

        verbose [int] - 0: quiet, 1: status only, 2: debug messages
        """

        self.verbose = verbose
        self._pid = os.getpid()
        self._identifier = get_identifier(name=self.__class__.__name__, pid=self._pid)
        if self.verbose > 1:
            print("{}: init".format(self._identifier))

        if no_warnings:
            import warnings
            warnings.filterwarnings("ignore")
            if self.verbose > 1:
                print("{}: ignore all warnings".format(self._identifier))
        self.server = server
        self.authkey = bytearray(authkey, encoding='utf8')
        self.port = port
        self.nice = nice
        if nproc > 0:
            self.nproc = nproc
        else:
            self.nproc = mp.cpu_count() + nproc
            assert self.nproc > 0

        self.procs = []

        self.manager_objects = self.get_manager_objects()

    def get_manager_objects(self):
        return JobManager_Client._get_manager_objects(self.server,
                                                      self.port,
                                                      self.authkey,
                                                      self._identifier,
                                                      self.verbose)

    @staticmethod
    def _get_manager_objects(server, port, authkey, identifier, verbose=0):
        """
        connects to the server and gets the registered shared objects
        job_q, result_q, fail_q, const_arg
        """
        class ServerQueueManager(SyncManager):
            pass

        ServerQueueManager.register('get_job_q')
        ServerQueueManager.register('get_result_q')
        ServerQueueManager.register('get_fail_q')
        ServerQueueManager.register('get_const_arg', exposed="__iter__")

        manager = ServerQueueManager(address=(server, port), authkey=authkey)

        if verbose > 0:
            print('{}: connecting to {}:{} authkey {}... '.format(identifier, server, port, authkey.decode('utf8')), end='', flush=True)
        try:
            manager.connect()
        except:
            if verbose > 0:
                print('failed!')

            err, val, trb = sys.exc_info()
            print("caught exception {}: {}".format(err.__name__, val))

            if err == ConnectionRefusedError:
                print("check if the server is up!")

            if verbose > 1:
                traceback.print_exception(err, val, trb)
            return None
        else:
            if verbose > 0:
                print('done!')

        job_q = manager.get_job_q()
        if verbose > 1:
            print("{}: found job_q with {} jobs".format(identifier, job_q.qsize()))

        result_q = manager.get_result_q()
        fail_q = manager.get_fail_q()
        const_arg = manager.get_const_arg()
        return job_q, result_q, fail_q, const_arg

    @staticmethod
    def func(arg, const_arg):
        """
        function to be called by the worker processes

        arg - provided by the job_q of the JobManager_Server

        const_arg - tuple of constant arguments also provided by the JobManager_Server

        NOTE: This is just some dummy implementation to be used for test reasons only!
        Subclass and overwrite this function to implement your own function.
        """
        time.sleep(0.1)
        return os.getpid()

    @staticmethod
    def __worker_func(func, nice, verbose, server, port, authkey, i, manager_objects=None):
        """
        the worker wrapper, spawned nproc times, calling and handling func
        """
        identifier = get_identifier(name='worker{}'.format(i+1))
        Signal_to_sys_exit(signals=[signal.SIGTERM, signal.SIGINT])

        if manager_objects is None:
            manager_objects = JobManager_Client._get_manager_objects(server, port, authkey, identifier, verbose)
            if manager_objects is None:
                if verbose > 1:
                    print("{}: no shared object received, terminate!".format(identifier))
                sys.exit(1)

        job_q, result_q, fail_q, const_arg = manager_objects

        n = os.nice(0)
        n = os.nice(nice - n)
        c = 0
        if verbose > 1:
            print("{}: now alive, niceness {}".format(identifier, n))

        time_queue = 0
        time_calc = 0

        while True:
            try:
                t0 = time.perf_counter()
                arg = job_q.get(block = True, timeout = 0.1)
                t1 = time.perf_counter()
                res = func(arg, const_arg)
                t2 = time.perf_counter()
                result_q.put((arg, res))
                t3 = time.perf_counter()
                c += 1

                time_queue += (t1-t0 + t3-t2)
                time_calc += (t2-t1)

            # regular case, just stop working when an empty job_q was found
            except queue.Empty:
                if verbose > 0:
                    print("{}: finds empty job queue, processed {} jobs".format(identifier, c))
                break

            # in this context usually raised if the communication to the server fails
            except EOFError:
                if verbose > 0:
                    print("{}: EOFError, I guess the server went down, can't do anything, terminate now!".format(identifier))
                break

            # considered a normal exit caused by some user interaction, SIGINT, SIGTERM
            # note SIGINT, SIGTERM -> SystemExit is achieved by overwriting the
            # default signal handlers
            except SystemExit:
                if verbose > 0:
                    print("{}: SystemExit, quit processing, reinsert current argument".format(identifier))
                try:
                    if verbose > 1:
                        print("{}: put argument back to job_q ... ".format(identifier), end='', flush=True)
                    job_q.put(arg, timeout=10)
                except queue.Full:
                    if verbose > 0:
                        print("{}: failed to reinsert argument, Server down? I quit!".format(identifier))
                else:
                    if verbose > 1:
                        print("done!")
                break

            # some unexpected Exception
            # write argument, exception name and hostname to fail_q, save traceback
            # continue working
            except:
                err, val, trb = sys.exc_info()
                if verbose > 0:
                    print("{}: caught exception '{}', report failure of current argument to server ... ".format(identifier, err.__name__), end='', flush=True)

                hostname = socket.gethostname()
                fname = 'traceback_err_{}_{}.trb'.format(err.__name__, getDateForFileName(includePID=True))
                fail_q.put((arg, err.__name__, hostname), timeout=10)

                if verbose > 0:
                    print("done")
                    print(" write exception to file {} ... ".format(fname), end='', flush=True)

                with open(fname, 'w') as f:
                    traceback.print_exception(etype=err, value=val, tb=trb, file=f)

                if verbose > 0:
                    print("done")
                    print(" continue processing next argument.")

        if verbose > 0:
            try:
                print("{}: calculation:{:.2%} communication:{:.2%}".format(identifier, time_calc/(time_calc+time_queue), time_queue/(time_calc+time_queue)))
            except:
                pass
        if verbose > 1:
            print("{}: JobManager_Client.__worker_func terminates".format(identifier))

    def start(self):
        """
        starts a number of nproc subprocesses to work on the job_q

        SIGTERM and SIGINT are managed to terminate all subprocesses

        returns when all subprocesses have terminated
        """
        if self.verbose > 1:
            print("{}: start {} processes to work on the remote queue".format(self._identifier, self.nproc))
        for i in range(self.nproc):
            p = mp.Process(target=self.__worker_func, args=(self.func,
                                                            self.nice,
                                                            self.verbose,
                                                            self.server,
                                                            self.port,
                                                            self.authkey,
                                                            i,
                                                            self.manager_objects))
            self.procs.append(p)
            p.start()
            time.sleep(0.3)

        Signal_to_terminate_process_list(process_list = self.procs, verbose=self.verbose)

        for p in self.procs:
            p.join()
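
# Sketch of a subclassed client (illustrative; 'Square_Client' is hypothetical,
# any picklable return value works):
#
#     class Square_Client(JobManager_Client):
#         @staticmethod
#         def func(arg, const_arg):
#             return arg**2               # replace with the real computation
#
#     client = Square_Client(server='localhost', authkey='square', nproc=0)
#     client.start()                      # blocks until the job_q is empty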