|
|
|
import enum
|
|
|
|
import logging
|
|
|
|
|
|
|
|
import cloudpickle
|
|
|
|
|
|
|
|
# Module-level logger, forced to DEBUG at import time so that the debug
# output produced in this module (e.g. Operator.print) is always emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
|
|
|
|
|
|
|
|
|
|
|
|
# Stream partitioning schemes
|
|
|
|
class PScheme:
    """Partitioning scheme of a data stream.

    Couples a partitioning strategy with an optional user-defined
    partitioning function (presumably only meaningful for custom
    strategies — verify against callers).
    """

    def __init__(self, strategy, partition_fn=None):
        self.strategy = strategy          # How records are routed downstream
        self.partition_fn = partition_fn  # Optional user-defined partitioner

    def __repr__(self):
        return "({},{})".format(self.strategy, self.partition_fn)
|
|
|
|
|
|
|
|
|
|
|
|
# Partitioning strategies
|
|
|
|
class PStrategy(enum.Enum):
    """Partitioning strategies for routing records between connected
    dataflow operators.

    The numeric values are stable identifiers only; they carry no
    ordering semantics.
    """
    Forward = 0  # Default
    Shuffle = 1
    Rescale = 2
    RoundRobin = 3
    Broadcast = 4
    Custom = 5  # presumably paired with PScheme.partition_fn — verify with callers
    ShuffleByKey = 6
    # ... (further strategies may be added here)
|
|
|
|
|
|
|
|
|
|
|
|
# Operator types
|
|
|
|
class OpType(enum.Enum):
    """Types of logical dataflow operators.

    The numeric values are stable identifiers only; they carry no
    ordering semantics.
    """
    Source = 0
    Map = 1
    FlatMap = 2
    Filter = 3
    TimeWindow = 4
    KeyBy = 5
    Sink = 6
    WindowJoin = 7
    Inspect = 8
    ReadTextFile = 9
    Reduce = 10
    Sum = 11
    # ... (further operator types may be added here)
|
|
|
|
|
|
|
|
|
|
|
|
# A logical dataflow operator
|
|
|
|
class Operator:
    """A logical dataflow operator.

    Bundles everything needed to instantiate the operator: an id, its
    type, the processor class that executes it, the (pickled) user
    logic, the desired number of instances, and one partitioning
    strategy per output stream.
    """

    def __init__(self,
                 id,
                 op_type,
                 processor_class,
                 name="",
                 logic=None,
                 num_instances=1,
                 other=None,
                 state_actor=None):
        self.id = id
        self.name = name
        self.type = op_type
        self.processor_class = processor_class
        # The user logic is stored pickled; it is unpickled on demand
        # through the `logic` property.
        self._logic = cloudpickle.dumps(logic)
        self.num_instances = num_instances
        # One partitioning strategy per downstream operator
        # (default: forward)
        self.partitioning_strategies = {}
        self.other_args = other          # Depends on the type of the operator
        self.state_actor = state_actor   # Actor to query state

    def _set_partition_strategy(self,
                                stream_id,
                                partitioning_scheme,
                                dest_operator=None):
        """Registers the partitioning scheme (and optional destination
        operator) for one output stream of the operator."""
        self.partitioning_strategies[stream_id] = (partitioning_scheme,
                                                   dest_operator)

    def _get_partition_strategy(self, stream_id):
        """Returns the (scheme, destination) pair registered for the
        given output stream, or None if no strategy has been defined
        for that stream."""
        return self.partitioning_strategies.get(stream_id)

    def _clean(self):
        """Drops all partitioning strategies that lack a destination
        operator and re-keys the valid entries as
        'destination operator id -> partitioning scheme'.

        Should be called only after the logical dataflow has been
        constructed. If several streams target the same destination,
        the first registered strategy wins.
        """
        cleaned = {}
        for scheme, destination in self.partitioning_strategies.values():
            if destination is None:
                continue
            if destination not in cleaned:
                cleaned[destination] = scheme
        self.partitioning_strategies = cleaned

    def print(self):
        """Logs a human-readable description of the operator at DEBUG
        level. (Name kept for API compatibility although it shadows the
        builtin.)"""
        template = ("Operator<\nID = {}\nName = {}\nprocessor_class = {}\n"
                    "Logic = {}\nNumber_of_Instances = {}\n"
                    "Partitioning_Scheme = {}\nOther_Args = {}>\n")
        logger.debug(
            template.format(self.id, self.name, self.processor_class,
                            self.logic, self.num_instances,
                            self.partitioning_strategies, self.other_args))

    @property
    def logic(self):
        """The operator's user logic, unpickled on each access."""
        return cloudpickle.loads(self._logic)
|