ray/streaming/python/examples/key_selectors.py
Sven 60d4d5e1aa Remove future imports (#6724)
* Remove all __future__ imports from RLlib.

* Remove (object) again from tf_run_builder.py::TFRunBuilder.

* Fix 2xLINT warnings.

* Fix broken appo_policy import (must be appo_tf_policy)

* Remove future imports from all other ray files (not just RLlib).

* Remove future imports from all other ray files (not just RLlib).

* Remove future import blocks that contain `unicode_literals` as well.
Revert appo_tf_policy.py to appo_policy.py (belongs to another PR).

* Add two empty lines before Schedule class.

* Put back __future__ imports into determine_tests_to_run.py. Fails otherwise on a py2/print related error.
2020-01-09 00:15:48 -08:00

67 lines
2 KiB
Python

import argparse
import logging
import time
import ray
from ray.streaming.streaming import Environment
# Module-level logger for timing/diagnostic output at the end of the run.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# Command-line interface: the only argument is the text file to count words in.
parser = argparse.ArgumentParser()
parser.add_argument("--input-file", required=True, help="the input text file")
# Wrapper type used to demonstrate attribute-based key selection:
# the downstream 'key_by("word")' partitions on the 'word' attribute.
class Record:
    """A stream element wrapping a (word, count) tuple.

    Exposes the word as the `word` attribute so the streaming API can
    hash-partition records by attribute name.
    """

    def __init__(self, record):
        # Unpack the (word, count) pair; keep both the key and the raw tuple.
        word, _count = record
        self.word = word
        self.record = record
# Tokenizes one input line into Record objects, each carrying a key
# (the word itself) and the tuple (word, 1) for downstream counting.
def splitter(line):
    """Return a list with one Record((word, 1)) per whitespace-separated word."""
    return [Record((word, 1)) for word in line.split()]
# Inverse of the Record wrapper: recover the raw tuple for tuple-based
# operators further down the pipeline (e.g. 'sum(1)').
def as_tuple(record):
    """Return the underlying (word, count) tuple stored on `record`."""
    return record.record
if __name__ == "__main__":
    # Parse command-line parameters.
    arguments = parser.parse_args()
    text_file = str(arguments.input_file)

    ray.init()
    # Record is a user-defined type; register a dict-based serializer so
    # instances can cross actor boundaries.
    ray.register_custom_serializer(Record, use_dict=True)

    # Streaming environment with default configuration; each operator in
    # the dataflow is executed by two actors.
    environment = Environment()
    environment.set_parallelism(2)

    # Pipeline:
    #   read lines            -> round-robin shuffle across workers
    #   flat_map(splitter)    -> one Record per word
    #   key_by("word")        -> hash-partition on the 'word' attribute
    #   map(as_tuple)         -> back to plain (word, 1) tuples
    #   sum(1)                -> sum the second tuple element (the count)
    #   inspect(print)        -> echo the running counts to stdout
    word_counts = environment.read_text_file(text_file) \
                             .round_robin() \
                             .flat_map(splitter) \
                             .key_by("word") \
                             .map(as_tuple) \
                             .sum(1) \
                             .inspect(print)

    start_time = time.time()
    job_handle = environment.execute()  # Deploys and executes the dataflow
    ray.get(job_handle)  # Stay alive until execution finishes
    end_time = time.time()
    logger.info("Elapsed time: {} secs".format(end_time - start_time))
    logger.debug("Output stream id: {}".format(word_counts.id))