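"""Streaming MapReduce word-count example built on Ray actors.

Mapper actors pull article titles from a Stream, fetch each article's text
from Wikipedia, and keep a per-article word count. Reducer actors each own a
range of initial letters and aggregate the counts for that range from every
Mapper. The driver repeatedly asks all reducers for the counts at the next
article index and prints the ten most frequent words in that batch of
articles (one new article per mapper).
"""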
import argparse
import heapq
import os
from collections import Counter, defaultdict

import numpy as np

import ray
import wikipedia

parser = argparse.ArgumentParser()
parser.add_argument("--num-mappers", type=int, default=3,
                    help="number of mapper actors used")
parser.add_argument("--num-reducers", type=int, default=4,
                    help="number of reducer actors used")


@ray.remote
class Mapper(object):
    def __init__(self, title_stream):
        self.title_stream = title_stream
        self.num_articles_processed = 0
        self.articles = []
        self.word_counts = []

    def get_new_article(self):
        # Get the next wikipedia article.
        article = wikipedia.page(self.title_stream.next()).content
        # Count the words and store the result.
        self.word_counts.append(Counter(article.split(" ")))
        self.num_articles_processed += 1

    def get_range(self, article_index, keys):
        # Process more articles if this Mapper hasn't processed enough yet.
        while self.num_articles_processed < article_index + 1:
            self.get_new_article()
        # Return the word counts from within the given character range.
        return [(k, v) for k, v in self.word_counts[article_index].items()
                if len(k) >= 1 and keys[0] <= k[0] <= keys[1]]


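# Each Reducer owns one contiguous range of initial letters and pulls the
# word counts for that range from every Mapper. Processing is pull-driven:
# a Mapper only fetches and counts a new article the first time get_range
# is called with that article's index.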
@ray.remote
class Reducer(object):
    def __init__(self, keys, *mappers):
        self.mappers = mappers
        self.keys = keys

    def next_reduce_result(self, article_index):
        word_count_sum = defaultdict(lambda: 0)
        # Get the word counts for this Reducer's keys from all of the Mappers
        # and aggregate the results.
        count_ids = [mapper.get_range.remote(article_index, self.keys)
                     for mapper in self.mappers]
        # TODO(rkn): We should process these out of order using ray.wait.
        for count_id in count_ids:
            for k, v in ray.get(count_id):
                word_count_sum[k] += v
        return word_count_sum


class Stream(object):
    # A simple source of article titles: next() returns a randomly chosen
    # element from the list of titles this stream was constructed with.
    def __init__(self, elements):
        self.elements = elements

    def next(self):
        i = np.random.randint(0, len(self.elements))
        return self.elements[i]


if __name__ == "__main__":
    args = parser.parse_args()

    ray.init()

    # Create one streaming source of articles per mapper.
    directory = os.path.dirname(os.path.realpath(__file__))
    streams = []
    for _ in range(args.num_mappers):
        with open(os.path.join(directory, "articles.txt")) as f:
            streams.append(Stream([line.strip() for line in f.readlines()]))

    # Partition the keys among the reducers.
    chunks = np.array_split([chr(i) for i in range(ord("a"), ord("z") + 1)],
                            args.num_reducers)
    keys = [[chunk[0], chunk[-1]] for chunk in chunks]
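    # With the default of 4 reducers, np.array_split splits the alphabet into
    # near-equal chunks, so keys comes out as
    # [["a", "g"], ["h", "n"], ["o", "t"], ["u", "z"]].
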
    # Create a number of mappers.
    mappers = [Mapper.remote(stream) for stream in streams]

    # Create a number of reducers, each responsible for a different range of
    # keys. This gives each Reducer actor a handle to each Mapper actor.
    reducers = [Reducer.remote(key, *mappers) for key in keys]

    article_index = 0
    while True:
        print("article index = {}".format(article_index))
        wordcounts = dict()
        counts = ray.get([reducer.next_reduce_result.remote(article_index)
                          for reducer in reducers])
        for count in counts:
            wordcounts.update(count)
        most_frequent_words = heapq.nlargest(10, wordcounts,
                                             key=wordcounts.get)
        for word in most_frequent_words:
            print("  ", word, wordcounts[word])
        article_index += 1

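# Example invocation, as a sketch (the filename below is a placeholder; this
# assumes the ray, wikipedia, and numpy packages are installed and that an
# articles.txt file with one Wikipedia article title per line sits next to
# this script):
#
#     python wordcount_example.py --num-mappers 3 --num-reducers 4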