# ray/streaming/python/examples/wordcount.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import logging
import time

import ray
import wikipedia
from ray.streaming.streaming import Environment

# Configure the root logger at INFO level and create this module's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Command-line interface: the titles file is the only (mandatory) option
parser = argparse.ArgumentParser()
parser.add_argument(
    "--titles-file",
    help="the file containing the wikipedia titles to lookup",
    required=True,
)


# A custom data source that reads articles from wikipedia
# Custom data sources need to implement a get_next() method
# that returns the next data element, in this case sentences
class Wikipedia:
    """Data source that emits wikipedia articles one sentence at a time.

    Article titles are read from a text file, one title per line. Each
    title is looked up with ``wikipedia.page`` and the article content is
    split on "." to produce the emitted sentences.
    """

    def __init__(self, title_file):
        """Load the article titles that will be used as queries.

        Args:
            title_file: Path of a text file with one wikipedia title
                per line.

        Raises:
            IOError: If the file cannot be opened or read (this is
                ``OSError`` on Python 3). The error is not handled here.
        """
        # Titles in this file will be used as queries
        self.title_file = title_file
        # Read all titles inside a context manager so the file handle is
        # closed immediately instead of being leaked. Surrounding
        # whitespace (including the trailing newline) is stripped and
        # blank lines are dropped so they are never sent as queries.
        with open(self.title_file, "r") as title_fd:
            titles = [line.strip() for line in title_fd]
        self.title_reader = iter([title for title in titles if title])
        # True once every title has been consumed
        self.done = False
        # True when the current article has no sentences left, which
        # forces get_next() to fetch the next article
        self.article_done = True
        self.sentences = iter([])

    # Returns next sentence from a wikipedia article
    def get_next(self):
        """Return the next sentence, or None when the source is exhausted.

        Sentences are the pieces obtained by splitting the article content
        on ".", so they can be empty or contain only whitespace. When the
        current article runs out, the next title is fetched. Exceptions
        raised by ``wikipedia.page`` are not caught here.

        Returns:
            The next sentence as a string, or None once all titles have
            been consumed.
        """
        if self.done:
            return None  # Source exhausted
        while True:
            if self.article_done:
                try:  # Try next title
                    next_title = next(self.title_reader)
                except StopIteration:
                    self.done = True  # Source exhausted
                    return None
                # Get next article
                logger.debug("Next article: {}".format(next_title))
                article = wikipedia.page(next_title).content
                # Split article in sentences
                self.sentences = iter(article.split("."))
                self.article_done = False
            try:  # Try next sentence
                sentence = next(self.sentences)
            except StopIteration:
                # Current article is finished; loop to fetch the next one
                self.article_done = True
            else:
                logger.debug("Next sentence: {}".format(sentence))
                return sentence


def splitter(line):
    """Split a line on whitespace and emit one (word, 1) record per word.

    Args:
        line: The input text line.

    Returns:
        A list of (word, 1) tuples in the order the words appear; the
        list is empty when the line contains no words.
    """
    return [(word, 1) for word in line.split()]


def key_selector(tuple):
    """Return the first attribute (index 0) of the given tuple."""
    key = tuple[0]
    return key


def attribute_selector(tuple):
    """Return the second attribute (index 1) of the given tuple."""
    value = tuple[1]
    return value


if __name__ == "__main__":
    # Read the command-line parameters
    args = parser.parse_args()
    titles_file = str(args.titles_file)

    ray.init()

    # Create a Ray streaming environment with the default configuration
    env = Environment()
    env.set_parallelism(2)  # Each operator will be executed by two actors

    # The dataflow below is a simple streaming wordcount
    # with a rolling sum operator.
    # It reads articles from wikipedia, splits them in words,
    # shuffles words, and counts the occurrences of each word.
    source = Wikipedia(titles_file)
    stream = (
        env.source(source)
        .round_robin()
        .flat_map(splitter)
        .key_by(key_selector)
        .sum(attribute_selector)
        .inspect(print)  # Prints the contents of the stream to stdout
    )

    start = time.time()
    env_handle = env.execute()  # Deploys and executes the dataflow
    ray.get(env_handle)  # Stay alive until execution finishes
    env.wait_finish()
    end = time.time()

    elapsed = end - start
    logger.info("Elapsed time: {} secs".format(elapsed))
    logger.debug("Output stream id: {}".format(stream.id))