ray/streaming/python/examples/wordcount.py
2020-01-02 17:42:13 -08:00

109 lines
3.5 KiB
Python

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import logging
import time
import ray
import wikipedia
from ray.streaming.streaming import Environment
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser()
parser.add_argument(
"--titles-file",
required=True,
help="the file containing the wikipedia titles to lookup")
# A custom data source that reads articles from wikipedia
# Custom data sources need to implement a get_next() method
# that returns the next data element, in this case sentences
class Wikipedia:
def __init__(self, title_file):
# Titles in this file will be as queries
self.title_file = title_file
# TODO (john): Handle possible exception here
self.title_reader = iter(list(open(self.title_file, "r").readlines()))
self.done = False
self.article_done = True
self.sentences = iter([])
# Returns next sentence from a wikipedia article
def get_next(self):
if self.done:
return None # Source exhausted
while True:
if self.article_done:
try: # Try next title
next_title = next(self.title_reader)
except StopIteration:
self.done = True # Source exhausted
return None
# Get next article
logger.debug("Next article: {}".format(next_title))
article = wikipedia.page(next_title).content
# Split article in sentences
self.sentences = iter(article.split("."))
self.article_done = False
try: # Try next sentence
sentence = next(self.sentences)
logger.debug("Next sentence: {}".format(sentence))
return sentence
except StopIteration:
self.article_done = True
# Splits input line into words and
# outputs records of the form (word,1)
def splitter(line):
records = []
words = line.split()
for w in words:
records.append((w, 1))
return records
# Returns the first attribute of a tuple
def key_selector(tuple):
return tuple[0]
# Returns the second attribute of a tuple
def attribute_selector(tuple):
return tuple[1]
if __name__ == "__main__":
# Get program parameters
args = parser.parse_args()
titles_file = str(args.titles_file)
ray.init()
# A Ray streaming environment with the default configuration
env = Environment()
env.set_parallelism(2) # Each operator will be executed by two actors
# The following dataflow is a simple streaming wordcount
# with a rolling sum operator.
# It reads articles from wikipedia, splits them in words,
# shuffles words, and counts the occurences of each word.
stream = env.source(Wikipedia(titles_file)) \
.round_robin() \
.flat_map(splitter) \
.key_by(key_selector) \
.sum(attribute_selector) \
.inspect(print) # Prints the contents of the
# stream to stdout
start = time.time()
env_handle = env.execute() # Deploys and executes the dataflow
ray.get(env_handle) # Stay alive until execution finishes
env.wait_finish()
end = time.time()
logger.info("Elapsed time: {} secs".format(end - start))
logger.debug("Output stream id: {}".format(stream.id))