#!/usr/bin/env bash
set -euo pipefail
cd ~/efs/lm

# download the dataset
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
unzip wikitext-103-raw-v1.zip

# encode it with the GPT-2 BPE
mkdir -p gpt2_bpe
wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
wget https://raw.githubusercontent.com/pytorch/fairseq/master/examples/roberta/multiprocessing_bpe_encoder.py
for SPLIT in train valid test; do
  python multiprocessing_bpe_encoder.py \
    --encoder-json gpt2_bpe/encoder.json \
    --vocab-bpe gpt2_bpe/vocab.bpe \
    --inputs wikitext-103-raw/wiki.${SPLIT}.raw \
    --outputs wikitext-103-raw/wiki.${SPLIT}.bpe \
    --keep-empty \
    --workers 60
done

# preprocess/binarize the data using the GPT-2 fairseq dictionary
wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
fairseq-preprocess \
  --only-source \
  --srcdict gpt2_bpe/dict.txt \
  --trainpref wikitext-103-raw/wiki.train.bpe \
  --validpref wikitext-103-raw/wiki.valid.bpe \
  --testpref wikitext-103-raw/wiki.test.bpe \
  --destdir data-bin/wikitext-103 \
  --workers 60
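
# Optional sanity check (not part of the original recipe): because the
# encoder is run with --keep-empty, each .bpe output should have exactly
# as many lines as its .raw input. A mismatch would suggest a truncated
# download or a failed encoding run.
for SPLIT in train valid test; do
  echo "wiki.${SPLIT}: raw=$(wc -l < wikitext-103-raw/wiki.${SPLIT}.raw) bpe=$(wc -l < wikitext-103-raw/wiki.${SPLIT}.bpe)"
done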
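
# Optional: confirm the binarized dataset landed where fairseq expects it.
# fairseq-preprocess should leave {train,valid,test}.{bin,idx} and a copy of
# dict.txt under --destdir, along with a preprocess.log summarizing the
# dictionary size and per-split token counts (exact filenames may vary
# across fairseq versions).
ls -lh data-bin/wikitext-103
cat data-bin/wikitext-103/preprocess.log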