#!/usr/bin/env bash
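
# Fail fast: exit on any error (-e), on unset variables (-u), and on
# failures anywhere in a pipeline (-o pipefail).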
set -euo pipefail
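
# Working directory for the run; this path is specific to the original
# environment, so adjust it to wherever the data should live.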
cd ~/efs/lm
# download the dataset
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
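# (Re-running re-downloads the archive; wget -c, or guarding with
#  [[ -f wikitext-103-raw-v1.zip ]] || wget ..., would make this idempotent.)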
unzip wikitext-103-raw-v1.zip
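# The archive unpacks into wikitext-103-raw/ containing wiki.train.raw,
# wiki.valid.raw and wiki.test.raw, which the paths below rely on.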
# encode it with the GPT-2 BPE
mkdir -p gpt2_bpe
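# encoder.json is the GPT-2 token<->id vocabulary and vocab.bpe the BPE
# merge rules, both mirrored by fairseq.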
wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
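# Helper script from the fairseq RoBERTa example that applies the GPT-2 BPE
# across multiple worker processes.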
wget https://raw.githubusercontent.com/pytorch/fairseq/master/examples/roberta/multiprocessing_bpe_encoder.py
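
# Encode each split line by line. --keep-empty preserves blank lines so the
# document structure survives encoding; --workers 60 assumes a large machine
# and should match the available cores.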
for SPLIT in train valid test; do
    python multiprocessing_bpe_encoder.py \
        --encoder-json gpt2_bpe/encoder.json \
        --vocab-bpe gpt2_bpe/vocab.bpe \
        --inputs "wikitext-103-raw/wiki.${SPLIT}.raw" \
        --outputs "wikitext-103-raw/wiki.${SPLIT}.bpe" \
        --keep-empty \
        --workers 60
done
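
# Optional sanity check (not in the original script): the encoder works line
# by line, so each .bpe file should have the same line count as its .raw
# source, e.g.:
#   wc -l wikitext-103-raw/wiki.valid.raw wikitext-103-raw/wiki.valid.bpe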
# preprocess/binarize the data using the GPT-2 fairseq dictionary
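# dict.txt is the fairseq dictionary over the GPT-2 BPE vocabulary; passing
# it via --srcdict below reuses that fixed mapping instead of building a new
# one from the data.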
wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
fairseq-preprocess \
    --only-source \
    --srcdict gpt2_bpe/dict.txt \
    --trainpref wikitext-103-raw/wiki.train.bpe \
    --validpref wikitext-103-raw/wiki.valid.bpe \
    --testpref wikitext-103-raw/wiki.test.bpe \
    --destdir data-bin/wikitext-103 \
    --workers 60
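
# fairseq-preprocess writes the binarized splits (train/valid/test .bin and
# .idx files) plus a copy of the dictionary into data-bin/wikitext-103, ready
# for training, e.g. (one possible invocation, not part of this script):
#   fairseq-train data-bin/wikitext-103 --task language_modeling --arch transformer_lm ...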