
cd ~/efs/lm  # work inside the shared ~/efs/lm directory

# download the dataset
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
unzip wikitext-103-raw-v1.zip

# encode it with the GPT-2 BPE: fetch the encoder/vocab files and fairseq's
# multiprocessing encoder script, then encode each split of the raw text
mkdir -p gpt2_bpe
wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
wget https://raw.githubusercontent.com/pytorch/fairseq/master/examples/roberta/multiprocessing_bpe_encoder.py
for SPLIT in train valid test; do
    python multiprocessing_bpe_encoder.py \
        --encoder-json gpt2_bpe/encoder.json \
        --vocab-bpe gpt2_bpe/vocab.bpe \
        --inputs wikitext-103-raw/wiki.${SPLIT}.raw \
        --outputs wikitext-103-raw/wiki.${SPLIT}.bpe \
        --keep-empty \
        --workers 60
done
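
# (Not in the original script) A minimal sanity check one might add here,
# assuming the loop above succeeded: confirm each BPE-encoded split exists
# and is non-empty before binarizing.
for SPLIT in train valid test; do
    wc -l wikitext-103-raw/wiki.${SPLIT}.bpe
done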

# preprocess/binarize the data using the GPT-2 fairseq dictionary
wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
fairseq-preprocess \
    --only-source \
    --srcdict gpt2_bpe/dict.txt \
    --trainpref wikitext-103-raw/wiki.train.bpe \
    --validpref wikitext-103-raw/wiki.valid.bpe \
    --testpref wikitext-103-raw/wiki.test.bpe \
    --destdir data-bin/wikitext-103 \
    --workers 60
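
# (Not in the original script) Sketch of a final check, assuming the standard
# fairseq-preprocess output layout: the binarized dataset should appear in
# data-bin/wikitext-103 as dict.txt plus the {train,valid,test}.{bin,idx}
# pairs, which is the directory the training step then points fairseq at.
ls -lh data-bin/wikitext-103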