#!/usr/bin/env bash
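
# Fail fast: exit on any error (-e), on unset variables (-u), and on
# failures anywhere in a pipeline (-o pipefail).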
set -euo pipefail
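
# Working directory for the run; this path is specific to the original
# environment, so adjust it to wherever the data should live.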
cd ~/efs/lm
# download the dataset
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
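# (Re-running re-downloads the archive; wget -c, or guarding with
#  [[ -f wikitext-103-raw-v1.zip ]] || wget ..., would make this idempotent.)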
unzip wikitext-103-raw-v1.zip
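# The archive unpacks into wikitext-103-raw/ containing wiki.train.raw,
# wiki.valid.raw and wiki.test.raw, which the paths below rely on.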
# encode it with the GPT-2 BPE
mkdir -p gpt2_bpe
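# encoder.json is the GPT-2 token<->id vocabulary and vocab.bpe the BPE
# merge rules, both mirrored by fairseq.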
wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
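# Helper script from the fairseq RoBERTa example that applies the GPT-2 BPE
# across multiple worker processes.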
wget https://raw.githubusercontent.com/pytorch/fairseq/master/examples/roberta/multiprocessing_bpe_encoder.py
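
# Encode each split line by line. --keep-empty preserves blank lines so the
# document structure survives encoding; --workers 60 assumes a large machine
# and should match the available cores.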
for SPLIT in train valid test; do
    python multiprocessing_bpe_encoder.py \
        --encoder-json gpt2_bpe/encoder.json \
        --vocab-bpe gpt2_bpe/vocab.bpe \
        --inputs "wikitext-103-raw/wiki.${SPLIT}.raw" \
        --outputs "wikitext-103-raw/wiki.${SPLIT}.bpe" \
        --keep-empty \
        --workers 60
done
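
# Optional sanity check (not in the original script): the encoder works line
# by line, so each .bpe file should have the same line count as its .raw
# source, e.g.:
#   wc -l wikitext-103-raw/wiki.valid.raw wikitext-103-raw/wiki.valid.bpe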
# preprocess/binarize the data using the GPT-2 fairseq dictionary
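# dict.txt is the fairseq dictionary over the GPT-2 BPE vocabulary; passing
# it via --srcdict below reuses that fixed mapping instead of building a new
# one from the data.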
wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
fairseq-preprocess \
    --only-source \
    --srcdict gpt2_bpe/dict.txt \
    --trainpref wikitext-103-raw/wiki.train.bpe \
    --validpref wikitext-103-raw/wiki.valid.bpe \
    --testpref wikitext-103-raw/wiki.test.bpe \
    --destdir data-bin/wikitext-103 \
    --workers 60
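
# fairseq-preprocess writes the binarized splits (train/valid/test .bin and
# .idx files) plus a copy of the dictionary into data-bin/wikitext-103, ready
# for training, e.g. (one possible invocation, not part of this script):
#   fairseq-train data-bin/wikitext-103 --task language_modeling --arch transformer_lm ...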