Mirror of https://github.com/vale981/ray, synced 2025-03-06 10:31:39 -05:00

* Fix SC2001: See if you can use ${variable//search/replace} instead.
* Fix SC2010: Don't use ls | grep. Use a glob or a for loop with a condition to allow non-alphanumeric filenames.
* Fix SC2012: Use find instead of ls to better handle non-alphanumeric filenames.
* Fix SC2015: Note that A && B || C is not if-then-else. C may run when A is true.
* Fix SC2028: echo may not expand escape sequences. Use printf.
* Fix SC2034: variable appears unused. Verify use (or export if used externally).
* Fix SC2035: Use ./*glob* or -- *glob* so names with dashes won't become options.
* Fix SC2071: > is for string comparisons. Use -gt instead.
* Fix SC2154: variable is referenced but not assigned.
* Fix SC2164: Use 'cd ... || exit' or 'cd ... || return' in case cd fails.
* Fix SC2188: This redirection doesn't have a command. Move to its command (or use 'true' as no-op).
* Fix SC2236: Use -n instead of ! -z.
* Fix SC2242: Can only exit with status 0-255. Other data should be written to stdout/stderr.
* Fix SC2086: Double quote to prevent globbing and word splitting.

Co-authored-by: Mehrdad <noreply@github.com>
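For illustration, minimal sketches of two of the patterns above (the variable names are hypothetical, not taken from this commit's diff):

    # SC2086 + SC2164: an unquoted expansion word-splits and globs, and a
    # failed cd is silently ignored
    cd $build_dir            # before
    cd "$build_dir" || exit  # after

    # SC2015: A && B || C is not if-then-else; `echo cpu` also runs if
    # `echo gpu` itself fails
    [ -n "$use_gpu" ] && echo gpu || echo cpu                # before
    if [ -n "$use_gpu" ]; then echo gpu; else echo cpu; fi   # after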
33 lines · No EOL · 1.3 KiB · Bash
#!/usr/bin/env bash

set -euo pipefail
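# -e: exit on the first failing command; -u: treat unset variables as errors;
# -o pipefail: a pipeline fails if any command in it fails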

cd ~/efs/lm
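# assumes ~/efs/lm already exists (e.g. a shared EFS mount on the cluster);
# with `set -e` above, the script aborts here if the cd fails (SC2164)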

# download the dataset
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
unzip wikitext-103-raw-v1.zip
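# the archive should unpack to wikitext-103-raw/ with wiki.train.raw,
# wiki.valid.raw and wiki.test.raw (raw, line-oriented text)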

# encode it with the GPT-2 BPE
mkdir -p gpt2_bpe
wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
wget https://raw.githubusercontent.com/pytorch/fairseq/master/examples/roberta/multiprocessing_bpe_encoder.py
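# encoder.json maps BPE tokens to ids, vocab.bpe lists the merge rules, and
# multiprocessing_bpe_encoder.py is fairseq's parallel GPT-2 BPE encoder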

for SPLIT in train valid test; do
    python multiprocessing_bpe_encoder.py \
        --encoder-json gpt2_bpe/encoder.json \
        --vocab-bpe gpt2_bpe/vocab.bpe \
        --inputs "wikitext-103-raw/wiki.${SPLIT}.raw" \
        --outputs "wikitext-103-raw/wiki.${SPLIT}.bpe" \
        --keep-empty \
        --workers 60
done
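# each wiki.${SPLIT}.bpe line should hold the space-separated GPT-2 BPE token
# ids for the matching raw line (--keep-empty preserves blank lines so
# document boundaries survive)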

# preprocess/binarize the data using the GPT-2 fairseq dictionary
wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
fairseq-preprocess \
    --only-source \
    --srcdict gpt2_bpe/dict.txt \
    --trainpref wikitext-103-raw/wiki.train.bpe \
    --validpref wikitext-103-raw/wiki.valid.bpe \
    --testpref wikitext-103-raw/wiki.test.bpe \
    --destdir data-bin/wikitext-103 \
    --workers 60
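# on success, data-bin/wikitext-103 should contain the binarized splits
# (train.bin/train.idx, valid.bin/valid.idx, test.bin/test.idx) plus a copy
# of dict.txt, ready for fairseq-train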