ray/doc/examples/resnet/.rayproject/project.yaml

58 lines
2 KiB
YAML

# This file is generated by `ray project create`.

# Project identity and discovery metadata.
name: ray-example-resnet
description: "Using ray to train resnet on multiple gpus"
tags: ["ray-example", "machine-learning", "tensorflow", "resnet"]
documentation: https://ray.readthedocs.io/en/latest/auto_examples/plot_resnet.html

# Path to the cluster definition used by this project.
cluster: .rayproject/cluster.yaml

# `requirements` has to be nested under `environment`; written at column 0
# it parses as an unrelated top-level key and leaves `environment` null.
environment:
  requirements: .rayproject/requirements.txt
commands:
  # `train`: fetches and unpacks the CIFAR archive selected by `dataset`,
  # then runs resnet_main.py with the parameters declared under `params`.
  #
  # Placeholders inside `command` refer to the params with dashes written as
  # underscores (e.g. `train-data-path` -> `{{train_data_path}}`).
  - name: train
    command: |
      if [ "{{dataset}}" == "cifar10" ]; then
        # Get the CIFAR-10 dataset.
        curl -o cifar-10-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz;
        tar -xvf cifar-10-binary.tar.gz;
      else
        # Get the CIFAR-100 dataset.
        curl -o cifar-100-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-100-binary.tar.gz;
        tar -xvf cifar-100-binary.tar.gz;
      fi
      python resnet_main.py --dataset {{dataset}} --train_data_path {{train_data_path}} --eval_data_path {{eval_data_path}} --eval_dir {{eval_dir}} --eval_batch_count {{eval_batch_count}} --num_gpus {{num_gpus}}
    # NOTE(review): the path defaults below point at the CIFAR-10 extraction
    # directory; with `dataset` set to cifar100 they presumably need to be
    # overridden by the caller -- confirm.
    params:
      # Selects which archive is downloaded and is forwarded as --dataset.
      - name: dataset
        help: "The dataset to train on."
        default: "cifar10"
        choices: ["cifar10", "cifar100"]
      # The inner single quotes are part of the value, so the glob reaches
      # resnet_main.py unexpanded instead of being expanded by the shell.
      - name: train-data-path
        help: "Data path for the training data."
        default: "'cifar-10-batches-bin/data_batch*'"
        type: str
      - name: eval-data-path
        help: "Data path for the testing data."
        default: "cifar-10-batches-bin/test_batch.bin"
        type: str
      # Forwarded as --eval_dir (previously the command passed
      # eval_data_path here and this parameter was never used).
      - name: eval-dir
        help: "Data path for the tensorboard logs."
        default: "/tmp/resnet-model/eval"
        type: str
      - name: eval-batch-count
        help: "Number of batches to evaluate over."
        default: 50
        type: int
      - name: num-gpus
        help: "Number of GPUs to use for training."
        default: 0
        type: int
    # Per-command execution settings; `tmux: true` requests running the
    # command inside tmux.
    config:
      tmux: true
output_files:
  # Save the logs from the latest run in snapshots.
  - "/tmp/ray/session_latest/logs"