# This file is generated by `ray project create`.
#
# Project definition for the Ray ResNet example: it names the cluster and
# environment files to use and declares a single `train` command that
# downloads a CIFAR dataset and launches `resnet_main.py`.

name: ray-example-resnet

description: "Using ray to train resnet on multiple gpus"
tags: ["ray-example", "machine-learning", "tensorflow", "resnet"]
documentation: https://ray.readthedocs.io/en/latest/auto_examples/plot_resnet.html

cluster: .rayproject/cluster.yaml

environment:
  requirements: .rayproject/requirements.txt

commands:
  - name: train
    # `{{...}}` placeholders are filled in from the `params` declared below.
    # Params are declared with hyphens but referenced with underscores
    # (e.g. `eval-dir` -> `{{eval_dir}}`) -- NOTE(review): confirm against
    # the Ray project command runner.
    #
    # `=` is used instead of `==` inside `[ ]` so the comparison also works
    # when the command is run by a plain POSIX `sh`.
    #
    # `--eval_dir` receives `{{eval_dir}}`; it previously received
    # `{{eval_data_path}}`, which left the `eval-dir` parameter unused.
    command: |
      if [ "{{dataset}}" = "cifar10" ]; then
        # Get the CIFAR-10 dataset.
        curl -o cifar-10-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz;
        tar -xvf cifar-10-binary.tar.gz;
      else
        # Get the CIFAR-100 dataset.
        curl -o cifar-100-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-100-binary.tar.gz;
        tar -xvf cifar-100-binary.tar.gz;
      fi
      python resnet_main.py --dataset {{dataset}} --train_data_path {{train_data_path}} --eval_data_path {{eval_data_path}} --eval_dir {{eval_dir}} --eval_batch_count {{eval_batch_count}} --num_gpus {{num_gpus}}
    params:
      - name: dataset
        help: "The dataset to train on."
        default: "cifar10"
        choices: ["cifar10", "cifar100"]
      # NOTE(review): the two data-path defaults below point at the CIFAR-10
      # files; they are not switched automatically when `dataset` is
      # `cifar100`, so pass matching paths explicitly in that case.
      - name: train-data-path
        help: "Data path for the training data."
        # The inner single quotes are part of the value: they end up in the
        # shell command, so the `*` is not expanded by the shell.
        default: "'cifar-10-batches-bin/data_batch*'"
        type: str
      - name: eval-data-path
        help: "Data path for the testing data."
        default: "cifar-10-batches-bin/test_batch.bin"
        type: str
      - name: eval-dir
        help: "Data path for the tensorboard logs."
        default: "/tmp/resnet-model/eval"
        type: str
      - name: eval-batch-count
        help: "Number of batches to evaluate over."
        default: 50
        type: int
      - name: num-gpus
        help: "Number of GPUs to use for training."
        default: 0
        type: int
    config:
      tmux: true

output_files:
  # Save the logs from the latest run in snapshots.
  - "/tmp/ray/session_latest/logs"