[CUJ#2] Update nightly test for CUJ#2 #21064

parent 03e05df9cb
commit ec06a1f65e

1 changed file with 16 additions and 7 deletions
@@ -117,7 +117,7 @@ class DataPreprocessor:
         self.fruits = list(fruit_means.keys())
 
         fruit_one_hots = {
-            fruit: collections.defaultdict(int, fruit=1)
+            fruit: collections.defaultdict(int, **{fruit: 1})
            for fruit in self.fruits
         }
 
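Note on the one-line fix above: in the old expression, fruit=1 is a literal keyword argument, so every one-hot dict gets a key literally named "fruit" rather than the fruit's own name; **{fruit: 1} unpacks a dict keyed by the loop variable's value. A minimal standalone sketch (the fruit names here are hypothetical; the test takes them from fruit_means.keys()):

    import collections

    fruits = ["apple", "banana"]

    # Old form: the keyword is always the literal name "fruit".
    buggy = {f: collections.defaultdict(int, fruit=1) for f in fruits}
    assert dict(buggy["apple"]) == {"fruit": 1}

    # New form: **{f: 1} unpacks a dict keyed by the variable's value.
    fixed = {f: collections.defaultdict(int, **{f: 1}) for f in fruits}
    assert dict(fixed["apple"]) == {"apple": 1}
    assert fixed["apple"]["banana"] == 0  # absent fruits default to int() == 0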
@@ -454,6 +454,11 @@ if __name__ == "__main__":
         action="store_true",
         default=False,
         help="Use dummy trainer to debug dataset performance")
+    parser.add_argument(
+        "--num-epochs",
+        default=2,
+        type=int,
+        help="The number of epochs to use for training")
 
     args = parser.parse_args()
     smoke_test = args.smoke_test
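For reference, a minimal standalone sketch of the new flag (not part of the diff): argparse turns the dashed flag into an underscored attribute, which is what the args.num_epochs read in the next hunk relies on.

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--num-epochs",
        default=2,
        type=int,
        help="The number of epochs to use for training")

    assert parser.parse_args(["--num-epochs", "5"]).num_epochs == 5
    assert parser.parse_args([]).num_epochs == 2  # default when omitted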
@@ -462,6 +467,7 @@ if __name__ == "__main__":
     use_gpu = args.use_gpu
     use_s3 = args.use_s3
     large_dataset = args.large_dataset
+    num_epochs = args.num_epochs
 
     if large_dataset:
         assert use_s3, "--large-dataset requires --use-s3 to be set."
@@ -485,6 +491,7 @@ if __name__ == "__main__":
     # exists.
     mlflow.set_experiment("cuj-big-data-training")
 
+    dir_path = os.path.dirname(os.path.realpath(__file__))
     if use_s3:
         # Check if s3 data is populated.
         BUCKET_NAME = "cuj-big-data"
@@ -504,8 +511,6 @@ if __name__ == "__main__":
         inference_path = "s3://cuj-big-data/inference/"
         inference_output_path = "s3://cuj-big-data/output/"
     else:
-        dir_path = os.path.dirname(os.path.realpath(__file__))
-
         data_path = os.path.join(dir_path, "data")
         inference_path = os.path.join(dir_path, "inference")
         inference_output_path = "/tmp"
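Hoisting dir_path above the if use_s3: branch (and deleting the copy inside else:) makes the script directory available on both paths; the TensorBoard logdir introduced at the end of this diff needs it regardless of whether the data lives in S3.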
@@ -532,7 +537,6 @@ if __name__ == "__main__":
     # remove label column and internal Arrow column.
     num_features = num_columns - 2
 
-    NUM_EPOCHS = 2
     BATCH_SIZE = 512
     NUM_HIDDEN = 50  # 200
     NUM_LAYERS = 3  # 15
@@ -541,7 +545,7 @@ if __name__ == "__main__":
 
     if args.debug:
         num_gpus = 1 if use_gpu else 0
-        shards = train_dataset.repeat(NUM_EPOCHS) \
+        shards = train_dataset.repeat(num_epochs) \
             .random_shuffle_each_window(_spread_resource_prefix="node:") \
             .split(num_workers)
         del train_dataset
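With the hard-coded NUM_EPOCHS constant deleted above, repeat(num_epochs) sizes the dataset pipeline from the new flag, so --num-epochs now controls how many passes over the data the debug path makes as well as the trainer config below.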
@@ -571,7 +575,7 @@ if __name__ == "__main__":
 
     config = {
         "use_gpu": use_gpu,
-        "num_epochs": NUM_EPOCHS,
+        "num_epochs": num_epochs,
         "batch_size": BATCH_SIZE,
         "num_hidden": NUM_HIDDEN,
         "num_layers": NUM_LAYERS,
@@ -585,7 +589,12 @@ if __name__ == "__main__":
     # reported by ``train.report()`` will be logged to these 2 places.
     # TODO: TBXLoggerCallback should create nonexistent logdir
     # and should also create 1 directory per file.
-    callbacks = [TBXLoggerCallback(logdir="/tmp"), MLflowCallback(config)]
+    tbx_runs_dir = os.path.join(dir_path, "runs")
+    os.makedirs(tbx_runs_dir, exist_ok=True)
+    callbacks = [
+        TBXLoggerCallback(logdir=tbx_runs_dir),
+        MLflowCallback(config)
+    ]
 
     # Remove CPU resource so Datasets can be scheduled.
     resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None
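Since TBXLoggerCallback does not create a nonexistent logdir (per the TODO kept in the comments), the test now pre-creates a runs directory next to the script; exist_ok=True keeps repeated nightly runs from failing, and the event files end up alongside the workload instead of in the shared /tmp.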