[air/benchmarks] Measure local training time in torch/tf benchmarks (#27902)

We currently measure end-to-end training time in our benchmarks, which includes setup overhead. This makes for an unequal comparison, as the setup overhead for vanilla training cannot be measured accurately and was instead simply disregarded.
By comparing the raw times spent in the actual training loop, we get a more accurate picture of any overhead or benefit of using Ray vs. vanilla TensorFlow/Torch.
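The measurement pattern, reduced to a minimal sketch (the per-epoch work and the JSON path here are placeholders; the real scripts below report the metric through the AIR session or a custom Keras callback):

import json
import time

VANILLA_RESULT_JSON = "/tmp/vanilla_result.json"  # illustrative path only

def train_func(use_ray: bool, config: dict):
    # Start the clock inside the training function, so cluster/session setup
    # is excluded from the reported "local" training time.
    local_start_time = time.monotonic()

    loss = 0.0
    for epoch in range(config.get("epochs", 3)):
        time.sleep(0.1)  # stand-in for the real per-epoch training work
        loss = 1.0 / (epoch + 1)

    local_time_taken = time.monotonic() - local_start_time

    if use_ray:
        # Ray workers send the metric back to the driver alongside the loss.
        from ray.air import session
        session.report(dict(loss=loss, local_time_taken=local_time_taken))
    else:
        # Vanilla workers persist the same numbers for the driver to read back.
        with open(VANILLA_RESULT_JSON, "w") as f:
            json.dump({"loss": loss, "local_time_taken": local_time_taken}, f)

The pass/fail check then compares the mean local times (Ray vs. vanilla) against the target ratio, instead of the end-to-end times.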

Signed-off-by: Kai Fricke <kai@anyscale.com>
Kai Fricke 2022-08-16 19:16:08 +02:00 committed by GitHub
parent b9a2fb79b6
commit b91246a093
4 changed files with 123 additions and 52 deletions


@@ -181,9 +181,12 @@ Pytorch Training Parity
This task checks the performance parity between native Pytorch Distributed and
Ray Train's distributed TorchTrainer.
We demonstrate that the performance is similar (within 10\%) between the two frameworks.
We demonstrate that the performance is similar (within 2.5\%) between the two frameworks.
Performance may vary greatly across different model, hardware, and cluster configurations.
The reported times are for the raw training times. There is an unreported constant setup
overhead of a few seconds for both methods that is negligible for longer training runs.
- `Pytorch comparison training script`_
- `Pytorch comparison CPU cluster configuration`_
- `Pytorch comparison GPU cluster configuration`_
@@ -196,15 +199,15 @@ Performance may vary greatly across different model, hardware, and cluster configurations.
- **Command**
* - 4 m5.2xlarge nodes (4 workers)
- FashionMNIST
- 201.17 s (vs 195.90 s Pytorch)
- 196.64 s (vs 194.90 s Pytorch)
- `python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8`
* - 4 m5.2xlarge nodes (16 workers)
- FashionMNIST
- 447.14 s (vs 461.75 s Pytorch)
- 430.88 s (vs 475.97 s Pytorch)
- `python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2`
* - 4 g4dn.12xlarge nodes (16 workers)
- FashionMNIST
- 236.61 s (vs 220.97 s Pytorch)
- 149.80 s (vs 146.46 s Pytorch)
- `python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 4 --use-gpu`
@@ -216,9 +219,12 @@ Tensorflow Training Parity
This task checks the performance parity between native Tensorflow Distributed and
Ray Train's distributed TensorflowTrainer.
We demonstrate that the performance is similar (within 10\%) between the two frameworks.
We demonstrate that the performance is similar (within 1\%) between the two frameworks.
Performance may vary greatly across different model, hardware, and cluster configurations.
The reported times are for the raw training times. There is an unreported constant setup
overhead of a few seconds for both methods that is negligible for longer training runs.
.. note:: The batch size and number of epochs are different for the GPU benchmark, resulting in a longer runtime.
- `Tensorflow comparison training script`_
@@ -233,15 +239,15 @@ Performance may vary greatly across different model, hardware, and cluster configurations.
- **Command**
* - 4 m5.2xlarge nodes (4 workers)
- FashionMNIST
- 90.61 s (vs 81.26 s Tensorflow)
- 78.81 s (vs 79.67 s Tensorflow)
- `python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8`
* - 4 m5.2xlarge nodes (16 workers)
- FashionMNIST
- 75.34 s (vs 69.51 s Tensorflow)
- 64.57 s (vs 67.45 s Tensorflow)
- `python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2`
* - 4 g4dn.12xlarge nodes (16 workers)
- FashionMNIST
- 495.85 s (vs 479.28 s Tensorflow)
- 465.16 s (vs 461.74 s Tensorflow)
- `python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 200 --num-workers 16 --cpus-per-worker 4 --batch-size 64 --use-gpu`


@@ -39,12 +39,7 @@ def build_cnn_model() -> tf.keras.Model:
def train_func(use_ray: bool, config: dict):
if use_ray:
from ray.air.callbacks.keras import Callback as TrainCheckpointReportCallback
callbacks = [TrainCheckpointReportCallback(frequency=0)]
else:
callbacks = []
local_start_time = time.monotonic()
per_worker_batch_size = config.get("batch_size", 64)
epochs = config.get("epochs", 3)
@@ -69,6 +64,18 @@ def train_func(use_ray: bool, config: dict):
metrics=["accuracy"],
)
if use_ray:
from ray.air.callbacks.keras import Callback as TrainCheckpointReportCallback
class CustomReportCallback(TrainCheckpointReportCallback):
def _handle(self, logs: dict, when: str = None):
logs["local_time_taken"] = time.monotonic() - local_start_time
super()._handle(logs, when)
callbacks = [CustomReportCallback(frequency=0)]
else:
callbacks = []
history = multi_worker_model.fit(
multi_worker_dataset,
epochs=epochs,
@@ -79,10 +86,11 @@ def train_func(use_ray: bool, config: dict):
loss = results["loss"][-1]
if not use_ray:
local_time_taken = time.monotonic() - local_start_time
print(f"Reporting loss: {loss:.4f}")
if local_rank == 0:
with open(VANILLA_RESULT_JSON, "w") as f:
json.dump({"loss": loss}, f)
json.dump({"loss": loss, "local_time_taken": local_time_taken}, f)
return results
@@ -93,7 +101,7 @@ def train_tf_ray_air(
num_workers: int = 4,
cpus_per_worker: int = 8,
use_gpu: bool = False,
) -> Tuple[float, float]:
) -> Tuple[float, float, float]:
# This function is kicked off by the main() function and runs a full training
# run using Ray AIR.
from ray.train.tensorflow import TensorflowTrainer
@@ -117,7 +125,7 @@ def train_tf_ray_air(
time_taken = time.monotonic() - start_time
print(f"Last result: {result.metrics}")
return time_taken, result.metrics["loss"]
return time_taken, result.metrics["local_time_taken"], result.metrics["loss"]
def train_tf_vanilla_worker(
@@ -147,7 +155,7 @@ def train_tf_vanilla(
num_workers: int = 4,
cpus_per_worker: int = 8,
use_gpu: bool = False,
) -> Tuple[float, float]:
) -> Tuple[float, float, float]:
# This function is kicked off by the main() function and subsequently kicks
# off tasks that run train_tf_vanilla_worker() on the worker nodes.
from benchmark_util import (
@@ -205,13 +213,14 @@ def train_tf_vanilla(
run_commands_on_actors(actors=actors, cmds=cmds)
time_taken = time.monotonic() - start_time
loss = 0.0
loss = local_time_taken = 0.0
if os.path.exists(VANILLA_RESULT_JSON):
with open(VANILLA_RESULT_JSON, "r") as f:
result = json.load(f)
loss = result["loss"]
local_time_taken = result["local_time_taken"]
return time_taken, loss
return time_taken, local_time_taken, loss
@click.group(help="Run Tensorflow benchmarks")
@@ -253,15 +262,17 @@ def run(
run_command_on_all_nodes(["python", path])
times_ray = []
times_local_ray = []
losses_ray = []
times_vanilla = []
times_local_vanilla = []
losses_vanilla = []
for run in range(1, num_runs + 1):
time.sleep(2)
print(f"[Run {run}/{num_runs}] Running Tensorflow Ray benchmark")
time_ray, loss_ray = train_tf_ray_air(
time_ray, time_local_ray, loss_ray = train_tf_ray_air(
num_workers=num_workers,
cpus_per_worker=cpus_per_worker,
use_gpu=use_gpu,
@@ -270,7 +281,8 @@ def run(
print(
f"[Run {run}/{num_runs}] Finished Ray training ({num_epochs} epochs) in "
f"{time_ray:.2f} seconds. Observed loss = {loss_ray:.4f}"
f"{time_ray:.2f} seconds (local training time: {time_local_ray:.2f}s). "
f"Observed loss = {loss_ray:.4f}"
)
time.sleep(2)
@@ -279,10 +291,10 @@ def run(
# Todo: Vanilla runs are sometimes failing. We just retry here, but we should
# get to the bottom of it.
time_vanilla = loss_vanilla = 0.0
time_vanilla = time_local_vanilla = loss_vanilla = 0.0
for i in range(3):
try:
time_vanilla, loss_vanilla = train_tf_vanilla(
time_vanilla, time_local_vanilla, loss_vanilla = train_tf_vanilla(
num_workers=num_workers,
cpus_per_worker=cpus_per_worker,
use_gpu=use_gpu,
@@ -297,40 +309,58 @@ def run(
print(
f"[Run {run}/{num_runs}] Finished vanilla training ({num_epochs} epochs) "
f"in {time_vanilla:.2f} seconds. Observed loss = {loss_vanilla:.4f}"
f"in {time_vanilla:.2f} seconds "
f"(local training time: {time_local_vanilla:.2f}s). "
f"Observed loss = {loss_vanilla:.4f}"
)
print(
f"[Run {run}/{num_runs}] Observed results: ",
{
"tensorflow_mnist_ray_time_s": time_ray,
"tensorflow_mnist_ray_local_time_s": time_local_ray,
"tensorflow_mnist_ray_loss": loss_ray,
"tensorflow_mnist_vanilla_time_s": time_vanilla,
"tensorflow_mnist_vanilla_local_time_s": time_local_vanilla,
"tensorflow_mnist_vanilla_loss": loss_vanilla,
},
)
times_ray.append(time_ray)
times_local_ray.append(time_local_ray)
losses_ray.append(loss_ray)
times_vanilla.append(time_vanilla)
times_local_vanilla.append(time_local_vanilla)
losses_vanilla.append(loss_vanilla)
times_ray_mean = np.mean(times_ray)
times_ray_sd = np.std(times_ray)
times_local_ray_mean = np.mean(times_local_ray)
times_local_ray_sd = np.std(times_local_ray)
times_vanilla_mean = np.mean(times_vanilla)
times_vanilla_sd = np.std(times_vanilla)
times_local_vanilla_mean = np.mean(times_local_vanilla)
times_local_vanilla_sd = np.std(times_local_vanilla)
result = {
"tensorflow_mnist_ray_num_runs": num_runs,
"tensorflow_mnist_ray_time_s_all": times_ray,
"tensorflow_mnist_ray_time_s_mean": times_ray_mean,
"tensorflow_mnist_ray_time_s_sd": times_ray_sd,
"tensorflow_mnist_ray_time_local_s_all": times_local_ray,
"tensorflow_mnist_ray_time_local_s_mean": times_local_ray_mean,
"tensorflow_mnist_ray_time_local_s_sd": times_local_ray_sd,
"tensorflow_mnist_ray_loss_mean": np.mean(losses_ray),
"tensorflow_mnist_ray_loss_sd": np.std(losses_ray),
"tensorflow_mnist_vanilla_time_s_all": times_vanilla,
"tensorflow_mnist_vanilla_time_s_mean": times_vanilla_mean,
"tensorflow_mnist_vanilla_time_s_sd": times_vanilla_sd,
"tensorflow_mnist_vanilla_local_time_s_all": times_local_vanilla,
"tensorflow_mnist_vanilla_local_time_s_mean": times_local_vanilla_mean,
"tensorflow_mnist_vanilla_local_time_s_sd": times_local_vanilla_sd,
"tensorflow_mnist_vanilla_loss_mean": np.mean(losses_vanilla),
"tensorflow_mnist_vanilla_loss_std": np.std(losses_vanilla),
}
@@ -341,18 +371,22 @@ def run(
json.dump(result, f)
target_ratio = 1.2
ratio = (times_ray_mean / times_vanilla_mean) if times_vanilla_mean != 0.0 else 1.0
ratio = (
(times_local_ray_mean / times_local_vanilla_mean)
if times_local_vanilla_mean != 0.0
else 1.0
)
if ratio > target_ratio:
raise RuntimeError(
f"Training on Ray took an average of {times_ray_mean:.2f} seconds, "
f"Training on Ray took an average of {times_local_ray_mean:.2f} seconds, "
f"which is more than {target_ratio:.2f}x of the average vanilla training "
f"time of {times_vanilla_mean:.2f} seconds ({ratio:.2f}x). FAILED"
f"time of {times_local_vanilla_mean:.2f} seconds ({ratio:.2f}x). FAILED"
)
print(
f"Training on Ray took an average of {times_ray_mean:.2f} seconds, "
f"Training on Ray took an average of {times_local_ray_mean:.2f} seconds, "
f"which is less than {target_ratio:.2f}x of the average vanilla training "
f"time of {times_vanilla_mean:.2f} seconds ({ratio:.2f}x). PASSED"
f"time of {times_local_vanilla_mean:.2f} seconds ({ratio:.2f}x). PASSED"
)


@@ -78,6 +78,8 @@ def validate_epoch(dataloader, model, loss_fn, world_size: int, local_rank: int)
def train_func(use_ray: bool, config: Dict):
local_start_time = time.monotonic()
if use_ray:
from ray.air import session
import ray.train as train
@@ -197,13 +199,16 @@ def train_func(use_ray: bool, config: Dict):
world_size=world_size,
local_rank=local_rank,
)
local_time_taken = time.monotonic() - local_start_time
if use_ray:
session.report(dict(loss=loss))
session.report(dict(loss=loss, local_time_taken=local_time_taken))
else:
print(f"Reporting loss: {loss:.4f}")
if local_rank == 0:
with open(VANILLA_RESULT_JSON, "w") as f:
json.dump({"loss": loss}, f)
json.dump({"loss": loss, "local_time_taken": local_time_taken}, f)
def train_torch_ray_air(
@@ -212,7 +217,7 @@ def train_torch_ray_air(
num_workers: int = 4,
cpus_per_worker: int = 8,
use_gpu: bool = False,
) -> Tuple[float, float]:
) -> Tuple[float, float, float]:
# This function is kicked off by the main() function and runs a full training
# run using Ray AIR.
from ray.train.torch import TorchTrainer
@@ -236,7 +241,7 @@ def train_torch_ray_air(
time_taken = time.monotonic() - start_time
print(f"Last result: {result.metrics}")
return time_taken, result.metrics["loss"]
return time_taken, result.metrics["local_time_taken"], result.metrics["loss"]
def train_torch_vanilla_worker(
@@ -273,7 +278,7 @@ def train_torch_vanilla(
num_workers: int = 4,
cpus_per_worker: int = 8,
use_gpu: bool = False,
) -> Tuple[float, float]:
) -> Tuple[float, float, float]:
# This function is kicked off by the main() function and subsequently kicks
# off tasks that run train_torch_vanilla_worker() on the worker nodes.
from benchmark_util import (
@@ -363,8 +368,9 @@ def train_torch_vanilla(
with open(VANILLA_RESULT_JSON, "r") as f:
result = json.load(f)
loss = result["loss"]
local_time_taken = result["local_time_taken"]
return time_taken, loss
return time_taken, local_time_taken, loss
@click.group(help="Run Torch benchmarks")
@@ -417,15 +423,17 @@ def run(
run_command_on_all_nodes(["python", path])
times_ray = []
times_local_ray = []
losses_ray = []
times_vanilla = []
times_local_vanilla = []
losses_vanilla = []
for run in range(1, num_runs + 1):
time.sleep(2)
print(f"[Run {run}/{num_runs}] Running Torch Ray benchmark")
time_ray, loss_ray = train_torch_ray_air(
time_ray, time_local_ray, loss_ray = train_torch_ray_air(
num_workers=num_workers,
cpus_per_worker=cpus_per_worker,
use_gpu=use_gpu,
@@ -434,14 +442,15 @@ def run(
print(
f"[Run {run}/{num_runs}] Finished Ray training ({num_epochs} epochs) in "
f"{time_ray:.2f} seconds. Observed loss = {loss_ray:.4f}"
f"{time_ray:.2f} seconds (local training time: {time_local_ray:.2f}s). "
f"Observed loss = {loss_ray:.4f}"
)
time.sleep(2)
print(f"[Run {run}/{num_runs}] Running Torch vanilla benchmark")
time_vanilla, loss_vanilla = train_torch_vanilla(
time_vanilla, time_local_vanilla, loss_vanilla = train_torch_vanilla(
num_workers=num_workers,
cpus_per_worker=cpus_per_worker,
use_gpu=use_gpu,
@@ -450,40 +459,58 @@ def run(
print(
f"[Run {run}/{num_runs}] Finished vanilla training ({num_epochs} epochs) "
f"in {time_vanilla:.2f} seconds. Observed loss = {loss_vanilla:.4f}"
f"in {time_vanilla:.2f} seconds "
f"(local training time: {time_local_vanilla:.2f}s). "
f"Observed loss = {loss_vanilla:.4f}"
)
print(
f"[Run {run}/{num_runs}] Observed results: ",
{
"torch_mnist_ray_time_s": time_ray,
"torch_mnist_ray_loss": loss_ray,
"torch_mnist_vanilla_time_s": time_vanilla,
"torch_mnist_vanilla_loss": loss_vanilla,
"tensorflow_mnist_ray_time_s": time_ray,
"tensorflow_mnist_ray_local_time_s": time_local_ray,
"tensorflow_mnist_ray_loss": loss_ray,
"tensorflow_mnist_vanilla_time_s": time_vanilla,
"tensorflow_mnist_vanilla_local_time_s": time_local_vanilla,
"tensorflow_mnist_vanilla_loss": loss_vanilla,
},
)
times_ray.append(time_ray)
times_local_ray.append(time_local_ray)
losses_ray.append(loss_ray)
times_vanilla.append(time_vanilla)
times_local_vanilla.append(time_local_vanilla)
losses_vanilla.append(loss_vanilla)
times_ray_mean = np.mean(times_ray)
times_ray_sd = np.std(times_ray)
times_local_ray_mean = np.mean(times_local_ray)
times_local_ray_sd = np.std(times_local_ray)
times_vanilla_mean = np.mean(times_vanilla)
times_vanilla_sd = np.std(times_vanilla)
times_local_vanilla_mean = np.mean(times_local_vanilla)
times_local_vanilla_sd = np.std(times_local_vanilla)
result = {
"torch_mnist_ray_num_runs": num_runs,
"torch_mnist_ray_time_s_all": times_ray,
"torch_mnist_ray_time_s_mean": times_ray_mean,
"torch_mnist_ray_time_s_sd": times_ray_sd,
"torch_mnist_ray_time_local_s_all": times_local_ray,
"torch_mnist_ray_time_local_s_mean": times_local_ray_mean,
"torch_mnist_ray_time_local_s_sd": times_local_ray_sd,
"torch_mnist_ray_loss_mean": np.mean(losses_ray),
"torch_mnist_ray_loss_sd": np.std(losses_ray),
"torch_mnist_vanilla_time_s_all": times_vanilla,
"torch_mnist_vanilla_time_s_mean": times_vanilla_mean,
"torch_mnist_vanilla_time_s_sd": times_vanilla_sd,
"torch_mnist_vanilla_local_time_s_all": times_local_vanilla,
"torch_mnist_vanilla_local_time_s_mean": times_local_vanilla_mean,
"torch_mnist_vanilla_local_time_s_sd": times_local_vanilla_sd,
"torch_mnist_vanilla_loss_mean": np.mean(losses_vanilla),
"torch_mnist_vanilla_loss_std": np.std(losses_vanilla),
}
@@ -494,18 +521,22 @@ def run(
json.dump(result, f)
target_ratio = 1.15
ratio = (times_ray_mean / times_vanilla_mean) if times_vanilla_mean != 0.0 else 1.0
ratio = (
(times_local_ray_mean / times_local_vanilla_mean)
if times_local_vanilla_mean != 0.0
else 1.0
)
if ratio > target_ratio:
raise RuntimeError(
f"Training on Ray took an average of {times_ray_mean:.2f} seconds, "
f"Training on Ray took an average of {times_local_ray_mean:.2f} seconds, "
f"which is more than {target_ratio:.2f}x of the average vanilla training "
f"time of {times_vanilla_mean:.2f} seconds ({ratio:.2f}x). FAILED"
f"time of {times_local_vanilla_mean:.2f} seconds ({ratio:.2f}x). FAILED"
)
print(
f"Training on Ray took an average of {times_ray_mean:.2f} seconds, "
f"Training on Ray took an average of {times_local_ray_mean:.2f} seconds, "
f"which is less than {target_ratio:.2f}x of the average vanilla training "
f"time of {times_vanilla_mean:.2f} seconds ({ratio:.2f}x). PASSED"
f"time of {times_local_vanilla_mean:.2f} seconds ({ratio:.2f}x). PASSED"
)


@@ -454,7 +454,7 @@
cluster_compute: compute_cpu_4.yaml
run:
timeout: 3600
timeout: 5400
script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8
wait_for_nodes:
@@ -479,7 +479,7 @@
cluster_compute: compute_cpu_1.yaml
run:
timeout: 3600
timeout: 5400
script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2
type: sdk_command
@@ -528,7 +528,7 @@
cluster_compute: compute_gpu_4x4.yaml
run:
timeout: 3600
timeout: 5400
script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 200 --num-workers 16 --cpus-per-worker 4 --batch-size 64 --use-gpu
wait_for_nodes: