From 8a306063088fcc2553168cc2ba9cd6e22f9366fc Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 30 Aug 2022 21:36:41 +0200 Subject: [PATCH] [AIR][Docs] Improve Hugging Face notebook example (#28121) Improves the HF notebook by making use of preprocessors and adding a section on tuning. Brings it in line with the Ray Summit 2022 demo. Signed-off-by: Antoni Baum antoni.baum@protonmail.com --- .../huggingface_text_classification.ipynb | 2053 +++++++++++------ 1 file changed, 1388 insertions(+), 665 deletions(-) diff --git a/doc/source/ray-air/examples/huggingface_text_classification.ipynb b/doc/source/ray-air/examples/huggingface_text_classification.ipynb index 6ba93c934..a44ba0d2d 100644 --- a/doc/source/ray-air/examples/huggingface_text_classification.ipynb +++ b/doc/source/ray-air/examples/huggingface_text_classification.ipynb @@ -18,7 +18,7 @@ "In this notebook, we will:\n", "1. [Set up Ray](#setup)\n", "2. [Load the dataset](#load)\n", - "3. [Preprocess the dataset](#preprocess)\n", + "3. [Preprocess the dataset with Ray AIR](#preprocess)\n", "4. [Run the training with Ray AIR](#train)\n", "5. [Predict on test data with Ray AIR](#predict)\n", "6. [Optionally, share the model with the community](#share)" @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "id": "YajFzmkthYbO" }, @@ -61,7 +61,7 @@ "source": [ "We will use `ray.init()` to initialize a local cluster. By default, this cluster will be compromised of only the machine you are running this notebook on. You can also run this notebook on an Anyscale cluster.\n", "\n", - "This notebook *will not* run in [Ray Client](https://docs.ray.io/en/latest/cluster/running-applications/job-submission/ray-client.html) mode." + "Note: this notebook *will not* run in Ray Client mode." 
] }, { @@ -75,10 +75,64 @@ "outputId": "e527bdbb-2f28-4142-cca0-762e0566cbcd" }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-08-25 10:09:51,282\tINFO worker.py:1223 -- Using address localhost:9031 set in the environment variable RAY_ADDRESS\n", + "2022-08-25 10:09:51,697\tINFO worker.py:1333 -- Connecting to existing Ray cluster at address: 172.31.80.117:9031...\n", + "2022-08-25 10:09:51,706\tINFO worker.py:1509 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://session-i8ddtfaxhwypbvnyb9uzg7xs.i.anyscaleuserdata-staging.com/auth/?token=agh0_CkcwRQIhAJXwvxwq31GryaWthvXGCXZebsijbuqi7qL2pCa5uROOAiBGjzsyXAJFHLlaEI9zSlNI8ewtghKg5UV3t8NmlxuMcRJmEiCtvjcKE0VPiU7iQx51P9oPQjfpo5g1RJXccVSS5005cBgCIgNuL2E6DAj9xazjBhDwj4veAUIMCP3ClJgGEPCPi94B-gEeChxzZXNfaThERFRmQVhId1lwYlZueWI5dVpnN3hT&redirect_to=dashboard \u001b[39m\u001b[22m\n", + "2022-08-25 10:09:51,709\tINFO packaging.py:342 -- Pushing file package 'gcs://_ray_pkg_3332f64b0a461fddc20be71129115d0a.zip' (0.34MiB) to Ray cluster...\n", + "2022-08-25 10:09:51,714\tINFO packaging.py:351 -- Successfully pushed file package 'gcs://_ray_pkg_3332f64b0a461fddc20be71129115d0a.zip'.\n" + ] + }, { "data": { + "text/html": [ + "
\n", + "
\n", + "

Ray

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
Python version:3.8.5
Ray version: 2.0.0
Dashboard:http://session-i8ddtfaxhwypbvnyb9uzg7xs.i.anyscaleuserdata-staging.com/auth/?token=agh0_CkcwRQIhAJXwvxwq31GryaWthvXGCXZebsijbuqi7qL2pCa5uROOAiBGjzsyXAJFHLlaEI9zSlNI8ewtghKg5UV3t8NmlxuMcRJmEiCtvjcKE0VPiU7iQx51P9oPQjfpo5g1RJXccVSS5005cBgCIgNuL2E6DAj9xazjBhDwj4veAUIMCP3ClJgGEPCPi94B-gEeChxzZXNfaThERFRmQVhId1lwYlZueWI5dVpnN3hT&redirect_to=dashboard
\n", + "
\n", + "
\n" + ], "text/plain": [ - "RayContext(dashboard_url='', python_version='3.7.13', ray_version='2.0.0.dev0', ray_commit='e2ee2140f97ca08b70fd0f7561038b7f8d958d63', address_info={'node_ip_address': '172.28.0.2', 'raylet_ip_address': '172.28.0.2', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-05-12_18-30-10_467499_75/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-05-12_18-30-10_467499_75/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-05-12_18-30-10_467499_75', 'metrics_export_port': 64840, 'gcs_address': '172.28.0.2:58661', 'address': '172.28.0.2:58661', 'node_id': '65d091b8f504ccd72024fd0b1a8445a8f9ea43e86bcbf67868c22ba7'})" + "RayContext(dashboard_url='session-i8ddtfaxhwypbvnyb9uzg7xs.i.anyscaleuserdata-staging.com/auth/?token=agh0_CkcwRQIhAJXwvxwq31GryaWthvXGCXZebsijbuqi7qL2pCa5uROOAiBGjzsyXAJFHLlaEI9zSlNI8ewtghKg5UV3t8NmlxuMcRJmEiCtvjcKE0VPiU7iQx51P9oPQjfpo5g1RJXccVSS5005cBgCIgNuL2E6DAj9xazjBhDwj4veAUIMCP3ClJgGEPCPi94B-gEeChxzZXNfaThERFRmQVhId1lwYlZueWI5dVpnN3hT&redirect_to=dashboard', python_version='3.8.5', ray_version='2.0.0', ray_commit='cba26cc83f6b5b8a2ff166594a65cb74c0ec8740', address_info={'node_ip_address': '172.31.80.117', 'raylet_ip_address': '172.31.80.117', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-08-25_09-57-39_455459_216/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-08-25_09-57-39_455459_216/sockets/raylet', 'webui_url': 'session-i8ddtfaxhwypbvnyb9uzg7xs.i.anyscaleuserdata-staging.com/auth/?token=agh0_CkcwRQIhAJXwvxwq31GryaWthvXGCXZebsijbuqi7qL2pCa5uROOAiBGjzsyXAJFHLlaEI9zSlNI8ewtghKg5UV3t8NmlxuMcRJmEiCtvjcKE0VPiU7iQx51P9oPQjfpo5g1RJXccVSS5005cBgCIgNuL2E6DAj9xazjBhDwj4veAUIMCP3ClJgGEPCPi94B-gEeChxzZXNfaThERFRmQVhId1lwYlZueWI5dVpnN3hT&redirect_to=dashboard', 'session_dir': '/tmp/ray/session_2022-08-25_09-57-39_455459_216', 'metrics_export_port': 55366, 'gcs_address': '172.31.80.117:9031', 'address': '172.31.80.117:9031', 
'dashboard_agent_listen_port': 52365, 'node_id': '422ff33444fd0f870aa6e718628407400a0ec9483a637c3026c3f9a3'})" ] }, "execution_count": 2, @@ -117,12 +171,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'CPU': 2.0,\n", - " 'GPU': 1.0,\n", - " 'accelerator_type:T4': 1.0,\n", - " 'memory': 7855477556.0,\n", - " 'node:172.28.0.2': 1.0,\n", - " 'object_store_memory': 3927738777.0}\n" + "{'CPU': 208.0,\n", + " 'GPU': 16.0,\n", + " 'accelerator_type:T4': 4.0,\n", + " 'memory': 616693614180.0,\n", + " 'node:172.31.76.237': 1.0,\n", + " 'node:172.31.80.117': 1.0,\n", + " 'node:172.31.85.193': 1.0,\n", + " 'node:172.31.85.32': 1.0,\n", + " 'node:172.31.90.137': 1.0,\n", + " 'object_store_memory': 259318055729.0}\n" ] } ], @@ -232,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -241,120 +299,7 @@ "id": "MwhAeEOuhYbV", "outputId": "3aff8c73-d6eb-4784-890a-a419403b5bda" }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bf499d18407642489b7f5acb9dc88ca8", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading builder script: 0%| | 0.00/7.78k [00:00" + "### Preprocessing the data with Ray AIR " ] }, { @@ -437,64 +382,7 @@ "id": "eXNLu_-nIrJI", "outputId": "f545a7a5-f341-4315-cd89-9942a657aa31" }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8afaa1d7c12a41db8ad9f37c4067bfd4", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading: 0%| | 0.00/28.0 [00:00Current time: 2022-05-12 18:35:14 (running for 00:03:48.08)
Memory usage on this node: 5.7/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.32 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/HuggingFaceTrainer_2022-05-12_18-31-26
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-08-25 10:14:09 (running for 00:04:06.45)
Memory usage on this node: 4.3/62.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/208 CPUs, 0/16 GPUs, 0.0/574.34 GiB heap, 0.0/241.51 GiB objects (0.0/4.0 accelerator_type:T4)
Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-08-25_10-10-02
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc iter total time (s) loss learning_rate epoch
Trial name status loc iter total time (s) loss learning_rate epoch
HuggingFaceTrainer_bb9dd_00000TERMINATED172.28.0.2:419 5 222.3910.1575 1.30841e-06 5
HuggingFaceTrainer_c1ff5_00000TERMINATED172.31.90.137:947 2 200.2170.3886 0 2


" ], @@ -854,294 +671,335 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(RayTrainWorker pid=455)\u001b[0m 2022-05-12 18:31:33,158\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n" + "(RayTrainWorker pid=1114, ip=172.31.90.137) 2022-08-25 10:10:44,617\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=4]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(RayTrainWorker pid=455)\u001b[0m Is CUDA available: True\n" + "(RayTrainWorker pid=1114, ip=172.31.90.137) Is CUDA available: True\n", + "(RayTrainWorker pid=1116, ip=172.31.90.137) Is CUDA available: True\n", + "(RayTrainWorker pid=1117, ip=172.31.90.137) Is CUDA available: True\n", + "(RayTrainWorker pid=1115, ip=172.31.90.137) Is CUDA available: True\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Downloading builder script: 5.76kB [00:00, 6.35MB/s] \n", - "Downloading: 0%| | 0.00/256M [00:00, error=None)" + "Result(metrics={'loss': 0.3886, 'learning_rate': 0.0, 'epoch': 2.0, 'step': 1070, 'eval_loss': 0.6215357184410095, 'eval_matthews_correlation': 0.42957017514952434, 'eval_runtime': 0.9956, 'eval_samples_per_second': 273.204, 'eval_steps_per_second': 5.022, 'train_runtime': 174.4696, 'train_samples_per_second': 98.023, 'train_steps_per_second': 6.133, 'train_loss': 0.4661755713346963, '_timestamp': 1661447637, '_time_this_iter_s': 96.96447467803955, '_training_iteration': 2, 'should_checkpoint': True, 'done': True, 'trial_id': 'c1ff5_00000', 'experiment_tag': '0'}, error=None, log_dir=PosixPath('/home/ray/ray_results/HuggingFaceTrainer_2022-08-25_10-10-02/HuggingFaceTrainer_c1ff5_00000_0_2022-08-25_10-10-04'))" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1184,6 +1042,996 @@ "result" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tune hyperparameters with Ray AIR " + ] + }, 
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we would like to tune any hyperparameters of the model, we can do so by simply passing our `HuggingFaceTrainer` into a `Tuner` and defining the search space.\n", + "\n", + "We can also take advantage of the advanced search algorithms and schedulers provided by Ray Tune. In this example, we will use an `ASHAScheduler` to aggressively terminate underperforming trials." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from ray import tune\n", + "from ray.tune import Tuner\n", + "from ray.tune.schedulers.async_hyperband import ASHAScheduler\n", + "\n", + "tune_epochs = 4\n", + "tuner = Tuner(\n", + " trainer,\n", + " param_space={\n", + " \"trainer_init_config\": {\n", + " \"learning_rate\": tune.grid_search([2e-5, 2e-4, 2e-3, 2e-2]),\n", + " \"epochs\": tune_epochs,\n", + " }\n", + " },\n", + " tune_config=tune.TuneConfig(\n", + " metric=\"eval_loss\",\n", + " mode=\"min\",\n", + " num_samples=1,\n", + " scheduler=ASHAScheduler(\n", + " max_t=tune_epochs,\n", + " )\n", + " ),\n", + " run_config=RunConfig(\n", + " checkpoint_config=CheckpointConfig(num_to_keep=1, checkpoint_score_attribute=\"eval_loss\", checkpoint_score_order=\"min\")\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "== Status ==
Current time: 2022-08-25 10:20:13 (running for 00:06:01.75)
Memory usage on this node: 4.4/62.0 GiB
Using AsyncHyperBand: num_stopped=4\n", + "Bracket: Iter 4.000: -0.8064090609550476 | Iter 1.000: -0.6378736793994904
Resources requested: 0/208 CPUs, 0/16 GPUs, 0.0/574.34 GiB heap, 0.0/241.51 GiB objects (0.0/4.0 accelerator_type:T4)
Current best trial: 5654d_00001 with eval_loss=0.6492420434951782 and parameters={'trainer_init_config': {'learning_rate': 0.0002, 'epochs': 4}}
Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-08-25_10-14-11
Number of trials: 4/4 (4 TERMINATED)
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc trainer_init_conf... iter total time (s) loss learning_rate epoch
HuggingFaceTrainer_5654d_00000TERMINATED172.31.90.137:1729 2e-05 4 347.171 0.1958 0 4
HuggingFaceTrainer_5654d_00001TERMINATED172.31.76.237:1805 0.0002 1 95.24920.6225 0.00015 1
HuggingFaceTrainer_5654d_00002TERMINATED172.31.85.32:1322 0.002 1 93.76130.6463 0.0015 1
HuggingFaceTrainer_5654d_00003TERMINATED172.31.85.193:1060 0.02 1 99.36770.926 0.015 1


" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=1789, ip=172.31.90.137) 2022-08-25 10:14:23,379\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=4]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=1792, ip=172.31.90.137) Is CUDA available: True\n", + "(RayTrainWorker pid=1790, ip=172.31.90.137) Is CUDA available: True\n", + "(RayTrainWorker pid=1791, ip=172.31.90.137) Is CUDA available: True\n", + "(RayTrainWorker pid=1789, ip=172.31.90.137) Is CUDA available: True\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=1974, ip=172.31.76.237) 2022-08-25 10:14:29,354\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=4]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=1977, ip=172.31.76.237) Is CUDA available: True\n", + "(RayTrainWorker pid=1976, ip=172.31.76.237) Is CUDA available: True\n", + "(RayTrainWorker pid=1975, ip=172.31.76.237) Is CUDA available: True\n", + "(RayTrainWorker pid=1974, ip=172.31.76.237) Is CUDA available: True\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=1483, ip=172.31.85.32) 2022-08-25 10:14:35,313\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=4]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=1790, ip=172.31.90.137) Starting training\n", + "(RayTrainWorker pid=1792, ip=172.31.90.137) Starting training\n", + "(RayTrainWorker pid=1791, ip=172.31.90.137) Starting training\n", + "(RayTrainWorker pid=1789, ip=172.31.90.137) Starting training\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=1789, ip=172.31.90.137) ***** Running training 
*****\n", + "(RayTrainWorker pid=1789, ip=172.31.90.137) Num examples = 8551\n", + "(RayTrainWorker pid=1789, ip=172.31.90.137) Num Epochs = 4\n", + "(RayTrainWorker pid=1789, ip=172.31.90.137) Instantaneous batch size per device = 16\n", + "(RayTrainWorker pid=1789, ip=172.31.90.137) Total train batch size (w. parallel, distributed & accumulation) = 64\n", + "(RayTrainWorker pid=1789, ip=172.31.90.137) Gradient Accumulation steps = 1\n", + "(RayTrainWorker pid=1789, ip=172.31.90.137) Total optimization steps = 2140\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=1483, ip=172.31.85.32) Is CUDA available: True\n", + "(RayTrainWorker pid=1485, ip=172.31.85.32) Is CUDA available: True\n", + "(RayTrainWorker pid=1486, ip=172.31.85.32) Is CUDA available: True\n", + "(RayTrainWorker pid=1484, ip=172.31.85.32) Is CUDA available: True\n", + "(RayTrainWorker pid=1977, ip=172.31.76.237) Starting training\n", + "(RayTrainWorker pid=1976, ip=172.31.76.237) Starting training\n", + "(RayTrainWorker pid=1975, ip=172.31.76.237) Starting training\n", + "(RayTrainWorker pid=1974, ip=172.31.76.237) Starting training\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=1974, ip=172.31.76.237) ***** Running training *****\n", + "(RayTrainWorker pid=1974, ip=172.31.76.237) Num examples = 8551\n", + "(RayTrainWorker pid=1974, ip=172.31.76.237) Num Epochs = 4\n", + "(RayTrainWorker pid=1974, ip=172.31.76.237) Instantaneous batch size per device = 16\n", + "(RayTrainWorker pid=1974, ip=172.31.76.237) Total train batch size (w. 
parallel, distributed & accumulation) = 64\n", + "(RayTrainWorker pid=1974, ip=172.31.76.237) Gradient Accumulation steps = 1\n", + "(RayTrainWorker pid=1974, ip=172.31.76.237) Total optimization steps = 2140\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=1483, ip=172.31.85.32) Starting training\n", + "(RayTrainWorker pid=1485, ip=172.31.85.32) Starting training\n", + "(RayTrainWorker pid=1486, ip=172.31.85.32) Starting training\n", + "(RayTrainWorker pid=1484, ip=172.31.85.32) Starting training\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=1483, ip=172.31.85.32) ***** Running training *****\n", + "(RayTrainWorker pid=1483, ip=172.31.85.32) Num examples = 8551\n", + "(RayTrainWorker pid=1483, ip=172.31.85.32) Num Epochs = 4\n", + "(RayTrainWorker pid=1483, ip=172.31.85.32) Instantaneous batch size per device = 16\n", + "(RayTrainWorker pid=1483, ip=172.31.85.32) Total train batch size (w. 
parallel, distributed & accumulation) = 64\n", + "(RayTrainWorker pid=1483, ip=172.31.85.32) Gradient Accumulation steps = 1\n", + "(RayTrainWorker pid=1483, ip=172.31.85.32) Total optimization steps = 2140\n", + "(RayTrainWorker pid=1223, ip=172.31.85.193) 2022-08-25 10:14:48,193\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=4]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=1223, ip=172.31.85.193) Is CUDA available: True\n", + "(RayTrainWorker pid=1224, ip=172.31.85.193) Is CUDA available: True\n", + "(RayTrainWorker pid=1226, ip=172.31.85.193) Is CUDA available: True\n", + "(RayTrainWorker pid=1225, ip=172.31.85.193) Is CUDA available: True\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading builder script: 5.76kB [00:00, 6.59MB/s] \n", + "Downloading builder script: 5.76kB [00:00, 6.52MB/s] \n", + "Downloading builder script: 5.76kB [00:00, 6.07MB/s] \n", + "Downloading builder script: 5.76kB [00:00, 6.81MB/s] \n", + "Downloading tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 46.0kB/s]\n", + "Downloading config.json: 100%|██████████| 483/483 [00:00<00:00, 766kB/s]\n", + "Downloading vocab.txt: 0%| | 0.00/226k [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
losslearning_rateepochstepeval_losseval_matthews_correlationeval_runtimeeval_samples_per_secondeval_steps_per_second_timestamp...pidhostnamenode_iptime_since_restoretimesteps_since_restoreiterations_since_restorewarmup_timeconfig/trainer_init_config/epochsconfig/trainer_init_config/learning_ratelogdir
10.62250.000151.05350.6492420.0000001.0157267.7924.9231661447759...1805ip-172-31-76-237172.31.76.23795.249164010.00366140.00020/home/ray/ray_results/HuggingFaceTrainer_2022-...
30.92600.015001.05350.6529430.0000000.9428288.5105.3031661447782...1060ip-172-31-85-193172.31.85.19399.367746010.00413340.02000/home/ray/ray_results/HuggingFaceTrainer_2022-...
20.64630.001501.05350.6586530.0000000.9576284.0505.2221661447764...1322ip-172-31-85-32172.31.85.3293.761317010.00453340.00200/home/ray/ray_results/HuggingFaceTrainer_2022-...
00.19580.000004.021400.8064090.5322861.0006271.8274.9971661448005...1729ip-172-31-90-137172.31.90.137347.170584040.00370240.00002/home/ray/ray_results/HuggingFaceTrainer_2022-...
\n", + "

4 rows × 33 columns

\n", + "" + ], + "text/plain": [ + " loss learning_rate epoch step eval_loss eval_matthews_correlation \\\n", + "1 0.6225 0.00015 1.0 535 0.649242 0.000000 \n", + "3 0.9260 0.01500 1.0 535 0.652943 0.000000 \n", + "2 0.6463 0.00150 1.0 535 0.658653 0.000000 \n", + "0 0.1958 0.00000 4.0 2140 0.806409 0.532286 \n", + "\n", + " eval_runtime eval_samples_per_second eval_steps_per_second _timestamp \\\n", + "1 1.0157 267.792 4.923 1661447759 \n", + "3 0.9428 288.510 5.303 1661447782 \n", + "2 0.9576 284.050 5.222 1661447764 \n", + "0 1.0006 271.827 4.997 1661448005 \n", + "\n", + " ... pid hostname node_ip time_since_restore \\\n", + "1 ... 1805 ip-172-31-76-237 172.31.76.237 95.249164 \n", + "3 ... 1060 ip-172-31-85-193 172.31.85.193 99.367746 \n", + "2 ... 1322 ip-172-31-85-32 172.31.85.32 93.761317 \n", + "0 ... 1729 ip-172-31-90-137 172.31.90.137 347.170584 \n", + "\n", + " timesteps_since_restore iterations_since_restore warmup_time \\\n", + "1 0 1 0.003661 \n", + "3 0 1 0.004133 \n", + "2 0 1 0.004533 \n", + "0 0 4 0.003702 \n", + "\n", + " config/trainer_init_config/epochs config/trainer_init_config/learning_rate \\\n", + "1 4 0.00020 \n", + "3 4 0.02000 \n", + "2 4 0.00200 \n", + "0 4 0.00002 \n", + "\n", + " logdir \n", + "1 /home/ray/ray_results/HuggingFaceTrainer_2022-... \n", + "3 /home/ray/ray_results/HuggingFaceTrainer_2022-... \n", + "2 /home/ray/ray_results/HuggingFaceTrainer_2022-... \n", + "0 /home/ray/ray_results/HuggingFaceTrainer_2022-... 
\n", + "\n", + "[4 rows x 33 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tune_results.get_dataframe().sort_values(\"eval_loss\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "best_result = tune_results.get_best_result()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1202,7 +2050,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1216,156 +2064,36 @@ "name": "stderr", "output_type": "stream", "text": [ - "Map Progress (2 actors 1 pending): 0%| | 0/1 [00:12\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labelscore
0LABEL_10.998539
1LABEL_10.997706
2LABEL_10.998476
3LABEL_10.998498
4LABEL_00.533578
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - " \n", - " " - ], - "text/plain": [ - " label score\n", - "0 LABEL_1 0.998539\n", - "1 LABEL_1 0.997706\n", - "2 LABEL_1 0.998476\n", - "3 LABEL_1 0.998498\n", - "4 LABEL_0 0.533578" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "{'label': 'LABEL_1', 'score': 0.6822417974472046}\n", + "{'label': 'LABEL_1', 'score': 0.6822402477264404}\n", + "{'label': 'LABEL_1', 'score': 0.6822407841682434}\n", + "{'label': 'LABEL_1', 'score': 0.6822386980056763}\n", + "{'label': 'LABEL_1', 'score': 0.6822428107261658}\n", + "{'label': 'LABEL_1', 'score': 0.6822453737258911}\n", + "{'label': 'LABEL_1', 'score': 0.6822437047958374}\n", + "{'label': 'LABEL_1', 'score': 0.6822428703308105}\n", + "{'label': 'LABEL_1', 'score': 0.6822431683540344}\n", + "{'label': 'LABEL_1', 'score': 0.6822426915168762}\n", + "{'label': 'LABEL_1', 'score': 0.6822447776794434}\n", + "{'label': 'LABEL_1', 'score': 0.6822456121444702}\n", + "{'label': 'LABEL_1', 'score': 0.6822471022605896}\n", + "{'label': 'LABEL_1', 'score': 0.6822477579116821}\n", + "{'label': 'LABEL_1', 'score': 0.682244598865509}\n", + "{'label': 'LABEL_1', 'score': 0.6822422742843628}\n", + "{'label': 'LABEL_1', 'score': 0.6822470426559448}\n", + "{'label': 'LABEL_1', 'score': 0.6822417378425598}\n", + "{'label': 'LABEL_1', 'score': 0.6822449564933777}\n", + "{'label': 'LABEL_1', 'score': 0.682239294052124}\n" + ] } ], "source": [ @@ -1373,18 +2101,13 @@ "from ray.train.batch_predictor import BatchPredictor\n", "import pandas as pd\n", "\n", - "sentences = ['Bill whistled past the house.',\n", - " 'The car honked its way down the road.',\n", - " 'Bill pushed Harry off the sofa.',\n", - " 'the kittens yawned awake and played.',\n", - " 'I demand that the more John eats, the more he pay.']\n", "predictor = BatchPredictor.from_checkpoint(\n", - " checkpoint=result.checkpoint,\n", + " checkpoint=best_result.checkpoint,\n", " 
predictor_cls=HuggingFacePredictor,\n", " task=\"text-classification\",\n", + " device=0 if use_gpu else -1, # -1 is CPU, otherwise device index\n", ")\n", - "data = ray.data.from_pandas(pd.DataFrame(sentences, columns=[\"sentence\"]))\n", - "prediction = predictor.predict(data)\n", + "prediction = predictor.predict(ray_datasets[\"test\"].map_batches(lambda x: x[[\"sentence\"]]), num_gpus_per_worker=int(use_gpu))\n", "prediction.show()" ] }, @@ -1532,7 +2255,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3.9.12 ('.venv': venv)", + "display_name": "Python 3.8.10 ('venv': venv)", "language": "python", "name": "python3" }, @@ -1546,11 +2269,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.8.10" }, "vscode": { "interpreter": { - "hash": "a658351b4133f922c5967ed6133cfc05c9f16c53a5161e5843ace3f528fccaf5" + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" } } },