diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml
index 7a169fbd4..1228cfbc1 100644
--- a/.buildkite/pipeline.ml.yml
+++ b/.buildkite/pipeline.ml.yml
@@ -6,9 +6,7 @@
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-gpu,-needs_credentials python/ray/ml/...
# Only setup credentials in branch builds
- if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then exit 0; fi
- - python ./ci/env/setup_credentials.py wandb comet_ml
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-gpu,needs_credentials python/ray/ml/...
- - python ./ci/env/cleanup_test_state.py wandb comet_ml
- label: ":brain: RLlib: Learning discr. actions TF2-static-graph"
conditions: ["RAY_CI_RLLIB_AFFECTED"]
@@ -385,4 +383,6 @@
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- DOC_TESTING=1 PYTHON=3.7 ./ci/env/install-dependencies.sh
+ - python ./ci/env/setup_credentials.py wandb comet_ml
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_air,-gpu,-py37,-post_wheel_build doc/...
+ - python ./ci/env/cleanup_test_state.py wandb comet_ml
diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml
index 225e33d3a..887c8ad35 100644
--- a/doc/source/_toc.yml
+++ b/doc/source/_toc.yml
@@ -184,6 +184,8 @@ parts:
- file: ray-air/examples/analyze_tuning_results
title: "Analyze hyperparameter tuning results"
- file: ray-air/examples/rl_serving_example
+ - file: ray-air/examples/upload_to_comet_ml
+ - file: ray-air/examples/upload_to_wandb
- file: ray-air/examples/lightgbm_example
- file: ray-air/examples/rl_online_example
- file: ray-air/examples/rl_offline_example
diff --git a/doc/source/ray-air/examples/index.rst b/doc/source/ray-air/examples/index.rst
index 2e1296e0b..4f34095cc 100644
--- a/doc/source/ray-air/examples/index.rst
+++ b/doc/source/ray-air/examples/index.rst
@@ -11,6 +11,8 @@ Guides
- :doc:`/ray-air/examples/analyze_tuning_results`: How to analyze trial results (e.g. find the best trial) of a hyperparameter tuning run.
- :doc:`/ray-air/examples/rl_serving_example`
+- :doc:`/ray-air/examples/upload_to_comet_ml`
+- :doc:`/ray-air/examples/upload_to_wandb`
Trainers
diff --git a/doc/source/ray-air/examples/upload_to_comet_ml.ipynb b/doc/source/ray-air/examples/upload_to_comet_ml.ipynb
new file mode 100644
index 000000000..1b1fd4e9e
--- /dev/null
+++ b/doc/source/ray-air/examples/upload_to_comet_ml.ipynb
@@ -0,0 +1,409 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "98d7c620",
+ "metadata": {},
+ "source": [
+ "# Logging results and uploading models to Comet ML\n",
+ "In this example, we train a simple XGBoost model and log the training\n",
+ "results to Comet ML. We also save the resulting model checkpoints\n",
+ "as artifacts."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c6e66577",
+ "metadata": {},
+ "source": [
+ "Let's start with installing our dependencies:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "6d6297ef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install -qU \"ray[tune]\" sklearn xgboost_ray comet_ml"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c2e21446",
+ "metadata": {},
+ "source": [
+ "Then we need some imports:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "dffff484",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import ray\n",
+ "\n",
+ "from ray.ml import RunConfig\n",
+ "from ray.ml.result import Result\n",
+ "from ray.ml.train.integrations.xgboost import XGBoostTrainer\n",
+ "from ray.tune.integration.comet import CometLoggerCallback\n",
+ "from sklearn.datasets import load_breast_cancer"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "29fcd93b",
+ "metadata": {},
+ "source": [
+ "We define a simple function that returns our training dataset as a Ray Dataset:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "cf830706",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_train_dataset() -> ray.data.Dataset:\n",
+ " \"\"\"Return the \"Breast cancer\" dataset as a Ray dataset.\"\"\"\n",
+ " data_raw = load_breast_cancer(as_frame=True)\n",
+ " df = data_raw[\"data\"]\n",
+ " df[\"target\"] = data_raw[\"target\"]\n",
+ " return ray.data.from_pandas(df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0f48f948",
+ "metadata": {},
+ "source": [
+ "Now we define a simple training function. All the magic happens within the `CometLoggerCallback`:\n",
+ "\n",
+ "```python\n",
+ "CometLoggerCallback(\n",
+ " project_name=comet_project,\n",
+ " save_checkpoints=True,\n",
+ ")\n",
+ "```\n",
+ "\n",
+ "It will automatically log all results to Comet ML and upload the checkpoints as artifacts. It assumes you're logged in to Comet via an API key or your `~/.comet.config`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "230f23a3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def train_model(train_dataset: ray.data.Dataset, comet_project: str) -> Result:\n",
+ " \"\"\"Train a simple XGBoost model and return the result.\"\"\"\n",
+ " trainer = XGBoostTrainer(\n",
+ " scaling_config={\"num_workers\": 2},\n",
+ " params={\"tree_method\": \"auto\"},\n",
+ " label_column=\"target\",\n",
+ " datasets={\"train\": train_dataset},\n",
+ " num_boost_round=10,\n",
+ " run_config=RunConfig(\n",
+ " callbacks=[\n",
+ " # This is the part needed to enable logging to Comet ML.\n",
+ "            # It assumes Comet ML can find a valid API key (e.g. by setting\n",
+ " # the ``COMET_API_KEY`` environment variable).\n",
+ " CometLoggerCallback(\n",
+ " project_name=comet_project,\n",
+ " save_checkpoints=True,\n",
+ " )\n",
+ " ]\n",
+ " ),\n",
+ " )\n",
+ " result = trainer.fit()\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "711b1d7d",
+ "metadata": {},
+ "source": [
+ "Let's kick off a run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "9bfd9a8d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2022-05-19 15:19:17,237\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "== Status ==
Current time: 2022-05-19 15:19:35 (running for 00:00:14.95)
Memory usage on this node: 10.2/16.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/5.12 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-19-19
Number of trials: 1/1 (1 TERMINATED)
\n",
+ "\n",
+ "Trial name | status | loc | iter | total time (s) | train-rmse |
\n",
+ "\n",
+ "\n",
+ "XGBoostTrainer_ac544_00000 | TERMINATED | 127.0.0.1:19852 | 10 | 9.7203 | 0.030717 |
\n",
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "COMET WARNING: As you are running in a Jupyter environment, you will need to call `experiment.end()` when finished to ensure all metrics and code are logged before exiting.\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:21,584\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=16 --runtime-env-hash=-2010331134\n",
+ "COMET INFO: Experiment is live on comet.ml https://www.comet.ml/krfricke/ray-air-example/ecd3726ca127497ba7386003a249fad6\n",
+ "\n",
+ "COMET WARNING: Failed to add tag(s) None to the experiment\n",
+ "\n",
+ "COMET WARNING: Empty mapping given to log_params({}); ignoring\n",
+ "\u001b[2m\u001b[36m(GBDTTrainable pid=19852)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:24,628\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=17 --runtime-env-hash=-2010331069\n",
+ "\u001b[2m\u001b[36m(GBDTTrainable pid=19852)\u001b[0m 2022-05-19 15:19:25,961\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:26,830\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=18 --runtime-env-hash=-2010331069\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:26,918\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=20 --runtime-env-hash=-2010331134\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:26,922\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=21 --runtime-env-hash=-2010331134\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:26,922\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=22 --runtime-env-hash=-2010331134\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:26,923\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=19 --runtime-env-hash=-2010331134\n",
+ "\u001b[2m\u001b[36m(GBDTTrainable pid=19852)\u001b[0m 2022-05-19 15:19:29,272\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n",
+ "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=19876)\u001b[0m [15:19:29] task [xgboost.ray]:4505889744 got new rank 1\n",
+ "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=19875)\u001b[0m [15:19:29] task [xgboost.ray]:6941849424 got new rank 0\n",
+ "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n",
+ "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n",
+ "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 1.0.0 created\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Result for XGBoostTrainer_ac544_00000:\n",
+ " date: 2022-05-19_15-19-30\n",
+ " done: false\n",
+ " experiment_id: d3007bd6a2734b328fd90385485c5a8d\n",
+ " hostname: Kais-MacBook-Pro.local\n",
+ " iterations_since_restore: 1\n",
+ " node_ip: 127.0.0.1\n",
+ " pid: 19852\n",
+ " should_checkpoint: true\n",
+ " time_since_restore: 6.529659032821655\n",
+ " time_this_iter_s: 6.529659032821655\n",
+ " time_total_s: 6.529659032821655\n",
+ " timestamp: 1652969970\n",
+ " timesteps_since_restore: 0\n",
+ " train-rmse: 0.357284\n",
+ " training_iteration: 1\n",
+ " trial_id: ac544_00000\n",
+ " warmup_time: 0.003961086273193359\n",
+ " \n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "COMET INFO: Scheduling the upload of 3 assets for a size of 2.48 KB, this can take some time\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:1.0.0' has started uploading asynchronously\n",
+ "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n",
+ "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n",
+ "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 2.0.0 created (previous was: 1.0.0)\n",
+ "COMET INFO: Scheduling the upload of 3 assets for a size of 3.86 KB, this can take some time\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:2.0.0' has started uploading asynchronously\n",
+ "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n",
+ "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n",
+ "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 3.0.0 created (previous was: 2.0.0)\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:1.0.0' has been fully uploaded successfully\n",
+ "COMET INFO: Scheduling the upload of 3 assets for a size of 5.31 KB, this can take some time\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:3.0.0' has started uploading asynchronously\n",
+ "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n",
+ "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n",
+ "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 4.0.0 created (previous was: 3.0.0)\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:2.0.0' has been fully uploaded successfully\n",
+ "COMET INFO: Scheduling the upload of 3 assets for a size of 6.76 KB, this can take some time\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:4.0.0' has started uploading asynchronously\n",
+ "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n",
+ "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n",
+ "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 5.0.0 created (previous was: 4.0.0)\n",
+ "COMET INFO: Scheduling the upload of 3 assets for a size of 8.21 KB, this can take some time\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:3.0.0' has been fully uploaded successfully\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:5.0.0' has started uploading asynchronously\n",
+ "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n",
+ "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:4.0.0' has been fully uploaded successfully\n",
+ "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 6.0.0 created (previous was: 5.0.0)\n",
+ "COMET INFO: Scheduling the upload of 3 assets for a size of 9.87 KB, this can take some time\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:6.0.0' has started uploading asynchronously\n",
+ "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n",
+ "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:5.0.0' has been fully uploaded successfully\n",
+ "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 7.0.0 created (previous was: 6.0.0)\n",
+ "COMET INFO: Scheduling the upload of 3 assets for a size of 11.46 KB, this can take some time\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:7.0.0' has started uploading asynchronously\n",
+ "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n",
+ "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:6.0.0' has been fully uploaded successfully\n",
+ "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 8.0.0 created (previous was: 7.0.0)\n",
+ "COMET INFO: Scheduling the upload of 3 assets for a size of 12.84 KB, this can take some time\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:8.0.0' has started uploading asynchronously\n",
+ "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n",
+ "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:7.0.0' has been fully uploaded successfully\n",
+ "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 9.0.0 created (previous was: 8.0.0)\n",
+ "COMET INFO: Scheduling the upload of 3 assets for a size of 14.36 KB, this can take some time\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:9.0.0' has started uploading asynchronously\n",
+ "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n",
+ "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:8.0.0' has been fully uploaded successfully\n",
+ "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 10.0.0 created (previous was: 9.0.0)\n",
+ "COMET INFO: Scheduling the upload of 3 assets for a size of 16.37 KB, this can take some time\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:10.0.0' has started uploading asynchronously\n",
+ "\u001b[2m\u001b[36m(GBDTTrainable pid=19852)\u001b[0m 2022-05-19 15:19:33,890\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=569 in 7.96 seconds (4.61 pure XGBoost training time).\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:9.0.0' has been fully uploaded successfully\n",
+ "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 11.0.0 created (previous was: 10.0.0)\n",
+ "COMET INFO: Scheduling the upload of 3 assets for a size of 16.39 KB, this can take some time\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:11.0.0' has started uploading asynchronously\n",
+ "COMET INFO: ---------------------------\n",
+ "COMET INFO: Comet.ml Experiment Summary\n",
+ "COMET INFO: ---------------------------\n",
+ "COMET INFO: Data:\n",
+ "COMET INFO: display_summary_level : 1\n",
+ "COMET INFO: url : https://www.comet.ml/krfricke/ray-air-example/ecd3726ca127497ba7386003a249fad6\n",
+ "COMET INFO: Metrics [count] (min, max):\n",
+ "COMET INFO: iterations_since_restore [10] : (1, 10)\n",
+ "COMET INFO: time_since_restore [10] : (6.529659032821655, 9.720295906066895)\n",
+ "COMET INFO: time_this_iter_s [10] : (0.3124058246612549, 6.529659032821655)\n",
+ "COMET INFO: time_total_s [10] : (6.529659032821655, 9.720295906066895)\n",
+ "COMET INFO: timestamp [10] : (1652969970, 1652969973)\n",
+ "COMET INFO: timesteps_since_restore : 0\n",
+ "COMET INFO: train-rmse [10] : (0.030717, 0.357284)\n",
+ "COMET INFO: training_iteration [10] : (1, 10)\n",
+ "COMET INFO: warmup_time : 0.003961086273193359\n",
+ "COMET INFO: Others:\n",
+ "COMET INFO: Created from : Ray\n",
+ "COMET INFO: Name : XGBoostTrainer_ac544_00000\n",
+ "COMET INFO: experiment_id : d3007bd6a2734b328fd90385485c5a8d\n",
+ "COMET INFO: trial_id : ac544_00000\n",
+ "COMET INFO: System Information:\n",
+ "COMET INFO: date : 2022-05-19_15-19-33\n",
+ "COMET INFO: hostname : Kais-MacBook-Pro.local\n",
+ "COMET INFO: node_ip : 127.0.0.1\n",
+ "COMET INFO: pid : 19852\n",
+ "COMET INFO: Uploads:\n",
+ "COMET INFO: artifact assets : 33 (107.92 KB)\n",
+ "COMET INFO: artifacts : 11\n",
+ "COMET INFO: environment details : 1\n",
+ "COMET INFO: filename : 1\n",
+ "COMET INFO: installed packages : 1\n",
+ "COMET INFO: notebook : 1\n",
+ "COMET INFO: source_code : 1\n",
+ "COMET INFO: ---------------------------\n",
+ "COMET INFO: Uploading metrics, params, and assets to Comet before program termination (may take several seconds)\n",
+ "COMET INFO: The Python SDK has 3600 seconds to finish before aborting...\n",
+ "COMET INFO: Waiting for completion of the file uploads (may take several seconds)\n",
+ "COMET INFO: The Python SDK has 10800 seconds to finish before aborting...\n",
+ "COMET INFO: Still uploading 6 file(s), remaining 21.05 KB/116.69 KB\n",
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:10.0.0' has been fully uploaded successfully\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:11.0.0' has been fully uploaded successfully\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Result for XGBoostTrainer_ac544_00000:\n",
+ " date: 2022-05-19_15-19-33\n",
+ " done: true\n",
+ " experiment_id: d3007bd6a2734b328fd90385485c5a8d\n",
+ " experiment_tag: '0'\n",
+ " hostname: Kais-MacBook-Pro.local\n",
+ " iterations_since_restore: 10\n",
+ " node_ip: 127.0.0.1\n",
+ " pid: 19852\n",
+ " should_checkpoint: true\n",
+ " time_since_restore: 9.720295906066895\n",
+ " time_this_iter_s: 0.39761900901794434\n",
+ " time_total_s: 9.720295906066895\n",
+ " timestamp: 1652969973\n",
+ " timesteps_since_restore: 0\n",
+ " train-rmse: 0.030717\n",
+ " training_iteration: 10\n",
+ " trial_id: ac544_00000\n",
+ " warmup_time: 0.003961086273193359\n",
+ " \n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2022-05-19 15:19:35,621\tINFO tune.py:753 -- Total run time: 15.75 seconds (14.94 seconds for the tuning loop).\n"
+ ]
+ }
+ ],
+ "source": [
+ "comet_project = \"ray_air_example\"\n",
+ "\n",
+ "train_dataset = get_train_dataset()\n",
+ "result = train_model(train_dataset=train_dataset, comet_project=comet_project)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "be28bdd3",
+ "metadata": {},
+ "source": [
+ "Check out your [Comet ML](https://www.comet.ml/) project to see the results!"
+ ]
+ }
+ ],
+ "metadata": {
+ "jupytext": {
+ "cell_metadata_filter": "-all",
+ "main_language": "python",
+ "notebook_metadata_filter": "-all"
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/doc/source/ray-air/examples/upload_to_wandb.ipynb b/doc/source/ray-air/examples/upload_to_wandb.ipynb
new file mode 100644
index 000000000..48e8769d1
--- /dev/null
+++ b/doc/source/ray-air/examples/upload_to_wandb.ipynb
@@ -0,0 +1,414 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "f37e8a9f",
+ "metadata": {},
+ "source": [
+ "# Logging results and uploading models to Weights & Biases\n",
+ "In this example, we train a simple XGBoost model and log the training\n",
+ "results to Weights & Biases. We also save the resulting model checkpoints\n",
+ "as artifacts."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "27d04c97",
+ "metadata": {},
+ "source": [
+ "Let's start with installing our dependencies:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "4e697e5d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install -qU \"ray[tune]\" sklearn xgboost_ray wandb"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3096e7c9",
+ "metadata": {},
+ "source": [
+ "Then we need some imports:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "9c286701",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import ray\n",
+ "\n",
+ "from ray.ml import RunConfig\n",
+ "from ray.ml.result import Result\n",
+ "from ray.ml.train.integrations.xgboost import XGBoostTrainer\n",
+ "from ray.tune.integration.wandb import WandbLoggerCallback\n",
+ "from sklearn.datasets import load_breast_cancer"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2efa1564",
+ "metadata": {},
+ "source": [
+ "We define a simple function that returns our training dataset as a Ray Dataset:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "a63ebd10",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_train_dataset() -> ray.data.Dataset:\n",
+ " \"\"\"Return the \"Breast cancer\" dataset as a Ray dataset.\"\"\"\n",
+ " data_raw = load_breast_cancer(as_frame=True)\n",
+ " df = data_raw[\"data\"]\n",
+ " df[\"target\"] = data_raw[\"target\"]\n",
+ " return ray.data.from_pandas(df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d07cf41f",
+ "metadata": {},
+ "source": [
+ "Now we define a simple training function. All the magic happens within the `WandbLoggerCallback`:\n",
+ "\n",
+ "```python\n",
+ "WandbLoggerCallback(\n",
+ " project=wandb_project,\n",
+ " save_checkpoints=True,\n",
+ ")\n",
+ "```\n",
+ "\n",
+ "It will automatically log all results to Weights & Biases and upload the checkpoints as artifacts. It assumes you're logged in to Wandb via an API key or `wandb login`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "52edfde0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def train_model(train_dataset: ray.data.Dataset, wandb_project: str) -> Result:\n",
+ " \"\"\"Train a simple XGBoost model and return the result.\"\"\"\n",
+ " trainer = XGBoostTrainer(\n",
+ " scaling_config={\"num_workers\": 2},\n",
+ " params={\"tree_method\": \"auto\"},\n",
+ " label_column=\"target\",\n",
+ " datasets={\"train\": train_dataset},\n",
+ " num_boost_round=10,\n",
+ " run_config=RunConfig(\n",
+ " callbacks=[\n",
+ " # This is the part needed to enable logging to Weights & Biases.\n",
+ " # It assumes you've logged in before, e.g. with `wandb login`.\n",
+ " WandbLoggerCallback(\n",
+ " project=wandb_project,\n",
+ " save_checkpoints=True,\n",
+ " )\n",
+ " ]\n",
+ " ),\n",
+ " )\n",
+ " result = trainer.fit()\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1959ce19",
+ "metadata": {},
+ "source": [
+ "Let's kick off a run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "64f80d6c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2022-05-19 15:22:11,956\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8266\u001b[39m\u001b[22m\n",
+ "2022-05-19 15:22:15,995\tINFO wandb.py:172 -- Already logged into W&B.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "== Status ==
Current time: 2022-05-19 15:22:42 (running for 00:00:26.61)
Memory usage on this node: 10.2/16.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/4.6 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-22-14
Number of trials: 1/1 (1 TERMINATED)
\n",
+ "\n",
+ "Trial name | status | loc | iter | total time (s) | train-rmse |
\n",
+ "\n",
+ "\n",
+ "XGBoostTrainer_14a73_00000 | TERMINATED | 127.0.0.1:20065 | 10 | 10.2724 | 0.030717 |
\n",
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:22:17,422\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61838 --object-store-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=63609 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:62933 --redis-password=5241590000000000 --startup-token=16 --runtime-env-hash=-2010331134\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mkaifricke\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
+ "\u001b[2m\u001b[36m(GBDTTrainable pid=20065)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:22:23,215\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61838 --object-store-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=63609 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:62933 --redis-password=5241590000000000 --startup-token=17 --runtime-env-hash=-2010331069\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Tracking run with wandb version 0.12.16"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Run data is saved locally in /Users/kai/coding/ray/doc/source/ray-air/examples/wandb/run-20220519_152218-14a73_00000
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Syncing run XGBoostTrainer_14a73_00000 to Weights & Biases (docs)
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(GBDTTrainable pid=20065)\u001b[0m 2022-05-19 15:22:24,711\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:22:26,090\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61838 --object-store-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=63609 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:62933 --redis-password=5241590000000000 --startup-token=18 --runtime-env-hash=-2010331069\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:22:26,234\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61838 --object-store-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=63609 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:62933 --redis-password=5241590000000000 --startup-token=19 --runtime-env-hash=-2010331134\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:22:26,236\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61838 --object-store-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=63609 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:62933 --redis-password=5241590000000000 --startup-token=20 --runtime-env-hash=-2010331134\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:22:26,239\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61838 --object-store-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=63609 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:62933 --redis-password=5241590000000000 --startup-token=21 --runtime-env-hash=-2010331134\n",
+ "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:22:26,263\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61838 --object-store-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-22-09_017478_19912/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=63609 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:62933 --redis-password=5241590000000000 --startup-token=22 --runtime-env-hash=-2010331134\n",
+ "\u001b[2m\u001b[36m(GBDTTrainable pid=20065)\u001b[0m 2022-05-19 15:22:29,260\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n",
+ "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=20130)\u001b[0m [15:22:29] task [xgboost.ray]:6859875216 got new rank 0\n",
+ "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=20131)\u001b[0m [15:22:29] task [xgboost.ray]:4625795280 got new rank 1\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (/Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-22-14/XGBoostTrainer_14a73_00000_0_2022-05-19_15-22-16/checkpoint_000000)... Done. 0.1s\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Result for XGBoostTrainer_14a73_00000:\n",
+ " date: 2022-05-19_15-22-31\n",
+ " done: false\n",
+ " experiment_id: 2d50bfe80d2a441e80f4ca05f7c3b607\n",
+ " hostname: Kais-MacBook-Pro.local\n",
+ " iterations_since_restore: 1\n",
+ " node_ip: 127.0.0.1\n",
+ " pid: 20065\n",
+ " should_checkpoint: true\n",
+ " time_since_restore: 10.080440044403076\n",
+ " time_this_iter_s: 10.080440044403076\n",
+ " time_total_s: 10.080440044403076\n",
+ " timestamp: 1652970151\n",
+ " timesteps_since_restore: 0\n",
+ " train-rmse: 0.357284\n",
+ " training_iteration: 1\n",
+ " trial_id: 14a73_00000\n",
+ " warmup_time: 0.006903171539306641\n",
+ " \n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (/Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-22-14/XGBoostTrainer_14a73_00000_0_2022-05-19_15-22-16/checkpoint_000001)... Done. 0.1s\n",
+ "\u001b[2m\u001b[36m(GBDTTrainable pid=20065)\u001b[0m 2022-05-19 15:22:32,051\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=569 in 7.37 seconds (2.79 pure XGBoost training time).\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (/Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-22-14/XGBoostTrainer_14a73_00000_0_2022-05-19_15-22-16/checkpoint_000002)... Done. 0.1s\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (/Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-22-14/XGBoostTrainer_14a73_00000_0_2022-05-19_15-22-16/checkpoint_000003)... Done. 0.1s\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (/Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-22-14/XGBoostTrainer_14a73_00000_0_2022-05-19_15-22-16/checkpoint_000004)... Done. 0.1s\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (/Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-22-14/XGBoostTrainer_14a73_00000_0_2022-05-19_15-22-16/checkpoint_000005)... Done. 0.1s\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (/Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-22-14/XGBoostTrainer_14a73_00000_0_2022-05-19_15-22-16/checkpoint_000006)... Done. 0.1s\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (/Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-22-14/XGBoostTrainer_14a73_00000_0_2022-05-19_15-22-16/checkpoint_000007)... Done. 0.1s\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (/Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-22-14/XGBoostTrainer_14a73_00000_0_2022-05-19_15-22-16/checkpoint_000008)... Done. 0.1s\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (/Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-22-14/XGBoostTrainer_14a73_00000_0_2022-05-19_15-22-16/checkpoint_000009)... Done. 0.1s\n",
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (/Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-22-14/XGBoostTrainer_14a73_00000_0_2022-05-19_15-22-16/checkpoint_000009)... Done. 0.1s\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Waiting for W&B process to finish... (success)."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Result for XGBoostTrainer_14a73_00000:\n",
+ " date: 2022-05-19_15-22-32\n",
+ " done: true\n",
+ " experiment_id: 2d50bfe80d2a441e80f4ca05f7c3b607\n",
+ " experiment_tag: '0'\n",
+ " hostname: Kais-MacBook-Pro.local\n",
+ " iterations_since_restore: 10\n",
+ " node_ip: 127.0.0.1\n",
+ " pid: 20065\n",
+ " should_checkpoint: true\n",
+ " time_since_restore: 10.272444248199463\n",
+ " time_this_iter_s: 0.023891210556030273\n",
+ " time_total_s: 10.272444248199463\n",
+ " timestamp: 1652970152\n",
+ " timesteps_since_restore: 0\n",
+ " train-rmse: 0.030717\n",
+ " training_iteration: 10\n",
+ " trial_id: 14a73_00000\n",
+ " warmup_time: 0.006903171539306641\n",
+ " \n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2022-05-19 15:22:42,727\tINFO tune.py:753 -- Total run time: 27.83 seconds (26.61 seconds for the tuning loop).\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(Label(value='0.090 MB of 0.090 MB uploaded (0.000 MB deduped)\\r'), FloatProgress(value=1.0, max…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "Run history:
iterations_since_restore | ▁▂▃▃▄▅▆▆▇█ |
time_since_restore | ▁▂▃▃▄▅▅▆▇█ |
time_this_iter_s | █▁▁▁▁▁▁▁▁▁ |
time_total_s | ▁▂▃▃▄▅▅▆▇█ |
timestamp | ▁▁▁▁▁▁▁▁██ |
timesteps_since_restore | ▁▁▁▁▁▁▁▁▁▁ |
train-rmse | █▆▄▃▂▂▂▁▁▁ |
training_iteration | ▁▂▃▃▄▅▆▆▇█ |
warmup_time | ▁▁▁▁▁▁▁▁▁▁ |
Run summary:
iterations_since_restore | 10 |
time_since_restore | 10.27244 |
time_this_iter_s | 0.02389 |
time_total_s | 10.27244 |
timestamp | 1652970152 |
timesteps_since_restore | 0 |
train-rmse | 0.03072 |
training_iteration | 10 |
warmup_time | 0.0069 |
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Synced XGBoostTrainer_14a73_00000: https://wandb.ai/kaifricke/ray_air_example/runs/14a73_00000
Synced 5 W&B file(s), 0 media file(s), 21 artifact file(s) and 0 other file(s)"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Find logs at: ./wandb/run-20220519_152218-14a73_00000/logs
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "wandb_project = \"ray_air_example\"\n",
+ "\n",
+ "train_dataset = get_train_dataset()\n",
+ "result = train_model(train_dataset=train_dataset, wandb_project=wandb_project)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "78701c42",
+ "metadata": {},
+ "source": [
+ "Check out your [WandB](https://wandb.ai/) project to see the results!"
+ ]
+ }
+ ],
+ "metadata": {
+ "jupytext": {
+ "cell_metadata_filter": "-all",
+ "main_language": "python",
+ "notebook_metadata_filter": "-all"
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/python/ray/ml/BUILD b/python/ray/ml/BUILD
index a46d1f9cd..2b2460568 100644
--- a/python/ray/ml/BUILD
+++ b/python/ray/ml/BUILD
@@ -147,22 +147,6 @@ py_test(
args = ["--smoke-test"]
)
-py_test(
- name = "upload_to_comet_ml",
- size = "medium",
- srcs = ["examples/upload_to_comet_ml.py"],
- tags = ["team:ml", "exclusive", "needs_credentials"],
- deps = [":ml_lib"]
-)
-
-py_test(
- name = "upload_to_wandb",
- size = "medium",
- srcs = ["examples/upload_to_wandb.py"],
- tags = ["team:ml", "exclusive", "needs_credentials"],
- deps = [":ml_lib"]
-)
-
# --------------------------------------------------------------------
# Tests from the python/ray/ml/tests directory.
# Covers all tests starting with `test_`.
diff --git a/python/ray/ml/examples/upload_to_comet_ml.ipynb b/python/ray/ml/examples/upload_to_comet_ml.ipynb
new file mode 120000
index 000000000..9c6606ee1
--- /dev/null
+++ b/python/ray/ml/examples/upload_to_comet_ml.ipynb
@@ -0,0 +1 @@
+../../../../doc/source/ray-air/examples/upload_to_comet_ml.ipynb
\ No newline at end of file
diff --git a/python/ray/ml/examples/upload_to_comet_ml.py b/python/ray/ml/examples/upload_to_comet_ml.py
deleted file mode 100644
index e78b89d6e..000000000
--- a/python/ray/ml/examples/upload_to_comet_ml.py
+++ /dev/null
@@ -1,50 +0,0 @@
-"""
-In this example, we train a simple XGBoost model and log the training
-results to Comet ML. We also save the resulting model checkpoints
-as artifacts.
-"""
-import ray
-
-from ray.ml import RunConfig
-from ray.ml.result import Result
-from ray.ml.train.integrations.xgboost import XGBoostTrainer
-from ray.tune.integration.comet import CometLoggerCallback
-from sklearn.datasets import load_breast_cancer
-
-
-def get_train_dataset() -> ray.data.Dataset:
- """Return the "Breast cancer" dataset as a Ray dataset."""
- data_raw = load_breast_cancer(as_frame=True)
- df = data_raw["data"]
- df["target"] = data_raw["target"]
- return ray.data.from_pandas(df)
-
-
-def train_model(train_dataset: ray.data.Dataset, comet_project: str) -> Result:
- """Train a simple XGBoost model and return the result."""
- trainer = XGBoostTrainer(
- scaling_config={"num_workers": 2},
- params={"tree_method": "auto"},
- label_column="target",
- datasets={"train": train_dataset},
- num_boost_round=10,
- run_config=RunConfig(
- callbacks=[
- # This is the part needed to enable logging to Comet ML.
- # It assumes Comet ML can find a valid API (e.g. by setting
- # the ``COMET_API_KEY`` environment variable).
- CometLoggerCallback(
- project_name=comet_project,
- save_checkpoints=True,
- )
- ]
- ),
- )
- result = trainer.fit()
- return result
-
-
-comet_project = "ray_air_example"
-
-train_dataset = get_train_dataset()
-result = train_model(train_dataset=train_dataset, comet_project=comet_project)
diff --git a/python/ray/ml/examples/upload_to_wandb.ipynb b/python/ray/ml/examples/upload_to_wandb.ipynb
new file mode 120000
index 000000000..e241f6fcb
--- /dev/null
+++ b/python/ray/ml/examples/upload_to_wandb.ipynb
@@ -0,0 +1 @@
+../../../../doc/source/ray-air/examples/upload_to_wandb.ipynb
\ No newline at end of file
diff --git a/python/ray/ml/examples/upload_to_wandb.py b/python/ray/ml/examples/upload_to_wandb.py
deleted file mode 100644
index 32e306a36..000000000
--- a/python/ray/ml/examples/upload_to_wandb.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""
-In this example, we train a simple XGBoost model and log the training
-results to Weights & Biases. We also save the resulting model checkpoints
-as artifacts.
-"""
-import ray
-
-from ray.ml import RunConfig
-from ray.ml.result import Result
-from ray.ml.train.integrations.xgboost import XGBoostTrainer
-from ray.tune.integration.wandb import WandbLoggerCallback
-from sklearn.datasets import load_breast_cancer
-
-
-def get_train_dataset() -> ray.data.Dataset:
- """Return the "Breast cancer" dataset as a Ray dataset."""
- data_raw = load_breast_cancer(as_frame=True)
- df = data_raw["data"]
- df["target"] = data_raw["target"]
- return ray.data.from_pandas(df)
-
-
-def train_model(train_dataset: ray.data.Dataset, wandb_project: str) -> Result:
- """Train a simple XGBoost model and return the result."""
- trainer = XGBoostTrainer(
- scaling_config={"num_workers": 2},
- params={"tree_method": "auto"},
- label_column="target",
- datasets={"train": train_dataset},
- num_boost_round=10,
- run_config=RunConfig(
- callbacks=[
- # This is the part needed to enable logging to Weights & Biases.
- # It assumes you've logged in before, e.g. with `wandb login`.
- WandbLoggerCallback(
- project=wandb_project,
- save_checkpoints=True,
- )
- ]
- ),
- )
- result = trainer.fit()
- return result
-
-
-wandb_project = "ray_air_example"
-
-train_dataset = get_train_dataset()
-result = train_model(train_dataset=train_dataset, wandb_project=wandb_project)