diff --git a/dashboard/client/src/pages/dashboard/Dashboard.tsx b/dashboard/client/src/pages/dashboard/Dashboard.tsx index 018bcad13..b3e631989 100644 --- a/dashboard/client/src/pages/dashboard/Dashboard.tsx +++ b/dashboard/client/src/pages/dashboard/Dashboard.tsx @@ -147,11 +147,11 @@ const Dashboard: React.FC = () => { run the following command: `ray disable-usage-stats` before starting the cluster. See{" "} - https://github.com/ray-project/ray/issues/20857 + https://docs.ray.io/en/master/cluster/usage-stats.html {" "} for more details. diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml index 911fe3eb2..6827c0e11 100644 --- a/doc/source/_toc.yml +++ b/doc/source/_toc.yml @@ -132,6 +132,7 @@ parts: - file: cluster/cloud - file: cluster/deploy - file: cluster/api + - file: cluster/usage-stats - caption: References chapters: diff --git a/doc/source/cluster/usage-stats-data-ref.rst b/doc/source/cluster/usage-stats-data-ref.rst new file mode 100644 index 000000000..f7d0bd949 --- /dev/null +++ b/doc/source/cluster/usage-stats-data-ref.rst @@ -0,0 +1,10 @@ +Usage Stats Data API +==================== + +.. _ray-usage-stats-data-ref: + +UsageStatsToReport +~~~~~~~~~~~~~~~~~~ + +.. autoclass:: ray._private.usage.usage_lib.UsageStatsToReport + :members: diff --git a/doc/source/cluster/usage-stats.rst b/doc/source/cluster/usage-stats.rst new file mode 100644 index 000000000..0a79bb2b2 --- /dev/null +++ b/doc/source/cluster/usage-stats.rst @@ -0,0 +1,84 @@ +.. _ref-usage-stats: + +Usage Stats Collection +====================== + +Starting in Ray 1.13, Ray collects usage stats data by default (guarded by an opt-out prompt). +This data will be used by the open-source Ray engineering team to better understand how to improve our libraries and core APIs, and how to prioritize bug fixes and enhancements. + +Here are the guiding principles of our collection policy: + +- **No surprises** — you will be notified before we begin collecting data. 
You will be notified of any changes to the data being collected or how it is used. +- **Easy opt-out** — you will be able to easily opt out of data collection. +- **Transparency** — you will be able to review all data that is sent to us. +- **Control** — you will have control over your data, and we will honor requests to delete your data. +- We will **not** collect any personally identifiable data or proprietary code/data. +- We will **not** sell data or buy data about you. + +You will always be able to :ref:`disable the usage stats collection <usage-disable>`. + +For more context, please refer to this `RFC <https://github.com/ray-project/ray/issues/20857>`_. + +What data is collected? +----------------------- + +We collect non-sensitive data that helps us understand how Ray is used (e.g., which Ray libraries are used). +**Personally identifiable data will never be collected.** Please check :ref:`UsageStatsToReport <ray-usage-stats-data-ref>` to see the data we collect. + +.. _usage-disable: + +How to disable it +----------------- +There are multiple ways to disable usage stats collection before starting a cluster: + +#. Add the ``--disable-usage-stats`` option to the command that starts the Ray cluster (e.g., ``ray start --head --disable-usage-stats`` :ref:`command `). + +#. Run :ref:`ray disable-usage-stats <ray-disable-usage-stats-doc>` to disable collection for all future clusters. This won't affect currently running clusters. Under the hood, this command writes ``{"usage_stats": false}`` to the global config file ``~/.ray/config.json``. + +#. Set the environment variable ``RAY_USAGE_STATS_ENABLED`` to 0 (e.g., ``RAY_USAGE_STATS_ENABLED=0 ray start --head`` :ref:`command `). + +Currently there is no way to enable or disable collection for a running cluster; you have to stop and restart the cluster. + + +How does it work? 
+----------------- + +When a Ray cluster is started via :ref:`ray start --head `, :ref:`ray up `, :ref:`ray submit --start ` or :ref:`ray exec --start `, +Ray will decide whether usage stats collection should be enabled or not by considering the following factors in order: + +#. It checks whether the environment variable ``RAY_USAGE_STATS_ENABLED`` is set: 1 means enabled and 0 means disabled. + +#. If the environment variable is not set, it reads the value of key ``usage_stats`` in the global config file ``~/.ray/config.json``: true means enabled and false means disabled. + +#. If neither is set and the console is interactive, then the user will be prompted to enable or disable the collection. If the console is non-interactive, usage stats collection will be enabled by default. The decision will be saved to ``~/.ray/config.json``, so the prompt is only shown once. + +Note: usage stats collection is not enabled when using local dev clusters started via ``ray.init()``. This means that Ray will never collect data from third-party library users not using Ray directly. + +If usage stats collection is enabled, a background process on the head node will collect the usage stats +and report to ``https://usage-stats.ray.io/`` every hour. The reported usage stats will also be saved to +``/tmp/ray/session_xxx/usage_stats.json`` on the head node for inspection. You can check the existence of this file to see if collection is enabled. + +Usage stats collection is very lightweight and should have no impact on your workload in any way. + +Requesting removal of collected data +------------------------------------ + +To request removal of collected data, please email us at ``usage_stats@ray.io`` with the ``session_id`` that you can find in ``/tmp/ray/session_xxx/usage_stats.json``. 
+ +Frequently Asked Questions (FAQ) +-------------------------------- + +**Does the session_id map to personal data?** + +No, the session_id is a Ray session/job-specific random UUID that cannot be used to identify a specific person or machine. It does not live beyond the lifetime of your Ray session, and is primarily captured to enable us to honor deletion requests. + +The session_id is logged so that deletion requests can be honored. + +**Could an enterprise easily configure an additional endpoint or substitute a different endpoint?** + +We definitely see this use case and would love to chat with you to make this work -- email ``usage_stats@ray.io``. + + +Contact us +---------- +If you have any feedback regarding usage stats collection, please email us at ``usage_stats@ray.io``. diff --git a/doc/source/ray-core/package-ref.rst b/doc/source/ray-core/package-ref.rst index a28e3ce84..27f6ae7b0 100644 --- a/doc/source/ray-core/package-ref.rst +++ b/doc/source/ray-core/package-ref.rst @@ -314,3 +314,15 @@ The Ray Command Line API .. click:: ray.scripts.scripts:debug :prog: ray debug :show-nested: + +.. _ray-disable-usage-stats-doc: + +.. click:: ray.scripts.scripts:disable_usage_stats + :prog: ray disable-usage-stats + :show-nested: + +.. _ray-enable-usage-stats-doc: + +.. 
click:: ray.scripts.scripts:enable_usage_stats + :prog: ray enable-usage-stats + :show-nested: diff --git a/doc/source/ray-references/api.rst b/doc/source/ray-references/api.rst index aa1df60eb..a47568ad3 100644 --- a/doc/source/ray-references/api.rst +++ b/doc/source/ray-references/api.rst @@ -13,4 +13,5 @@ API References ../workflows/package-ref.rst ../ray-core/package-ref.rst ../cluster/reference.rst - ../cluster/jobs-package-ref.rst \ No newline at end of file + ../cluster/jobs-package-ref.rst + ../cluster/usage-stats-data-ref.rst \ No newline at end of file diff --git a/python/ray/_private/usage/usage_constants.py b/python/ray/_private/usage/usage_constants.py index 9147d29f6..729c8ccf4 100644 --- a/python/ray/_private/usage/usage_constants.py +++ b/python/ray/_private/usage/usage_constants.py @@ -12,7 +12,7 @@ USAGE_STATS_ENABLED_MESSAGE = ( "Usage stats collection is enabled. To disable this, add `--disable-usage-stats` " "to the command that starts the cluster, or run the following command:" " `ray disable-usage-stats` before starting the cluster. " - "See https://github.com/ray-project/ray/issues/20857 for more details." + "See https://docs.ray.io/en/master/cluster/usage-stats.html for more details." ) USAGE_STATS_DISABLED_MESSAGE = "Usage stats collection is disabled." @@ -23,7 +23,7 @@ USAGE_STATS_ENABLED_BY_DEFAULT_MESSAGE = ( "To disable this, add `--disable-usage-stats` to the command that starts " "the cluster, or run the following command:" " `ray disable-usage-stats` before starting the cluster. " - "See https://github.com/ray-project/ray/issues/20857 for more details." + "See https://docs.ray.io/en/master/cluster/usage-stats.html for more details." 
) USAGE_STATS_CONFIRMATION_MESSAGE = ( diff --git a/python/ray/_private/usage/usage_lib.py b/python/ray/_private/usage/usage_lib.py index 06a0351ab..adda162bf 100644 --- a/python/ray/_private/usage/usage_lib.py +++ b/python/ray/_private/usage/usage_lib.py @@ -92,30 +92,49 @@ class ClusterStatusToReport: class UsageStatsToReport: """Usage stats to report""" + #: The Ray version in use. ray_version: str + #: The Python version in use. python_version: str + #: The schema version of the report. schema_version: str + #: The source of the data (i.e. OSS). source: str + #: A random id of the cluster session. session_id: str + #: The git commit hash of Ray (i.e. ray.__commit__). git_commit: str + #: The operating system in use. os: str + #: When the data is collected and reported. collect_timestamp_ms: int + #: When the cluster is started. session_start_timestamp_ms: int + #: The cloud provider found in the cluster.yaml file (e.g., aws). cloud_provider: Optional[str] + #: The min_workers found in the cluster.yaml file. min_workers: Optional[int] + #: The max_workers found in the cluster.yaml file. max_workers: Optional[int] + #: The head node instance type found in the cluster.yaml file (e.g., i3.8xlarge). head_node_instance_type: Optional[str] + #: The worker node instance types found in the cluster.yaml file (e.g., i3.8xlarge). worker_node_instance_types: Optional[List[str]] + #: The total num of cpus in the cluster. total_num_cpus: Optional[int] + #: The total num of gpus in the cluster. total_num_gpus: Optional[int] + #: The total size of memory in the cluster. total_memory_gb: Optional[float] + #: The total size of object store memory in the cluster. total_object_store_memory_gb: Optional[float] + #: The Ray libraries that are used (e.g., rllib). library_usages: Optional[List[str]] - # The total number of successful reports for the lifetime of the cluster. + #: The total number of successful reports for the lifetime of the cluster. 
total_success: int - # The total number of failed reports for the lifetime of the cluster. + #: The total number of failed reports for the lifetime of the cluster. total_failed: int - # The sequence number of the report. + #: The sequence number of the report. seq_number: int