
Adds a unit-tested and restructured ray_release package for running release tests.

Relevant changes in behavior: By default, Buildkite will wait for the wheels of the current commit to be available. Alternatively, users can a) specify a different commit hash, b) specify a wheels URL (which we will also wait for to become available), or c) specify a branch (or user/branch combination), in which case the latest available wheels will be used (e.g. if master is passed, behavior matches the old default behavior).

The main subpackages are:

- Cluster manager: Creates cluster envs/computes, starts clusters, terminates clusters
- Command runner: Runs commands, e.g. as client commands or SDK commands
- File manager: Uploads/downloads files to/from the session
- Reporter: Reports results (e.g. to a database)

Much of the code base is unit tested, but there are probably some pieces missing.

Example build (waited for wheels to be built): https://buildkite.com/ray-project/kf-dev/builds/51#_
Wheel build: https://buildkite.com/ray-project/ray-builders-branch/builds/6023
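For illustration, the wheel-selection options could be expressed as Buildkite pipeline environment overrides. This is a sketch only; the RAY_WHEELS variable name and all values below are assumptions for illustration, not taken from this change:

    env:
      # default: leave unset and wait for the wheels of the current commit
      # a) a specific commit hash (placeholder value):
      # RAY_WHEELS: 0123456789abcdef0123456789abcdef01234567
      # b) an explicit wheels URL, which will also be waited for:
      # RAY_WHEELS: https://example.com/wheels/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
      # c) a branch or user/branch combination; the latest available wheels are used:
      RAY_WHEELS: master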
# Global release test configuration file.
# All your release test configuration should go here. Adding release tests here
# will automatically enable them in the Buildkite release testing schedules
# (unless they have frequency: disabled).

# Here is an example configuration for reference:
#- name: example_test
#  # Tests with the same group will be grouped in the Buildkite UI
#  group: Example group
#  # Provide the working directory which will be uploaded to the cluster
#  working_dir: example_dir
#
#  # For the release test infra migration, we provide these fields that are
#  # populated in the database
#  legacy:
#    test_name: example_test
#    test_suite: examples
#
#  # How often to run the tests.
#  # One of [disabled, any, multi, nightly, weekly].
#  frequency: weekly
#  # Owning team. This field will be persisted to the database
#  team: ml
#
#  # Optional location of a bash setup script to run on the driver
#  # when setting up the local environment. Relative to working_dir
#  driver_setup: setup_driver.sh
#
#  # Cluster information
#  cluster:
#    # Location of the cluster env, relative to working_dir
#    cluster_env: cluster_env.yaml
#    # Location of the cluster compute, relative to working_dir
#    cluster_compute: cluster_compute.yaml
#    # Autosuspend parameter passed to the cluster.
#    # The cluster will automatically terminate if inactive for this
#    # many minutes. Defaults to 10 if not set.
#    autosuspend_mins: 10
#    # Optional cloud_id to use instead of the default cloud
#    cloud_id: cld_12345678
#    # Alternatively, you can specify a cloud name
#    cloud_name: anyscale_default_cloud
#
#  # Run configuration for the test
#  run:
#    # Type of test. Can be sdk_command or client (job to be implemented soon).
#    # Uses either Anyscale SDK commands or the Ray client to run the actual
#    # release test.
#    type: sdk_command
#
#    # File manager to use to transfer files to and from the cluster.
#    # Can be any of [sdk, client, job].
#    file_manager: job
#
#    # If you want to wait for nodes to be ready, you can specify this here:
#    wait_for_nodes:
#      # Number of nodes
#      num_nodes: 16
#      # Timeout for waiting for nodes. If nodes are not up by then, the
#      # test will fail.
#      timeout: 600
#
#    # Optional prepare script to be run on the cluster before the test script
#    prepare: python prepare.py
#    # The prepare command can have a separate timeout
#    prepare_timeout: 300
#
#    # Main script to run as the test script
#    script: python workloads/train_small.py
#    # Timeout in seconds. After this time the test is considered failed.
#    timeout: 600
#
#  # You can specify smoke test definitions here. If a smoke test is triggered,
#  # it will deep update the main test configuration with the values provided
#  # here. Smoke tests will automatically run with IS_SMOKE_TEST=1 as an
#  # environment variable and receive the --smoke-test flag as a parameter in
#  # the run script.
#  smoke_test:
#    # Smoke tests can have different frequencies. A smoke test is only
#    # triggered when the regular test is not matched.
#    frequency: nightly
#    # Here we adjust the run timeout down and run on fewer nodes. The test
#    # script remains the same.
#    run:
#      timeout: 300
#      wait_for_nodes:
#        num_nodes: 4
#        timeout: 600
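#
#    # For illustration (derived from the deep-update semantics described
#    # above, not literal config): with the overrides above, the smoke test
#    # would effectively run with
#    #   run:
#    #     type: sdk_command                          # unchanged
#    #     script: python workloads/train_small.py   # unchanged
#    #     timeout: 300                               # overridden (was 600)
#    #     wait_for_nodes:
#    #       num_nodes: 4                             # overridden (was 16)
#    #       timeout: 600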
#
#  # After the test finishes, this handler (in alerts/) will process the
#  # results. It can then let the test fail, e.g. if a metric regression is
#  # observed.
#  alert: default

#######################
# XGBoost release tests
#######################

- name: xgboost_train_small
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: train_small
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
    autosuspend_mins: 10

  run:
    timeout: 600
    script: python workloads/train_small.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: client

  alert: xgboost_tests

- name: xgboost_train_moderate
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: train_moderate
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml

  run:
    timeout: 600
    script: python workloads/train_moderate.py

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_train_gpu
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: train_gpu
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config_gpu.yaml
    cluster_compute: tpl_gpu_small.yaml

  run:
    timeout: 600
    script: python workloads/train_gpu.py

    wait_for_nodes:
      num_nodes: 5
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_distributed_api_test
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: distributed_api_test
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 600
    script: python workloads/distributed_api_test.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_ft_small_elastic
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: ft_small_elastic
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 900
    script: python workloads/ft_small_elastic.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_ft_small_non_elastic
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: ft_small_non_elastic
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 900
    script: python workloads/ft_small_non_elastic.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_tune_small
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: tune_small
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 600
    script: python workloads/tune_small.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_tune_32x4
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: tune_32x4
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml

  run:
    timeout: 900
    script: python workloads/tune_32x4.py

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_tune_4x32
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: tune_4x32
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml

  run:
    timeout: 900
    script: python workloads/tune_4x32.py

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests