mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00

Although there's enough quota, it is possible that AWS doesn't have enough capacity to start up new nodes. According to @allenyin55, the current wait-for-nodes timeout is too short. This PR increases the timeout from 600 seconds to 3000 seconds (50 minutes). Let's see if this resolves the issue. If it makes things worse, I will revert it quickly (I will closely monitor the infra failure rate).
204 lines
No EOL
3.2 KiB
JSON
204 lines
No EOL
3.2 KiB
JSON
{
|
|
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
"$ref": "#/definitions/Test",
|
|
"definitions": {
|
|
"Test": {
|
|
"type": "object",
|
|
"additionalProperties": false,
|
|
"properties": {
|
|
"name": {
|
|
"type": "string"
|
|
},
|
|
"group": {
|
|
"type": "string"
|
|
},
|
|
"working_dir": {
|
|
"type": "string"
|
|
},
|
|
"legacy": {
|
|
"$ref": "#/definitions/Legacy"
|
|
},
|
|
"stable": {
|
|
"type": "boolean"
|
|
},
|
|
"frequency": {
|
|
"type": "string",
|
|
"enum": [
|
|
"disabled",
|
|
"multi",
|
|
"nightly",
|
|
"weekly"
|
|
]
|
|
},
|
|
"team": {
|
|
"type": "string"
|
|
},
|
|
"driver_setup": {
|
|
"type": "string"
|
|
},
|
|
"cluster": {
|
|
"$ref": "#/definitions/Cluster"
|
|
},
|
|
"run": {
|
|
"$ref": "#/definitions/Run"
|
|
},
|
|
"smoke_test": {
|
|
"$ref": "#/definitions/SmokeTest"
|
|
},
|
|
"alert": {
|
|
"type": "string"
|
|
}
|
|
},
|
|
"required": [
|
|
"cluster",
|
|
"frequency",
|
|
"name",
|
|
"run",
|
|
"team",
|
|
"working_dir"
|
|
],
|
|
"title": "Test"
|
|
},
|
|
"Cluster": {
|
|
"type": "object",
|
|
"additionalProperties": false,
|
|
"properties": {
|
|
"cluster_env": {
|
|
"type": "string"
|
|
},
|
|
"cluster_compute": {
|
|
"type": "string"
|
|
},
|
|
"autosuspend_mins": {
|
|
"type": "integer",
|
|
"minimum": -1
|
|
},
|
|
"cloud_id": {
|
|
"type": "string"
|
|
},
|
|
"cloud_name": {
|
|
"type": "string"
|
|
}
|
|
},
|
|
"required": [
|
|
"cluster_compute",
|
|
"cluster_env"
|
|
],
|
|
"title": "Cluster"
|
|
},
|
|
"Legacy": {
|
|
"type": "object",
|
|
"additionalProperties": false,
|
|
"properties": {
|
|
"test_name": {
|
|
"type": "string"
|
|
},
|
|
"test_suite": {
|
|
"type": "string"
|
|
}
|
|
},
|
|
"required": [
|
|
"test_name",
|
|
"test_suite"
|
|
],
|
|
"title": "Legacy"
|
|
},
|
|
"Run": {
|
|
"type": "object",
|
|
"additionalProperties": false,
|
|
"properties": {
|
|
"type": {
|
|
"type": "string",
|
|
"enum": [
|
|
"command",
|
|
"sdk_command",
|
|
"job",
|
|
"client"
|
|
]
|
|
},
|
|
"file_manager": {
|
|
"type": "string",
|
|
"enum": [
|
|
|
|
"sdk",
|
|
"client",
|
|
"job"
|
|
]
|
|
},
|
|
"wait_for_nodes": {
|
|
"$ref": "#/definitions/WaitForNodes"
|
|
},
|
|
"prepare": {
|
|
"type": "string"
|
|
},
|
|
"prepare_timeout": {
|
|
"type": "integer"
|
|
},
|
|
"script": {
|
|
"type": "string"
|
|
},
|
|
"timeout": {
|
|
"type": "integer"
|
|
},
|
|
"long_running": {
|
|
"type": "boolean"
|
|
}
|
|
},
|
|
"required": [
|
|
"script",
|
|
"timeout"
|
|
],
|
|
"title": "Run"
|
|
},
|
|
"WaitForNodes": {
|
|
"type": "object",
|
|
"additionalProperties": false,
|
|
"properties": {
|
|
"num_nodes": {
|
|
"type": "integer"
|
|
},
|
|
"timeout": {
|
|
"type": "integer"
|
|
}
|
|
},
|
|
"required": [
|
|
"num_nodes"
|
|
],
|
|
"title": "WaitForNodes"
|
|
},
|
|
"SmokeTest": {
|
|
"type": "object",
|
|
"additionalProperties": false,
|
|
"title": "SmokeTest",
|
|
"properties": {
|
|
"working_dir": {
|
|
"type": "string"
|
|
},
|
|
"driver_setup": {
|
|
"type": "string"
|
|
},
|
|
"cluster": {
|
|
"type": "object"
|
|
},
|
|
"run": {
|
|
"type": "object"
|
|
},
|
|
"alert": {
|
|
"type": "string"
|
|
},
|
|
"frequency": {
|
|
"type": "string",
|
|
"enum": [
|
|
"disabled",
|
|
"multi",
|
|
"nightly",
|
|
"weekly"
|
|
]
|
|
}
|
|
},
|
|
"required": [
|
|
"frequency"
|
|
]
|
|
}
|
|
}
|
|
} |