mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00

Polars is significantly faster than the current pyarrow-based sort. This PR uses polars for the internal sort implementation if available. No API changes needed.

On my laptop, this makes sorting 1GB about 2x faster:

without polars

$ python release/nightly_tests/dataset/sort.py --partition-size=1e7 --num-partitions=100
Dataset size: 100 partitions, 0.01GB partition size, 1.0GB total
Finished in 50.23415923118591
...
Stage 2 sort: executed in 38.59s
    Substage 0 sort_map: 100/100 blocks executed
    * Remote wall time: 864.21ms min, 1.94s max, 1.4s mean, 140.39s total
    * Remote cpu time: 634.07ms min, 825.47ms max, 719.87ms mean, 71.99s total
    * Output num rows: 1250000 min, 1250000 max, 1250000 mean, 125000000 total
    * Output size bytes: 10000000 min, 10000000 max, 10000000 mean, 1000000000 total
    * Tasks per node: 100 min, 100 max, 100 mean; 1 nodes used
    Substage 1 sort_reduce: 100/100 blocks executed
    * Remote wall time: 125.66ms min, 2.3s max, 1.09s mean, 109.26s total
    * Remote cpu time: 96.17ms min, 1.34s max, 725.43ms mean, 72.54s total
    * Output num rows: 178073 min, 2313038 max, 1250000 mean, 125000000 total
    * Output size bytes: 1446844 min, 18793434 max, 10156250 mean, 1015625046 total
    * Tasks per node: 100 min, 100 max, 100 mean; 1 nodes used

with polars

$ python release/nightly_tests/dataset/sort.py --partition-size=1e7 --num-partitions=100
Dataset size: 100 partitions, 0.01GB partition size, 1.0GB total
Finished in 24.097432136535645
...
Stage 2 sort: executed in 14.02s
    Substage 0 sort_map: 100/100 blocks executed
    * Remote wall time: 165.15ms min, 595.46ms max, 398.01ms mean, 39.8s total
    * Remote cpu time: 349.75ms min, 423.81ms max, 383.29ms mean, 38.33s total
    * Output num rows: 1250000 min, 1250000 max, 1250000 mean, 125000000 total
    * Output size bytes: 10000000 min, 10000000 max, 10000000 mean, 1000000000 total
    * Tasks per node: 100 min, 100 max, 100 mean; 1 nodes used
    Substage 1 sort_reduce: 100/100 blocks executed
    * Remote wall time: 21.21ms min, 472.34ms max, 232.1ms mean, 23.21s total
    * Remote cpu time: 29.81ms min, 460.67ms max, 238.1ms mean, 23.81s total
    * Output num rows: 114079 min, 2591410 max, 1250000 mean, 125000000 total
    * Output size bytes: 912632 min, 20731280 max, 10000000 mean, 1000000000 total
    * Tasks per node: 100 min, 100 max, 100 mean; 1 nodes used

Related issue number

Closes #23612.
103 lines
1.9 KiB
Text
103 lines
1.9 KiB
Text
# These are mirrored in setup.py as install_requires,
|
|
# which is what the users of the ray package will install. The rest of this file
|
|
# sets up all the packages necessary for a /developer/ of Ray.
|
|
#
|
|
# In short, if you change it here, PLEASE also change it in setup.py.
|
|
#
|
|
# setup.py install_requires
|
|
aiohttp>=3.7
|
|
aiosignal
|
|
click >= 7.0, <= 8.0.4
|
|
cloudpickle
|
|
filelock
|
|
frozenlist
|
|
gpustat >= 1.0.0b1
|
|
grpcio >= 1.28.1, != 1.44.0
|
|
jsonschema
|
|
msgpack >= 1.0.0, < 2.0.0
|
|
numpy >= 1.16
|
|
opencensus
|
|
prometheus_client >= 0.7.1, < 0.14.0
|
|
protobuf >= 3.8.0
|
|
py-spy >= 0.2.0
|
|
pydantic >= 1.8
|
|
pyyaml
|
|
requests
|
|
smart_open
|
|
virtualenv
|
|
|
|
## setup.py extras
|
|
dm_tree
|
|
flask
|
|
gym==0.21.0; python_version >= '3.7'
|
|
gym==0.19.0; python_version < '3.7'
|
|
lz4
|
|
scikit-image
|
|
pandas>=1.0.5; python_version < '3.7'
|
|
pandas>=1.2.0; python_version >= '3.7'
|
|
scipy==1.4.1
|
|
tabulate
|
|
tensorboardX >= 1.9
|
|
uvicorn==0.16.0
|
|
dataclasses; python_version < '3.7'
|
|
starlette
|
|
aiorwlock
|
|
|
|
# Requirements for running tests
|
|
pyarrow >= 6.0.1, < 7.0.0
|
|
# polars is used for Dataset tests (it backs the faster internal sort when installed).
|
|
polars
|
|
azure-cli-core==2.29.1
|
|
azure-identity==1.7.0
|
|
azure-mgmt-compute==23.1.0
|
|
azure-mgmt-network==19.0.0
|
|
azure-mgmt-resource==20.0.0
|
|
msrestazure==0.6.4
|
|
boto3
|
|
cython >= 0.29.26
|
|
dataclasses; python_version < '3.7'
|
|
feather-format
|
|
google-api-python-client
|
|
google-cloud-storage
|
|
gym-minigrid
|
|
kubernetes
|
|
lxml
|
|
moto[s3,server]
|
|
mypy
|
|
networkx
|
|
numba
|
|
asyncmock
|
|
# Higher versions of llvmlite break on Windows.
|
|
llvmlite==0.34.0
|
|
openpyxl
|
|
opentelemetry-api==1.1.0
|
|
opentelemetry-sdk==1.1.0
|
|
opentelemetry-exporter-otlp==1.1.0
|
|
pexpect
|
|
Pillow; platform_system != "Windows"
|
|
pygments
|
|
pyspark==3.1.2
|
|
pytest==5.4.3
|
|
pytest-asyncio==0.16.0
|
|
pytest-rerunfailures
|
|
pytest-sugar
|
|
pytest-lazy-fixture
|
|
pytest-timeout
|
|
pytest-virtualenv
|
|
redis >= 3.5.0, < 4.0.0
|
|
scikit-learn==0.24.2
|
|
testfixtures
|
|
werkzeug
|
|
xlrd
|
|
starlette
|
|
fastapi
|
|
smart_open[s3]
|
|
tqdm
|
|
async-exit-stack
|
|
async-generator
|
|
cryptography>=3.0.0
|
|
proxy.py
|
|
# For doc tests
|
|
myst-parser==0.15.2
|
|
myst-nb==0.13.1
|
|
jupytext==1.13.6
|