From 4fb92dd2f1188882163912511b1a3bdd0f149c3a Mon Sep 17 00:00:00 2001 From: Clark Zinzow Date: Fri, 10 Jun 2022 16:42:16 -0700 Subject: [PATCH] [Datasets] Fix `__array__` protocol on `TensorArrayElement` and `TensorArray`. (#25647) This PR fixes two issues with the __array__ protocol on the tensor extension: 1. The __array__ protocol on TensorArrayElement was missing the dtype parameter, causing np.asarray(tae, dtype=some_dtype) calls to fail. This PR adds support for the dtype argument. 2. TensorArray and TensorArrayElement didn't support NumPy's scalar casting semantics for single-element tensors. This PR adds support for these scalar casting semantics. --- .../ray/air/util/tensor_extensions/pandas.py | 80 +++++++++++++++++-- python/ray/data/tests/test_dataset.py | 37 +++++++++ 2 files changed, 111 insertions(+), 6 deletions(-) diff --git a/python/ray/air/util/tensor_extensions/pandas.py b/python/ray/air/util/tensor_extensions/pandas.py index 09f5e00b3..94032d439 100644 --- a/python/ray/air/util/tensor_extensions/pandas.py +++ b/python/ray/air/util/tensor_extensions/pandas.py @@ -316,7 +316,31 @@ class TensorOpsMixin(pd.api.extensions.ExtensionScalarOpsMixin): return cls._create_method(op) -class TensorArrayElement(TensorOpsMixin): +class TensorScalarCastMixin: + """ + Mixin for casting scalar tensors to a particular numeric type. + """ + + def _scalarfunc(self, func: Callable[[Any], Any]): + return func(self._tensor) + + def __complex__(self): + return self._scalarfunc(complex) + + def __float__(self): + return self._scalarfunc(float) + + def __int__(self): + return self._scalarfunc(int) + + def __hex__(self): + return self._scalarfunc(hex) + + def __oct__(self): + return self._scalarfunc(oct) + + +class TensorArrayElement(TensorOpsMixin, TensorScalarCastMixin): """ Single element of a TensorArray, wrapping an underlying ndarray. 
""" @@ -336,18 +360,54 @@ class TensorArrayElement(TensorOpsMixin): def __str__(self): return self._tensor.__str__() + @property + def numpy_dtype(self): + """ + Get the dtype of the tensor. + :return: The numpy dtype of the backing ndarray + """ + return self._tensor.dtype + + @property + def numpy_ndim(self): + """ + Get the number of tensor dimensions. + :return: integer for the number of dimensions + """ + return self._tensor.ndim + + @property + def numpy_shape(self): + """ + Get the shape of the tensor. + :return: A tuple of integers for the numpy shape of the backing ndarray + """ + return self._tensor.shape + + @property + def numpy_size(self): + """ + Get the size of the tensor. + :return: integer for the number of elements in the tensor + """ + return self._tensor.size + def to_numpy(self): """ Return the values of this element as a NumPy ndarray. """ return np.asarray(self._tensor) - def __array__(self): - return np.asarray(self._tensor) + def __array__(self, dtype: np.dtype = None, **kwargs) -> np.ndarray: + return np.asarray(self._tensor, dtype=dtype, **kwargs) @PublicAPI(stability="beta") -class TensorArray(pd.api.extensions.ExtensionArray, TensorOpsMixin): +class TensorArray( + pd.api.extensions.ExtensionArray, + TensorOpsMixin, + TensorScalarCastMixin, +): """ Pandas `ExtensionArray` representing a tensor column, i.e. a column consisting of ndarrays as elements. 
All tensors in a column must have the @@ -891,8 +951,8 @@ class TensorArray(pd.api.extensions.ExtensionArray, TensorOpsMixin): except KeyError: raise NotImplementedError(f"'{name}' aggregate not implemented.") from None - def __array__(self, dtype: np.dtype = None): - return np.asarray(self._tensor, dtype=dtype) + def __array__(self, dtype: np.dtype = None, **kwargs) -> np.ndarray: + return np.asarray(self._tensor, dtype=dtype, **kwargs) def __array_ufunc__(self, ufunc: Callable, method: str, *inputs, **kwargs): """ @@ -989,6 +1049,14 @@ class TensorArray(pd.api.extensions.ExtensionArray, TensorOpsMixin): """ return self._tensor.shape + @property + def numpy_size(self): + """ + Get the size of the tensor. + :return: integer for the number of elements in the tensor + """ + return self._tensor.size + @property def _is_boolean(self): """ diff --git a/python/ray/data/tests/test_dataset.py b/python/ray/data/tests/test_dataset.py index 68234fc4d..41d26b8f7 100644 --- a/python/ray/data/tests/test_dataset.py +++ b/python/ray/data/tests/test_dataset.py @@ -604,6 +604,43 @@ def test_tensor_array_ops(ray_start_regular_shared): np.testing.assert_equal(apply_logical_ops(arr), apply_logical_ops(df["two"])) +def test_tensor_array_array_protocol(ray_start_regular_shared): + outer_dim = 3 + inner_shape = (2, 2, 2) + shape = (outer_dim,) + inner_shape + num_items = np.prod(np.array(shape)) + arr = np.arange(num_items).reshape(shape) + + t_arr = TensorArray(arr) + + np.testing.assert_array_equal( + np.asarray(t_arr, dtype=np.float32), arr.astype(np.float32) + ) + + t_arr_elem = t_arr[0] + + np.testing.assert_array_equal( + np.asarray(t_arr_elem, dtype=np.float32), arr[0].astype(np.float32) + ) + + +def test_tensor_array_scalar_cast(ray_start_regular_shared): + outer_dim = 3 + inner_shape = (1,) + shape = (outer_dim,) + inner_shape + num_items = np.prod(np.array(shape)) + arr = np.arange(num_items).reshape(shape) + + t_arr = TensorArray(arr) + + for t_arr_elem, arr_elem in zip(t_arr, 
arr): + assert float(t_arr_elem) == float(arr_elem) + + arr = np.arange(1).reshape((1, 1, 1)) + t_arr = TensorArray(arr) + assert float(t_arr) == float(arr) + + def test_tensor_array_reductions(ray_start_regular_shared): outer_dim = 3 inner_shape = (2, 2, 2)