[Datasets] Support ignoring NaNs in aggregations. (#20787)

Adds support for ignoring NaNs in aggregations. Nulls are now ignored by default, and the user can pass `ds.mean("A", ignore_nulls=False)` if they would rather have a null propagated to the output. Specifically, the null-handling semantics are:
1. Mix of values and nulls, `ignore_nulls=True`: ignore the nulls and return the aggregation of the non-null values
2. Mix of values and nulls, `ignore_nulls=False`: return `None`
3. All nulls: return `None`
4. Empty dataset: return `None`

This all-null and empty-dataset handling matches the semantics of NumPy and Pandas.
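As a quick reference, here is a minimal sketch of the resulting behavior, mirroring the unit tests added in this commit (the data below is illustrative):

```python
import ray

# Mix of values and nulls in column "A".
nan_ds = ray.data.from_items([{"A": x} for x in range(100)] + [{"A": None}])

assert nan_ds.sum("A") == 4950                      # nulls ignored by default
assert nan_ds.sum("A", ignore_nulls=False) is None  # null propagated to the output

# All nulls.
assert ray.data.from_items([{"A": None}] * 10).sum("A") is None

# Empty dataset.
assert ray.data.range(10).filter(lambda r: r > 10).sum() is None
```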
Clark Zinzow 2022-02-09 02:07:58 -06:00 committed by GitHub
parent f0d8b6d701
commit f264cf800a
5 changed files with 892 additions and 111 deletions


@@ -3,6 +3,12 @@ from typing import Callable, Optional, List, TYPE_CHECKING
 from ray.util.annotations import PublicAPI
 from ray.data.block import T, U, KeyType, AggType, KeyFn, _validate_key_fn
+from ray.data.impl.null_aggregate import (
+    _null_wrap_init,
+    _null_wrap_accumulate,
+    _null_wrap_merge,
+    _null_wrap_finalize,
+)

 if TYPE_CHECKING:
     from ray.data import Dataset

@@ -75,13 +81,15 @@ class Count(AggregateFn):
 class Sum(_AggregateOnKeyBase):
     """Defines sum aggregation."""

-    def __init__(self, on: Optional[KeyFn] = None):
+    def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True):
         self._set_key_fn(on)
         on_fn = _to_on_fn(on)
         super().__init__(
-            init=lambda k: 0,
-            accumulate=lambda a, r: a + on_fn(r),
-            merge=lambda a1, a2: a1 + a2,
+            init=_null_wrap_init(lambda k: 0),
+            accumulate=_null_wrap_accumulate(ignore_nulls, on_fn, lambda a, r: a + r),
+            merge=_null_wrap_merge(ignore_nulls, lambda a1, a2: a1 + a2),
+            finalize=_null_wrap_finalize(lambda a: a),
             name=(f"sum({str(on)})"),
         )

@@ -90,13 +98,15 @@ class Sum(_AggregateOnKeyBase):
 class Min(_AggregateOnKeyBase):
     """Defines min aggregation."""

-    def __init__(self, on: Optional[KeyFn] = None):
+    def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True):
         self._set_key_fn(on)
         on_fn = _to_on_fn(on)
         super().__init__(
-            init=lambda k: None,
-            accumulate=(lambda a, r: (on_fn(r) if a is None else min(a, on_fn(r)))),
-            merge=lambda a1, a2: min(a1, a2),
+            init=_null_wrap_init(lambda k: float("inf")),
+            accumulate=_null_wrap_accumulate(ignore_nulls, on_fn, min),
+            merge=_null_wrap_merge(ignore_nulls, min),
+            finalize=_null_wrap_finalize(lambda a: a),
             name=(f"min({str(on)})"),
         )

@@ -105,13 +115,15 @@ class Min(_AggregateOnKeyBase):
 class Max(_AggregateOnKeyBase):
     """Defines max aggregation."""

-    def __init__(self, on: Optional[KeyFn] = None):
+    def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True):
         self._set_key_fn(on)
         on_fn = _to_on_fn(on)
         super().__init__(
-            init=lambda k: None,
-            accumulate=(lambda a, r: (on_fn(r) if a is None else max(a, on_fn(r)))),
-            merge=lambda a1, a2: max(a1, a2),
+            init=_null_wrap_init(lambda k: float("-inf")),
+            accumulate=_null_wrap_accumulate(ignore_nulls, on_fn, max),
+            merge=_null_wrap_merge(ignore_nulls, max),
+            finalize=_null_wrap_finalize(lambda a: a),
             name=(f"max({str(on)})"),
         )

@@ -120,14 +132,19 @@ class Max(_AggregateOnKeyBase):
 class Mean(_AggregateOnKeyBase):
     """Defines mean aggregation."""

-    def __init__(self, on: Optional[KeyFn] = None):
+    def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True):
         self._set_key_fn(on)
         on_fn = _to_on_fn(on)
         super().__init__(
-            init=lambda k: [0, 0],
-            accumulate=lambda a, r: [a[0] + on_fn(r), a[1] + 1],
-            merge=lambda a1, a2: [a1[0] + a2[0], a1[1] + a2[1]],
-            finalize=lambda a: a[0] / a[1],
+            init=_null_wrap_init(lambda k: [0, 0]),
+            accumulate=_null_wrap_accumulate(
+                ignore_nulls, on_fn, lambda a, r: [a[0] + r, a[1] + 1]
+            ),
+            merge=_null_wrap_merge(
+                ignore_nulls, lambda a1, a2: [a1[0] + a2[0], a1[1] + a2[1]]
+            ),
+            finalize=_null_wrap_finalize(lambda a: a[0] / a[1]),
             name=(f"mean({str(on)})"),
         )

@@ -145,7 +162,12 @@ class Std(_AggregateOnKeyBase):
     https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
     """

-    def __init__(self, on: Optional[KeyFn] = None, ddof: int = 1):
+    def __init__(
+        self,
+        on: Optional[KeyFn] = None,
+        ddof: int = 1,
+        ignore_nulls: bool = True,
+    ):
         self._set_key_fn(on)
         on_fn = _to_on_fn(on)

@@ -153,14 +175,11 @@ class Std(_AggregateOnKeyBase):
             # Accumulates the current count, the current mean, and the sum of
             # squared differences from the current mean (M2).
             M2, mean, count = a
-            # Select the data on which we want to calculate the standard
-            # deviation.
-            val = on_fn(r)
             count += 1
-            delta = val - mean
+            delta = r - mean
             mean += delta / count
-            delta2 = val - mean
+            delta2 = r - mean
             M2 += delta * delta2
             return [M2, mean, count]

@@ -190,10 +209,10 @@ class Std(_AggregateOnKeyBase):
             return math.sqrt(M2 / (count - ddof))

         super().__init__(
-            init=lambda k: [0, 0, 0],
-            accumulate=accumulate,
-            merge=merge,
-            finalize=finalize,
+            init=_null_wrap_init(lambda k: [0, 0, 0]),
+            accumulate=_null_wrap_accumulate(ignore_nulls, on_fn, accumulate),
+            merge=_null_wrap_merge(ignore_nulls, merge),
+            finalize=_null_wrap_finalize(finalize),
             name=(f"std({str(on)})"),
         )
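For custom aggregations outside of these built-ins, the same wrappers compose with plain init/accumulate/merge/finalize callbacks. A hedged sketch: the `Product` class below is illustrative and not part of this commit; it only reuses the private helpers shown in the diff above (`_AggregateOnKeyBase`, `_to_on_fn`, and the `_null_wrap_*` functions).

```python
from ray.data.aggregate import _AggregateOnKeyBase, _to_on_fn
from ray.data.impl.null_aggregate import (
    _null_wrap_init,
    _null_wrap_accumulate,
    _null_wrap_merge,
    _null_wrap_finalize,
)


class Product(_AggregateOnKeyBase):
    """Illustrative product aggregation with the same null-handling semantics."""

    def __init__(self, on=None, ignore_nulls: bool = True):
        self._set_key_fn(on)
        on_fn = _to_on_fn(on)
        super().__init__(
            # Each callback is wrapped so that nulls are either skipped or
            # poison the accumulation, depending on ignore_nulls.
            init=_null_wrap_init(lambda k: 1),
            accumulate=_null_wrap_accumulate(ignore_nulls, on_fn, lambda a, r: a * r),
            merge=_null_wrap_merge(ignore_nulls, lambda a1, a2: a1 * a2),
            finalize=_null_wrap_finalize(lambda a: a),
            name=(f"product({str(on)})"),
        )
```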


@@ -562,7 +562,7 @@ class Dataset(Generic[T]):
         return Dataset(new_blocks, self._epoch, stats.build_multistage(stage_info))

     def split(
-        self, n: int, *, equal: bool = False, locality_hints: List[Any] = None
+        self, n: int, *, equal: bool = False, locality_hints: Optional[List[Any]] = None
     ) -> List["Dataset[T]"]:
         """Split the dataset into ``n`` disjoint pieces.

@@ -975,7 +975,7 @@ class Dataset(Generic[T]):
             LazyBlockList(calls, metadata, block_partitions), max_epoch, dataset_stats
         )

-    def groupby(self, key: KeyFn) -> "GroupedDataset[T]":
+    def groupby(self, key: Optional[KeyFn]) -> "GroupedDataset[T]":
         """Group the dataset by the key function or column name.

         This is a lazy operation.

@@ -1034,7 +1034,9 @@ class Dataset(Generic[T]):
         ret = self.groupby(None).aggregate(*aggs).take(1)
         return ret[0] if len(ret) > 0 else None

-    def sum(self, on: Union[KeyFn, List[KeyFn]] = None) -> U:
+    def sum(
+        self, on: Optional[Union[KeyFn, List[KeyFn]]] = None, ignore_nulls: bool = True
+    ) -> U:
         """Compute sum over entire dataset.

         This is a blocking operation.

@@ -1057,6 +1059,11 @@ class Dataset(Generic[T]):
                 - For an Arrow dataset: it can be a column name or a list
                   thereof, and the default is to return an ``ArrowRow``
                   containing the column-wise sum of all columns.
+            ignore_nulls: Whether to ignore null values. If ``True``, null
+                values will be ignored when computing the sum; if ``False``,
+                if a null value is encountered, the output will be None.
+                We consider np.nan, None, and pd.NaT to be null values.
+                Default is ``True``.

         Returns:
             The sum result.

@@ -1079,15 +1086,15 @@ class Dataset(Generic[T]):
             - ``on=["col_1", ..., "col_n"]``: an n-column ``ArrowRow``
               containing the column-wise sum of the provided columns.

-            If the dataset is empty, then the output is 0.
+            If the dataset is empty, all values are null, or any value is null
+            AND ``ignore_nulls`` is ``False``, then the output will be None.
         """
-        ret = self._aggregate_on(Sum, on)
-        if ret is None:
-            return 0
-        else:
-            return self._aggregate_result(ret)
+        ret = self._aggregate_on(Sum, on, ignore_nulls)
+        return self._aggregate_result(ret)

-    def min(self, on: Union[KeyFn, List[KeyFn]] = None) -> U:
+    def min(
+        self, on: Optional[Union[KeyFn, List[KeyFn]]] = None, ignore_nulls: bool = True
+    ) -> U:
         """Compute minimum over entire dataset.

         This is a blocking operation.

@@ -1110,6 +1117,11 @@ class Dataset(Generic[T]):
                 - For an Arrow dataset: it can be a column name or a list
                   thereof, and the default is to return an ``ArrowRow``
                   containing the column-wise min of all columns.
+            ignore_nulls: Whether to ignore null values. If ``True``, null
+                values will be ignored when computing the min; if ``False``,
+                if a null value is encountered, the output will be None.
+                We consider np.nan, None, and pd.NaT to be null values.
+                Default is ``True``.

         Returns:
             The min result.

@@ -1132,15 +1144,15 @@ class Dataset(Generic[T]):
             - ``on=["col_1", ..., "col_n"]``: an n-column ``ArrowRow``
               containing the column-wise min of the provided columns.

-            If the dataset is empty, then a ``ValueError`` is raised.
+            If the dataset is empty, all values are null, or any value is null
+            AND ``ignore_nulls`` is ``False``, then the output will be None.
         """
-        ret = self._aggregate_on(Min, on)
-        if ret is None:
-            raise ValueError("Cannot compute min on an empty dataset")
-        else:
-            return self._aggregate_result(ret)
+        ret = self._aggregate_on(Min, on, ignore_nulls)
+        return self._aggregate_result(ret)

-    def max(self, on: Union[KeyFn, List[KeyFn]] = None) -> U:
+    def max(
+        self, on: Optional[Union[KeyFn, List[KeyFn]]] = None, ignore_nulls: bool = True
+    ) -> U:
         """Compute maximum over entire dataset.

         This is a blocking operation.

@@ -1163,6 +1175,11 @@ class Dataset(Generic[T]):
                 - For an Arrow dataset: it can be a column name or a list
                   thereof, and the default is to return an ``ArrowRow``
                   containing the column-wise max of all columns.
+            ignore_nulls: Whether to ignore null values. If ``True``, null
+                values will be ignored when computing the max; if ``False``,
+                if a null value is encountered, the output will be None.
+                We consider np.nan, None, and pd.NaT to be null values.
+                Default is ``True``.

         Returns:
             The max result.

@@ -1185,15 +1202,15 @@ class Dataset(Generic[T]):
             - ``on=["col_1", ..., "col_n"]``: an n-column ``ArrowRow``
               containing the column-wise max of the provided columns.

-            If the dataset is empty, then a ``ValueError`` is raised.
+            If the dataset is empty, all values are null, or any value is null
+            AND ``ignore_nulls`` is ``False``, then the output will be None.
         """
-        ret = self._aggregate_on(Max, on)
-        if ret is None:
-            raise ValueError("Cannot compute max on an empty dataset")
-        else:
-            return self._aggregate_result(ret)
+        ret = self._aggregate_on(Max, on, ignore_nulls)
+        return self._aggregate_result(ret)

-    def mean(self, on: Union[KeyFn, List[KeyFn]] = None) -> U:
+    def mean(
+        self, on: Optional[Union[KeyFn, List[KeyFn]]] = None, ignore_nulls: bool = True
+    ) -> U:
         """Compute mean over entire dataset.

         This is a blocking operation.

@@ -1216,6 +1233,11 @@ class Dataset(Generic[T]):
                 - For an Arrow dataset: it can be a column name or a list
                   thereof, and the default is to return an ``ArrowRow``
                   containing the column-wise mean of all columns.
+            ignore_nulls: Whether to ignore null values. If ``True``, null
+                values will be ignored when computing the mean; if ``False``,
+                if a null value is encountered, the output will be None.
+                We consider np.nan, None, and pd.NaT to be null values.
+                Default is ``True``.

         Returns:
             The mean result.

@@ -1238,15 +1260,18 @@ class Dataset(Generic[T]):
             - ``on=["col_1", ..., "col_n"]``: an n-column ``ArrowRow``
               containing the column-wise mean of the provided columns.

-            If the dataset is empty, then a ``ValueError`` is raised.
+            If the dataset is empty, all values are null, or any value is null
+            AND ``ignore_nulls`` is ``False``, then the output will be None.
         """
-        ret = self._aggregate_on(Mean, on)
-        if ret is None:
-            raise ValueError("Cannot compute mean on an empty dataset")
-        else:
-            return self._aggregate_result(ret)
+        ret = self._aggregate_on(Mean, on, ignore_nulls)
+        return self._aggregate_result(ret)

-    def std(self, on: Union[KeyFn, List[KeyFn]] = None, ddof: int = 1) -> U:
+    def std(
+        self,
+        on: Optional[Union[KeyFn, List[KeyFn]]] = None,
+        ddof: int = 1,
+        ignore_nulls: bool = True,
+    ) -> U:
         """Compute standard deviation over entire dataset.

         This is a blocking operation.

@@ -1279,6 +1304,11 @@ class Dataset(Generic[T]):
                   containing the column-wise std of all columns.
             ddof: Delta Degrees of Freedom. The divisor used in calculations
                 is ``N - ddof``, where ``N`` represents the number of elements.
+            ignore_nulls: Whether to ignore null values. If ``True``, null
+                values will be ignored when computing the std; if ``False``,
+                if a null value is encountered, the output will be None.
+                We consider np.nan, None, and pd.NaT to be null values.
+                Default is ``True``.

         Returns:
             The standard deviation result.

@@ -1301,15 +1331,15 @@ class Dataset(Generic[T]):
             - ``on=["col_1", ..., "col_n"]``: an n-column ``ArrowRow``
               containing the column-wise std of the provided columns.

-            If the dataset is empty, then a ``ValueError`` is raised.
+            If the dataset is empty, all values are null, or any value is null
+            AND ``ignore_nulls`` is ``False``, then the output will be None.
         """
-        ret = self._aggregate_on(Std, on, ddof=ddof)
-        if ret is None:
-            raise ValueError("Cannot compute std on an empty dataset")
-        else:
-            return self._aggregate_result(ret)
+        ret = self._aggregate_on(Std, on, ignore_nulls, ddof=ddof)
+        return self._aggregate_result(ret)

-    def sort(self, key: KeyFn = None, descending: bool = False) -> "Dataset[T]":
+    def sort(
+        self, key: Optional[KeyFn] = None, descending: bool = False
+    ) -> "Dataset[T]":
         """Sort the dataset by the specified key column or key function.

         This is a blocking operation.

@@ -1864,7 +1894,7 @@ class Dataset(Generic[T]):
         self,
         *,
         prefetch_blocks: int = 0,
-        batch_size: int = None,
+        batch_size: Optional[int] = None,
         batch_format: str = "native",
         drop_last: bool = False,
     ) -> Iterator[BatchType]:

@@ -1953,12 +1983,12 @@ class Dataset(Generic[T]):
         self,
         *,
         label_column: Optional[str] = None,
-        feature_columns: Union[
-            None, List[str], List[List[str]], Dict[str, List[str]]
+        feature_columns: Optional[
+            Union[List[str], List[List[str]], Dict[str, List[str]]]
         ] = None,
         label_column_dtype: Optional["torch.dtype"] = None,
-        feature_column_dtypes: Union[
-            None, "torch.dtype", List["torch.dtype"], Dict[str, "torch.dtype"]
+        feature_column_dtypes: Optional[
+            Union["torch.dtype", List["torch.dtype"], Dict[str, "torch.dtype"]]
         ] = None,
         batch_size: int = 1,
         prefetch_blocks: int = 0,

@@ -2403,7 +2433,7 @@ Dict[str, List[str]]]): The names of the columns
         block_to_arrow = cached_remote_fn(_block_to_arrow)
         return [block_to_arrow.remote(block) for block in blocks]

-    def repeat(self, times: int = None) -> "DatasetPipeline[T]":
+    def repeat(self, times: Optional[int] = None) -> "DatasetPipeline[T]":
         """Convert this into a DatasetPipeline by looping over this dataset.

         Transformations prior to the call to ``repeat()`` are evaluated once.

@@ -2665,7 +2695,7 @@ Dict[str, List[str]]]): The names of the columns
         return "simple"

     def _aggregate_on(
-        self, agg_cls: type, on: Union[KeyFn, List[KeyFn]], *args, **kwargs
+        self, agg_cls: type, on: Optional[Union[KeyFn, List[KeyFn]]], *args, **kwargs
     ):
         """Helper for aggregating on a particular subset of the dataset.

@@ -2680,9 +2710,10 @@ Dict[str, List[str]]]): The names of the columns
     def _build_multicolumn_aggs(
         self,
         agg_cls: type,
-        on: Union[KeyFn, List[KeyFn]],
-        skip_cols: Optional[List[str]] = None,
+        on: Optional[Union[KeyFn, List[KeyFn]]],
+        ignore_nulls: bool,
         *args,
+        skip_cols: Optional[List[str]] = None,
         **kwargs,
     ):
         """Build set of aggregations for applying a single aggregation to

@@ -2706,10 +2737,10 @@ Dict[str, List[str]]]): The names of the columns
         if not isinstance(on, list):
             on = [on]
-        return [agg_cls(on_, *args, **kwargs) for on_ in on]
+        return [agg_cls(on_, *args, ignore_nulls=ignore_nulls, **kwargs) for on_ in on]

     def _aggregate_result(self, result: Union[Tuple, TableRow]) -> U:
-        if len(result) == 1:
+        if result is not None and len(result) == 1:
             if isinstance(result, tuple):
                 return result[0]
             else:


@@ -119,7 +119,12 @@ class GroupedDataset(Generic[T]):
         )

     def _aggregate_on(
-        self, agg_cls: type, on: Union[KeyFn, List[KeyFn]], *args, **kwargs
+        self,
+        agg_cls: type,
+        on: Union[KeyFn, List[KeyFn]],
+        ignore_nulls: bool,
+        *args,
+        **kwargs
     ):
         """Helper for aggregating on a particular subset of the dataset.

@@ -129,7 +134,7 @@ class GroupedDataset(Generic[T]):
         aggregation on the entire row for a simple Dataset.
         """
         aggs = self._dataset._build_multicolumn_aggs(
-            agg_cls, on, *args, skip_cols=self._key, **kwargs
+            agg_cls, on, ignore_nulls, *args, skip_cols=self._key, **kwargs
         )
         return self.aggregate(*aggs)

@@ -152,7 +157,9 @@ class GroupedDataset(Generic[T]):
         """
         return self.aggregate(Count())

-    def sum(self, on: Union[KeyFn, List[KeyFn]] = None) -> Dataset[U]:
+    def sum(
+        self, on: Union[KeyFn, List[KeyFn]] = None, ignore_nulls: bool = True
+    ) -> Dataset[U]:
         """Compute grouped sum aggregation.

         This is a blocking operation.

@@ -179,6 +186,11 @@ class GroupedDataset(Generic[T]):
                 - For an Arrow dataset: it can be a column name or a list
                   thereof, and the default is to do a column-wise sum of all
                   columns.
+            ignore_nulls: Whether to ignore null values. If ``True``, null
+                values will be ignored when computing the sum; if ``False``,
+                if a null value is encountered, the output will be null.
+                We consider np.nan, None, and pd.NaT to be null values.
+                Default is ``True``.

         Returns:
             The sum result.

@@ -203,9 +215,11 @@ class GroupedDataset(Generic[T]):
             If groupby key is ``None`` then the key part of return is omitted.
         """
-        return self._aggregate_on(Sum, on)
+        return self._aggregate_on(Sum, on, ignore_nulls)

-    def min(self, on: Union[KeyFn, List[KeyFn]] = None) -> Dataset[U]:
+    def min(
+        self, on: Union[KeyFn, List[KeyFn]] = None, ignore_nulls: bool = True
+    ) -> Dataset[U]:
         """Compute grouped min aggregation.

         This is a blocking operation.

@@ -232,6 +246,11 @@ class GroupedDataset(Generic[T]):
                 - For an Arrow dataset: it can be a column name or a list
                   thereof, and the default is to do a column-wise min of all
                   columns.
+            ignore_nulls: Whether to ignore null values. If ``True``, null
+                values will be ignored when computing the min; if ``False``,
+                if a null value is encountered, the output will be null.
+                We consider np.nan, None, and pd.NaT to be null values.
+                Default is ``True``.

         Returns:
             The min result.

@@ -256,9 +275,11 @@ class GroupedDataset(Generic[T]):
             If groupby key is ``None`` then the key part of return is omitted.
         """
-        return self._aggregate_on(Min, on)
+        return self._aggregate_on(Min, on, ignore_nulls)

-    def max(self, on: Union[KeyFn, List[KeyFn]] = None) -> Dataset[U]:
+    def max(
+        self, on: Union[KeyFn, List[KeyFn]] = None, ignore_nulls: bool = True
+    ) -> Dataset[U]:
         """Compute grouped max aggregation.

         This is a blocking operation.

@@ -285,6 +306,11 @@ class GroupedDataset(Generic[T]):
                 - For an Arrow dataset: it can be a column name or a list
                   thereof, and the default is to do a column-wise max of all
                   columns.
+            ignore_nulls: Whether to ignore null values. If ``True``, null
+                values will be ignored when computing the max; if ``False``,
+                if a null value is encountered, the output will be null.
+                We consider np.nan, None, and pd.NaT to be null values.
+                Default is ``True``.

         Returns:
             The max result.

@@ -309,9 +335,11 @@ class GroupedDataset(Generic[T]):
             If groupby key is ``None`` then the key part of return is omitted.
         """
-        return self._aggregate_on(Max, on)
+        return self._aggregate_on(Max, on, ignore_nulls)

-    def mean(self, on: Union[KeyFn, List[KeyFn]] = None) -> Dataset[U]:
+    def mean(
+        self, on: Union[KeyFn, List[KeyFn]] = None, ignore_nulls: bool = True
+    ) -> Dataset[U]:
         """Compute grouped mean aggregation.

         This is a blocking operation.

@@ -338,6 +366,11 @@ class GroupedDataset(Generic[T]):
                 - For an Arrow dataset: it can be a column name or a list
                   thereof, and the default is to do a column-wise mean of all
                   columns.
+            ignore_nulls: Whether to ignore null values. If ``True``, null
+                values will be ignored when computing the mean; if ``False``,
+                if a null value is encountered, the output will be null.
+                We consider np.nan, None, and pd.NaT to be null values.
+                Default is ``True``.

         Returns:
             The mean result.

@@ -363,9 +396,14 @@ class GroupedDataset(Generic[T]):
             If groupby key is ``None`` then the key part of return is omitted.
         """
-        return self._aggregate_on(Mean, on)
+        return self._aggregate_on(Mean, on, ignore_nulls)

-    def std(self, on: Union[KeyFn, List[KeyFn]] = None, ddof: int = 1) -> Dataset[U]:
+    def std(
+        self,
+        on: Union[KeyFn, List[KeyFn]] = None,
+        ddof: int = 1,
+        ignore_nulls: bool = True,
+    ) -> Dataset[U]:
         """Compute grouped standard deviation aggregation.

         This is a blocking operation.

@@ -402,6 +440,11 @@ class GroupedDataset(Generic[T]):
                   columns.
             ddof: Delta Degrees of Freedom. The divisor used in calculations
                 is ``N - ddof``, where ``N`` represents the number of elements.
+            ignore_nulls: Whether to ignore null values. If ``True``, null
+                values will be ignored when computing the std; if ``False``,
+                if a null value is encountered, the output will be null.
+                We consider np.nan, None, and pd.NaT to be null values.
+                Default is ``True``.

         Returns:
             The standard deviation result.

@@ -426,7 +469,7 @@ class GroupedDataset(Generic[T]):
             If groupby key is ``None`` then the key part of return is omitted.
         """
-        return self._aggregate_on(Std, on, ddof=ddof)
+        return self._aggregate_on(Std, on, ignore_nulls, ddof=ddof)

     def _partition_and_combine_block(
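Grouped aggregations follow the same rules per group. A short sketch mirroring the test changes further below (the data and results are taken from those tests and are illustrative):

```python
import ray

xs = list(range(100))
grouped = ray.data.from_items(
    [{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}]
).groupby("A")

# Default: the null in group A=0 is ignored, so sum(B) for that group is 1683.
grouped.sum("B")
# ignore_nulls=False: the null makes sum(B) for group A=0 null; other groups are unaffected.
grouped.sum("B", ignore_nulls=False)
```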


@@ -0,0 +1,235 @@
from typing import Tuple, Callable, Any, Union
from types import ModuleType
import numpy as np
from ray.data.block import T, U, KeyType, AggType
# This module contains aggregation helpers for handling nulls.
# The null handling policy is:
# 1. Mix of values and nulls - ignore_nulls=True: Ignore the nulls, return
# aggregation of non-null values.
# 2. Mix of values and nulls - ignore_nulls=False: Return None.
# 3. All nulls: Return None.
# 4. Empty dataset: Return None.
#
# This is accomplished by checking rows for null values and by propagating nulls
# if found AND if we're not ignoring them. If not ignoring nulls, in order to delineate
# between found null rows and an empty block accumulation when merging (the latter of
# which we want to propagate; the former of which we do not), we attach a boolean flag
# indicating whether or not an accumulation contains valid data to intermediate block
# accumulations via _wrap_acc() and _unwrap_acc(). This allows us to properly merge
# intermediate block accumulations under a streaming constraint.
def _wrap_acc(a: AggType, has_data: bool) -> AggType:
"""
Wrap accumulation with a numeric boolean flag indicating whether or not
this accumulation contains real data; if it doesn't, we consider it to be
empty.
Args:
a: The accumulation value.
has_data: Whether the accumulation contains real data.
Returns:
An AggType list with the last element being a numeric boolean flag indicating
whether or not this accumulation contains real data. If the input a has length
n, the returned AggType has length n + 1.
"""
if not isinstance(a, list):
a = [a]
return a + [1 if has_data else 0]
def _unwrap_acc(a: AggType) -> Tuple[AggType, bool]:
"""
Unwrap the accumulation, which we assume has been wrapped (via _wrap_acc) with a
numeric boolean flag indicating whether or not this accumulation contains real data.
Args:
a: The wrapped accumulation value that we wish to unwrap.
Returns:
A tuple containing the unwrapped accumulation value and a boolean indicating
whether the accumulation contains real data.
"""
has_data = a[-1] == 1
a = a[:-1]
if len(a) == 1:
a = a[0]
return a, has_data
def _null_wrap_init(init: Callable[[KeyType], AggType]) -> Callable[[KeyType], AggType]:
"""
Wraps an accumulation initializer with null handling.
The returned initializer function adds on a has_data field that the accumulator
uses to track whether an aggregation is empty.
Args:
init: The core init function to wrap.
Returns:
A new accumulation initializer function that can handle nulls.
"""
def _init(k: KeyType) -> AggType:
a = init(k)
# Initializing accumulation, so indicate that the accumulation doesn't represent
# real data yet.
return _wrap_acc(a, has_data=False)
return _init
def _null_wrap_accumulate(
ignore_nulls: bool,
on_fn: Callable[[T], T],
accum: Callable[[AggType, T], AggType],
) -> Callable[[AggType, T], AggType]:
"""
Wrap accumulator function with null handling.
The returned accumulate function expects a to be either None or of the form:
a = [acc_data_1, ..., acc_data_n, has_data].
This performs an accumulation subject to the following null rules:
1. If r is null and ignore_nulls=False, return None.
2. If r is null and ignore_nulls=True, return a.
3. If r is non-null and a is None, return None.
4. If r is non-null and a is non-None, return accum(a[:-1], r).
Args:
ignore_nulls: Whether nulls should be ignored or cause a None result.
on_fn: Function selecting a subset of the row to apply the aggregation.
accum: The core accumulator function to wrap.
Returns:
A new accumulator function that handles nulls.
"""
def _accum(a: AggType, r: T) -> AggType:
r = on_fn(r)
if _is_null(r):
if ignore_nulls:
# Ignoring nulls, return the current accumulation, ignoring r.
return a
else:
# Not ignoring nulls, so propagate the null.
return None
else:
if a is None:
# Accumulation is None so (1) a previous row must have been null, and
# (2) we must be propagating nulls, so continue to propagate this null.
return None
else:
# Row is non-null and accumulation is non-null, so we now apply the core
# accumulation.
a, _ = _unwrap_acc(a)
a = accum(a, r)
return _wrap_acc(a, has_data=True)
return _accum
def _null_wrap_merge(
ignore_nulls: bool,
merge: Callable[[AggType, AggType], AggType],
) -> Callable[[AggType, AggType], AggType]:
"""
Wrap merge function with null handling.
The returned merge function expects a1 and a2 to be either None or of the form:
a = [acc_data_1, ..., acc_data_n, has_data].
This merges two accumulations subject to the following null rules:
1. If a1 is empty and a2 is empty, return empty accumulation.
2. If a1 (a2) is empty and a2 (a1) is None, return None.
3. If a1 (a2) is empty and a2 (a1) is non-None, return a2 (a1).
4. If a1 (a2) is None, return a2 (a1) if ignoring nulls, None otherwise.
5. If a1 and a2 are both non-null, return merge(a1, a2).
Args:
ignore_nulls: Whether nulls should be ignored or cause a None result.
merge: The core merge function to wrap.
Returns:
A new merge function that handles nulls.
"""
def _merge(a1: AggType, a2: AggType) -> AggType:
if a1 is None:
# If we're ignoring nulls, propagate a2; otherwise, propagate None.
return a2 if ignore_nulls else None
unwrapped_a1, a1_has_data = _unwrap_acc(a1)
if not a1_has_data:
# If a1 is empty, propagate a2.
# No matter whether a2 is a real value, empty, or None,
# propagating each of these is correct if a1 is empty.
return a2
if a2 is None:
# If we're ignoring nulls, propagate a1; otherwise, propagate None.
return a1 if ignore_nulls else None
unwrapped_a2, a2_has_data = _unwrap_acc(a2)
if not a2_has_data:
# If a2 is empty, propagate a1.
return a1
a = merge(unwrapped_a1, unwrapped_a2)
return _wrap_acc(a, has_data=True)
return _merge
def _null_wrap_finalize(
finalize: Callable[[AggType], AggType]
) -> Callable[[AggType], U]:
"""
Wrap finalizer with null handling.
If the accumulation is empty or None, the returned finalizer returns None.
Args:
finalize: The core finalizing function to wrap.
Returns:
A new finalizing function that handles nulls.
"""
def _finalize(a: AggType) -> U:
if a is None:
return None
a, has_data = _unwrap_acc(a)
if not has_data:
return None
return finalize(a)
return _finalize
LazyModule = Union[None, bool, ModuleType]
_pandas: LazyModule = None
def _lazy_import_pandas() -> LazyModule:
global _pandas
if _pandas is None:
try:
import pandas as _pandas
except ModuleNotFoundError:
# If module is not found, set _pandas to False so we won't
# keep trying to import it on every _lazy_import_pandas() call.
_pandas = False
return _pandas
def _is_null(r: Any):
pd = _lazy_import_pandas()
if pd:
return pd.isnull(r)
try:
return np.isnan(r)
except TypeError:
return r is None
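To make the wrapping concrete, here is a small walkthrough that simply exercises the functions above for a plain sum (the identity `on_fn` and the literal values are illustrative):

```python
from ray.data.impl.null_aggregate import (
    _null_wrap_init,
    _null_wrap_accumulate,
    _null_wrap_merge,
    _null_wrap_finalize,
)

# Null-aware callbacks for a plain sum, ignoring nulls.
init = _null_wrap_init(lambda k: 0)
accumulate = _null_wrap_accumulate(True, lambda r: r, lambda a, r: a + r)
merge = _null_wrap_merge(True, lambda a1, a2: a1 + a2)
finalize = _null_wrap_finalize(lambda a: a)

a = init(None)           # [0, 0]: the accumulation plus the has_data=0 flag
a = accumulate(a, 1)     # [1, 1]: real data seen, flag flips to 1
a = accumulate(a, None)  # unchanged: the null is ignored
assert finalize(a) == 1

# Merging two block accumulations combines their partial sums.
assert merge(accumulate(init(None), 2), accumulate(init(None), 3)) == [5, 1]

# An accumulation that never saw real data finalizes to None (all-null / empty case).
assert finalize(init(None)) is None

# With ignore_nulls=False, a single null poisons the accumulation.
strict_accumulate = _null_wrap_accumulate(False, lambda r: r, lambda a, r: a + r)
assert finalize(strict_accumulate(init(None), None)) is None
```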


@@ -2264,7 +2264,7 @@ def test_split(ray_start_regular_shared):
assert [1] * 10 + [0] == [
dataset._blocks.initial_num_blocks() for dataset in datasets
]
-assert 190 == sum([dataset.sum() for dataset in datasets])
+assert 190 == sum([dataset.sum() or 0 for dataset in datasets])

def test_split_hints(ray_start_regular_shared):
@@ -3271,12 +3271,65 @@ def test_groupby_arrow_sum(ray_start_regular_shared, num_parts):
{"A": 1, "sum(B)": 1617},
{"A": 2, "sum(B)": 1650},
]
# Test built-in sum aggregation with nans
nan_grouped_ds = (
ray.data.from_items(
[{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}]
)
.repartition(num_parts)
.groupby("A")
)
nan_agg_ds = nan_grouped_ds.sum("B")
assert nan_agg_ds.count() == 3
assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [
{"A": 0, "sum(B)": 1683},
{"A": 1, "sum(B)": 1617},
{"A": 2, "sum(B)": 1650},
]
# Test ignore_nulls=False
nan_agg_ds = nan_grouped_ds.sum("B", ignore_nulls=False)
assert nan_agg_ds.count() == 3
assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [
{"A": 0, "sum(B)": None},
{"A": 1, "sum(B)": 1617},
{"A": 2, "sum(B)": 1650},
]
# Test all nans
nan_agg_ds = (
ray.data.from_items([{"A": (x % 3), "B": None} for x in xs])
.repartition(num_parts)
.groupby("A")
.sum("B")
)
assert nan_agg_ds.count() == 3
assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [
{"A": 0, "sum(B)": None},
{"A": 1, "sum(B)": None},
{"A": 2, "sum(B)": None},
]
# Test built-in global sum aggregation
assert (
ray.data.from_items([{"A": x} for x in xs]).repartition(num_parts).sum("A")
== 4950
)
-assert ray.data.range_arrow(10).filter(lambda r: r["value"] > 10).sum("value") == 0
+# Test empty dataset
+assert (
+ray.data.range_arrow(10).filter(lambda r: r["value"] > 10).sum("value") is None
+)
# Test built-in global sum aggregation with nans
nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition(
num_parts
)
assert nan_ds.sum("A") == 4950
# Test ignore_nulls=False
assert nan_ds.sum("A", ignore_nulls=False) is None
# Test all nans
nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts)
assert nan_ds.sum("A") is None
@pytest.mark.parametrize("num_parts", [1, 30]) @pytest.mark.parametrize("num_parts", [1, 30])
@ -3299,12 +3352,64 @@ def test_groupby_arrow_min(ray_start_regular_shared, num_parts):
{"A": 1, "min(B)": 1}, {"A": 1, "min(B)": 1},
{"A": 2, "min(B)": 2}, {"A": 2, "min(B)": 2},
] ]
# Test built-in min aggregation with nans
nan_grouped_ds = (
ray.data.from_items(
[{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}]
)
.repartition(num_parts)
.groupby("A")
)
nan_agg_ds = nan_grouped_ds.min("B")
assert nan_agg_ds.count() == 3
assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [
{"A": 0, "min(B)": 0},
{"A": 1, "min(B)": 1},
{"A": 2, "min(B)": 2},
]
# Test ignore_nulls=False
nan_agg_ds = nan_grouped_ds.min("B", ignore_nulls=False)
assert nan_agg_ds.count() == 3
assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [
{"A": 0, "min(B)": None},
{"A": 1, "min(B)": 1},
{"A": 2, "min(B)": 2},
]
# Test all nans
nan_agg_ds = (
ray.data.from_items([{"A": (x % 3), "B": None} for x in xs])
.repartition(num_parts)
.groupby("A")
.min("B")
)
assert nan_agg_ds.count() == 3
assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [
{"A": 0, "min(B)": None},
{"A": 1, "min(B)": None},
{"A": 2, "min(B)": None},
]
# Test built-in global min aggregation
assert (
ray.data.from_items([{"A": x} for x in xs]).repartition(num_parts).min("A") == 0
)
-with pytest.raises(ValueError):
-    ray.data.range_arrow(10).filter(lambda r: r["value"] > 10).min("value")
+# Test empty dataset
+assert (
+ray.data.range_arrow(10).filter(lambda r: r["value"] > 10).min("value") is None
+)
# Test built-in global min aggregation with nans
nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition(
num_parts
)
assert nan_ds.min("A") == 0
# Test ignore_nulls=False
assert nan_ds.min("A", ignore_nulls=False) is None
# Test all nans
nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts)
assert nan_ds.min("A") is None
@pytest.mark.parametrize("num_parts", [1, 30]) @pytest.mark.parametrize("num_parts", [1, 30])
@ -3327,13 +3432,65 @@ def test_groupby_arrow_max(ray_start_regular_shared, num_parts):
{"A": 1, "max(B)": 97}, {"A": 1, "max(B)": 97},
{"A": 2, "max(B)": 98}, {"A": 2, "max(B)": 98},
] ]
# Test built-in max aggregation with nans
nan_grouped_ds = (
ray.data.from_items(
[{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}]
)
.repartition(num_parts)
.groupby("A")
)
nan_agg_ds = nan_grouped_ds.max("B")
assert nan_agg_ds.count() == 3
assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [
{"A": 0, "max(B)": 99},
{"A": 1, "max(B)": 97},
{"A": 2, "max(B)": 98},
]
# Test ignore_nulls=False
nan_agg_ds = nan_grouped_ds.max("B", ignore_nulls=False)
assert nan_agg_ds.count() == 3
assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [
{"A": 0, "max(B)": None},
{"A": 1, "max(B)": 97},
{"A": 2, "max(B)": 98},
]
# Test all nans
nan_agg_ds = (
ray.data.from_items([{"A": (x % 3), "B": None} for x in xs])
.repartition(num_parts)
.groupby("A")
.max("B")
)
assert nan_agg_ds.count() == 3
assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [
{"A": 0, "max(B)": None},
{"A": 1, "max(B)": None},
{"A": 2, "max(B)": None},
]
# Test built-in global max aggregation
assert (
ray.data.from_items([{"A": x} for x in xs]).repartition(num_parts).max("A")
== 99
)
-with pytest.raises(ValueError):
-    ray.data.range_arrow(10).filter(lambda r: r["value"] > 10).max("value")
+# Test empty dataset
+assert (
+ray.data.range_arrow(10).filter(lambda r: r["value"] > 10).max("value") is None
+)
# Test built-in global max aggregation with nans
nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition(
num_parts
)
assert nan_ds.max("A") == 99
# Test ignore_nulls=False
assert nan_ds.max("A", ignore_nulls=False) is None
# Test all nans
nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts)
assert nan_ds.max("A") is None
@pytest.mark.parametrize("num_parts", [1, 30]) @pytest.mark.parametrize("num_parts", [1, 30])
@ -3356,13 +3513,65 @@ def test_groupby_arrow_mean(ray_start_regular_shared, num_parts):
{"A": 1, "mean(B)": 49.0}, {"A": 1, "mean(B)": 49.0},
{"A": 2, "mean(B)": 50.0}, {"A": 2, "mean(B)": 50.0},
] ]
# Test built-in mean aggregation with nans
nan_grouped_ds = (
ray.data.from_items(
[{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}]
)
.repartition(num_parts)
.groupby("A")
)
nan_agg_ds = nan_grouped_ds.mean("B")
assert nan_agg_ds.count() == 3
assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [
{"A": 0, "mean(B)": 49.5},
{"A": 1, "mean(B)": 49.0},
{"A": 2, "mean(B)": 50.0},
]
# Test ignore_nulls=False
nan_agg_ds = nan_grouped_ds.mean("B", ignore_nulls=False)
assert nan_agg_ds.count() == 3
assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [
{"A": 0, "mean(B)": None},
{"A": 1, "mean(B)": 49.0},
{"A": 2, "mean(B)": 50.0},
]
# Test all nans
nan_agg_ds = (
ray.data.from_items([{"A": (x % 3), "B": None} for x in xs])
.repartition(num_parts)
.groupby("A")
.mean("B")
)
assert nan_agg_ds.count() == 3
assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [
{"A": 0, "mean(B)": None},
{"A": 1, "mean(B)": None},
{"A": 2, "mean(B)": None},
]
# Test built-in global mean aggregation
assert (
ray.data.from_items([{"A": x} for x in xs]).repartition(num_parts).mean("A")
== 49.5
)
-with pytest.raises(ValueError):
-    ray.data.range_arrow(10).filter(lambda r: r["value"] > 10).mean("value")
+# Test empty dataset
+assert (
+ray.data.range_arrow(10).filter(lambda r: r["value"] > 10).mean("value") is None
+)
# Test built-in global mean aggregation with nans
nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition(
num_parts
)
assert nan_ds.mean("A") == 49.5
# Test ignore_nulls=False
assert nan_ds.mean("A", ignore_nulls=False) is None
# Test all nans
nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts)
assert nan_ds.mean("A") is None
@pytest.mark.parametrize("num_parts", [1, 30]) @pytest.mark.parametrize("num_parts", [1, 30])
@ -3387,6 +3596,35 @@ def test_groupby_arrow_std(ray_start_regular_shared, num_parts):
result = agg_ds.to_pandas()["std(B)"].to_numpy() result = agg_ds.to_pandas()["std(B)"].to_numpy()
expected = df.groupby("A")["B"].std(ddof=0).to_numpy() expected = df.groupby("A")["B"].std(ddof=0).to_numpy()
np.testing.assert_array_almost_equal(result, expected) np.testing.assert_array_almost_equal(result, expected)
# Test built-in std aggregation with nans
nan_df = pd.DataFrame({"A": [x % 3 for x in xs] + [0], "B": xs + [None]})
nan_grouped_ds = ray.data.from_pandas(nan_df).repartition(num_parts).groupby("A")
nan_agg_ds = nan_grouped_ds.std("B")
assert nan_agg_ds.count() == 3
result = nan_agg_ds.to_pandas()["std(B)"].to_numpy()
expected = nan_df.groupby("A")["B"].std().to_numpy()
np.testing.assert_array_almost_equal(result, expected)
# Test ignore_nulls=False
nan_agg_ds = nan_grouped_ds.std("B", ignore_nulls=False)
assert nan_agg_ds.count() == 3
result = nan_agg_ds.to_pandas()["std(B)"].to_numpy()
expected = nan_df.groupby("A")["B"].std()
expected[0] = None
np.testing.assert_array_almost_equal(result, expected)
# Test all nans
nan_df = pd.DataFrame({"A": [x % 3 for x in xs], "B": [None] * len(xs)})
nan_agg_ds = (
ray.data.from_pandas(nan_df)
.repartition(num_parts)
.groupby("A")
.std("B", ignore_nulls=False)
)
assert nan_agg_ds.count() == 3
result = nan_agg_ds.to_pandas()["std(B)"].to_numpy()
expected = pd.Series([None] * 3)
np.testing.assert_array_equal(result, expected)
# Test built-in global std aggregation
df = pd.DataFrame({"A": xs})
assert math.isclose(
@@ -3397,11 +3635,22 @@ def test_groupby_arrow_std(ray_start_regular_shared, num_parts):
ray.data.from_pandas(df).repartition(num_parts).std("A", ddof=0),
df["A"].std(ddof=0),
)
-with pytest.raises(ValueError):
-    ray.data.from_pandas(pd.DataFrame({"A": []})).std("A")
+# Test empty dataset
+assert ray.data.from_pandas(pd.DataFrame({"A": []})).std("A") is None
# Test edge cases
assert ray.data.from_pandas(pd.DataFrame({"A": [3]})).std("A") == 0
# Test built-in global std aggregation with nans
nan_df = pd.DataFrame({"A": xs + [None]})
nan_ds = ray.data.from_pandas(nan_df).repartition(num_parts)
assert math.isclose(nan_ds.std("A"), df["A"].std())
# Test ignore_nulls=False
assert nan_ds.std("A", ignore_nulls=False) is None
# Test all nans
nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts)
assert nan_ds.std("A") is None
@pytest.mark.parametrize("num_parts", [1, 30]) @pytest.mark.parametrize("num_parts", [1, 30])
def test_groupby_arrow_multicolumn(ray_start_regular_shared, num_parts): def test_groupby_arrow_multicolumn(ray_start_regular_shared, num_parts):
@@ -3421,6 +3670,7 @@ def test_groupby_arrow_multicolumn(ray_start_regular_shared, num_parts):
{"A": 1, "mean(B)": 49.0, "mean(C)": 98.0},
{"A": 2, "mean(B)": 50.0, "mean(C)": 100.0},
]

# Test that unspecified agg column ==> agg on all columns except for
# groupby keys.
agg_ds = ray.data.from_pandas(df).repartition(num_parts).groupby("A").mean()
@@ -3430,6 +3680,7 @@ def test_groupby_arrow_multicolumn(ray_start_regular_shared, num_parts):
{"A": 1, "mean(B)": 49.0, "mean(C)": 98.0},
{"A": 2, "mean(B)": 50.0, "mean(C)": 100.0},
]

# Test built-in global mean aggregation
df = pd.DataFrame({"A": xs, "B": [2 * x for x in xs]})
result_row = ray.data.from_pandas(df).repartition(num_parts).mean(["A", "B"])
@@ -3511,6 +3762,7 @@ def test_groupby_arrow_multi_agg(ray_start_regular_shared, num_parts):
np.testing.assert_array_equal(result, expected)

# Test built-in global std aggregation
df = pd.DataFrame({"A": xs})
result_row = (
ray.data.from_pandas(df)
.repartition(num_parts)
@@ -3549,6 +3801,7 @@ def test_groupby_simple(ray_start_regular_shared):
]
random.shuffle(xs)
ds = ray.data.from_items(xs, parallelism=parallelism)

# Mean aggregation
agg_ds = ds.groupby(lambda r: r[0]).aggregate(
AggregateFn(
@@ -3625,9 +3878,50 @@ def test_groupby_simple_sum(ray_start_regular_shared, num_parts):
)
assert agg_ds.count() == 3
assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 1683), (1, 1617), (2, 1650)]
# Test built-in sum aggregation with nans
nan_grouped_ds = (
ray.data.from_items(xs + [None])
.repartition(num_parts)
.groupby(lambda x: int(x or 0) % 3)
)
nan_agg_ds = nan_grouped_ds.sum()
assert nan_agg_ds.count() == 3
assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [
(0, 1683),
(1, 1617),
(2, 1650),
]
# Test ignore_nulls=False
nan_agg_ds = nan_grouped_ds.sum(ignore_nulls=False)
assert nan_agg_ds.count() == 3
assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [
(0, None),
(1, 1617),
(2, 1650),
]
# Test all nans
nan_agg_ds = (
ray.data.from_items([None] * len(xs))
.repartition(num_parts)
.groupby(lambda x: 0)
.sum()
)
assert nan_agg_ds.count() == 1
assert nan_agg_ds.sort(key=lambda r: r[0]).take(1) == [(0, None)]
# Test built-in global sum aggregation
assert ray.data.from_items(xs).repartition(num_parts).sum() == 4950
-assert ray.data.range(10).filter(lambda r: r > 10).sum() == 0
+assert ray.data.range(10).filter(lambda r: r > 10).sum() is None
# Test built-in global sum aggregation with nans
nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts)
assert nan_ds.sum() == 4950
# Test ignore_nulls=False
assert nan_ds.sum(ignore_nulls=False) is None
# Test all nans
nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts)
assert nan_ds.sum() is None
@pytest.mark.parametrize("num_parts", [1, 30]) @pytest.mark.parametrize("num_parts", [1, 30])
@ -3643,10 +3937,42 @@ def test_groupby_simple_min(ray_start_regular_shared, num_parts):
) )
assert agg_ds.count() == 3 assert agg_ds.count() == 3
assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 0), (1, 1), (2, 2)] assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 0), (1, 1), (2, 2)]
# Test built-in min aggregation with nans
nan_grouped_ds = (
ray.data.from_items(xs + [None])
.repartition(num_parts)
.groupby(lambda x: int(x or 0) % 3)
)
nan_agg_ds = nan_grouped_ds.min()
assert nan_agg_ds.count() == 3
assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 0), (1, 1), (2, 2)]
# Test ignore_nulls=False
nan_agg_ds = nan_grouped_ds.min(ignore_nulls=False)
assert nan_agg_ds.count() == 3
assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, None), (1, 1), (2, 2)]
# Test all nans
nan_agg_ds = (
ray.data.from_items([None] * len(xs))
.repartition(num_parts)
.groupby(lambda x: 0)
.min()
)
assert nan_agg_ds.count() == 1
assert nan_agg_ds.sort(key=lambda r: r[0]).take(1) == [(0, None)]
# Test built-in global min aggregation
assert ray.data.from_items(xs).repartition(num_parts).min() == 0
-with pytest.raises(ValueError):
-    ray.data.range(10).filter(lambda r: r > 10).min()
+assert ray.data.range(10).filter(lambda r: r > 10).min() is None
# Test built-in global min aggregation with nans
nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts)
assert nan_ds.min() == 0
# Test ignore_nulls=False
assert nan_ds.min(ignore_nulls=False) is None
# Test all nans
nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts)
assert nan_ds.min() is None
@pytest.mark.parametrize("num_parts", [1, 30]) @pytest.mark.parametrize("num_parts", [1, 30])
@ -3662,10 +3988,42 @@ def test_groupby_simple_max(ray_start_regular_shared, num_parts):
) )
assert agg_ds.count() == 3 assert agg_ds.count() == 3
assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 99), (1, 97), (2, 98)] assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 99), (1, 97), (2, 98)]
# Test built-in max aggregation with nans
nan_grouped_ds = (
ray.data.from_items(xs + [None])
.repartition(num_parts)
.groupby(lambda x: int(x or 0) % 3)
)
nan_agg_ds = nan_grouped_ds.max()
assert nan_agg_ds.count() == 3
assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 99), (1, 97), (2, 98)]
# Test ignore_nulls=False
nan_agg_ds = nan_grouped_ds.max(ignore_nulls=False)
assert nan_agg_ds.count() == 3
assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, None), (1, 97), (2, 98)]
# Test all nans
nan_agg_ds = (
ray.data.from_items([None] * len(xs))
.repartition(num_parts)
.groupby(lambda x: 0)
.max()
)
assert nan_agg_ds.count() == 1
assert nan_agg_ds.sort(key=lambda r: r[0]).take(1) == [(0, None)]
# Test built-in global max aggregation
assert ray.data.from_items(xs).repartition(num_parts).max() == 99
-with pytest.raises(ValueError):
-    ray.data.range(10).filter(lambda r: r > 10).max()
+assert ray.data.range(10).filter(lambda r: r > 10).max() is None
# Test built-in global max aggregation with nans
nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts)
assert nan_ds.max() == 99
# Test ignore_nulls=False
assert nan_ds.max(ignore_nulls=False) is None
# Test all nans
nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts)
assert nan_ds.max() is None
@pytest.mark.parametrize("num_parts", [1, 30]) @pytest.mark.parametrize("num_parts", [1, 30])
@ -3681,10 +4039,51 @@ def test_groupby_simple_mean(ray_start_regular_shared, num_parts):
) )
assert agg_ds.count() == 3 assert agg_ds.count() == 3
assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 49.5), (1, 49.0), (2, 50.0)] assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 49.5), (1, 49.0), (2, 50.0)]
# Test built-in mean aggregation with nans
nan_grouped_ds = (
ray.data.from_items(xs + [None])
.repartition(num_parts)
.groupby(lambda x: int(x or 0) % 3)
)
nan_agg_ds = nan_grouped_ds.mean()
assert nan_agg_ds.count() == 3
assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [
(0, 49.5),
(1, 49.0),
(2, 50.0),
]
# Test ignore_nulls=False
nan_agg_ds = nan_grouped_ds.mean(ignore_nulls=False)
assert nan_agg_ds.count() == 3
assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [
(0, None),
(1, 49.0),
(2, 50.0),
]
# Test all nans
nan_agg_ds = (
ray.data.from_items([None] * len(xs))
.repartition(num_parts)
.groupby(lambda x: 0)
.mean()
)
assert nan_agg_ds.count() == 1
assert nan_agg_ds.sort(key=lambda r: r[0]).take(1) == [(0, None)]
# Test built-in global mean aggregation
assert ray.data.from_items(xs).repartition(num_parts).mean() == 49.5
-with pytest.raises(ValueError):
-    ray.data.range(10).filter(lambda r: r > 10).mean()
+# Test empty dataset
+assert ray.data.range(10).filter(lambda r: r > 10).mean() is None
# Test built-in global mean aggregation with nans
nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts)
assert nan_ds.mean() == 49.5
# Test ignore_nulls=False
assert nan_ds.mean(ignore_nulls=False) is None
# Test all nans
nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts)
assert nan_ds.mean() is None
@pytest.mark.parametrize("num_parts", [1, 30]) @pytest.mark.parametrize("num_parts", [1, 30])
@ -3721,6 +4120,48 @@ def test_groupby_simple_std(ray_start_regular_shared, num_parts):
result_df = pd.DataFrame({"A": list(groups), "B": list(stds)}) result_df = pd.DataFrame({"A": list(groups), "B": list(stds)})
result_df = result_df.set_index("A") result_df = result_df.set_index("A")
pd.testing.assert_series_equal(result_df["B"], expected) pd.testing.assert_series_equal(result_df["B"], expected)
# Test built-in std aggregation with nans
nan_grouped_ds = (
ray.data.from_items(xs + [None])
.repartition(num_parts)
.groupby(lambda x: int(x or 0) % 3)
)
nan_agg_ds = nan_grouped_ds.std()
assert nan_agg_ds.count() == 3
nan_df = pd.DataFrame({"A": [x % 3 for x in xs] + [0], "B": xs + [None]})
expected = nan_df.groupby("A")["B"].std()
result = nan_agg_ds.sort(key=lambda r: r[0]).take(3)
groups, stds = zip(*result)
result_df = pd.DataFrame({"A": list(groups), "B": list(stds)})
result_df = result_df.set_index("A")
pd.testing.assert_series_equal(result_df["B"], expected)
# Test ignore_nulls=False
nan_agg_ds = nan_grouped_ds.std(ignore_nulls=False)
assert nan_agg_ds.count() == 3
expected = nan_df.groupby("A")["B"].std()
expected[0] = None
result = nan_agg_ds.sort(key=lambda r: r[0]).take(3)
groups, stds = zip(*result)
result_df = pd.DataFrame({"A": list(groups), "B": list(stds)})
result_df = result_df.set_index("A")
pd.testing.assert_series_equal(result_df["B"], expected)
# Test all nans
nan_agg_ds = (
ray.data.from_items([None] * len(xs))
.repartition(num_parts)
.groupby(lambda x: 0)
.std(ignore_nulls=False)
)
assert nan_agg_ds.count() == 1
expected = pd.Series([None], name="B")
expected.index.rename("A", inplace=True)
result = nan_agg_ds.sort(key=lambda r: r[0]).take(1)
groups, stds = zip(*result)
result_df = pd.DataFrame({"A": list(groups), "B": list(stds)})
result_df = result_df.set_index("A")
pd.testing.assert_series_equal(result_df["B"], expected)
# Test built-in global std aggregation
assert math.isclose(
ray.data.from_items(xs).repartition(num_parts).std(), pd.Series(xs).std()
@@ -3730,11 +4171,21 @@ def test_groupby_simple_std(ray_start_regular_shared, num_parts):
ray.data.from_items(xs).repartition(num_parts).std(ddof=0),
pd.Series(xs).std(ddof=0),
)
-with pytest.raises(ValueError):
-    ray.data.from_items([]).std()
+# Test empty dataset
+assert ray.data.from_items([]).std() is None
# Test edge cases
assert ray.data.from_items([3]).std() == 0
# Test built-in global std aggregation with nans
nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts)
assert math.isclose(nan_ds.std(), pd.Series(xs).std())
# Test ignore_nulls=False
assert nan_ds.std(ignore_nulls=False) is None
# Test all nans
nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts)
assert nan_ds.std() is None
@pytest.mark.parametrize("num_parts", [1, 30]) @pytest.mark.parametrize("num_parts", [1, 30])
def test_groupby_simple_multilambda(ray_start_regular_shared, num_parts): def test_groupby_simple_multilambda(ray_start_regular_shared, num_parts):
@ -3760,10 +4211,12 @@ def test_groupby_simple_multilambda(ray_start_regular_shared, num_parts):
assert ray.data.from_items([[x, 2 * x] for x in xs]).repartition(num_parts).mean( assert ray.data.from_items([[x, 2 * x] for x in xs]).repartition(num_parts).mean(
[lambda x: x[0], lambda x: x[1]] [lambda x: x[0], lambda x: x[1]]
) == (49.5, 99.0) ) == (49.5, 99.0)
-with pytest.raises(ValueError):
-    ray.data.from_items([[x, 2 * x] for x in range(10)]).filter(
-        lambda r: r[0] > 10
-    ).mean([lambda x: x[0], lambda x: x[1]])
+assert (
+    ray.data.from_items([[x, 2 * x] for x in range(10)])
+    .filter(lambda r: r[0] > 10)
+    .mean([lambda x: x[0], lambda x: x[1]])
+    is None
+)

@pytest.mark.parametrize("num_parts", [1, 30])