Mirror of https://github.com/vale981/ray (synced 2025-03-06 02:21:39 -05:00)
[Dataframe] Change pandas and ray.dataframe imports (#1942)
* fixing zero length partitions
* fixing bugs to fully handle zero len parts
* resolve comments
* renaming imports
Parent: fa0ade2bc5
Commit: 8560993b46
10 changed files with 408 additions and 332 deletions
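The rename itself is mechanical: modules stop aliasing the library as `pd` and call it through its full name, which (as the test changes below suggest) frees the short `pd` alias for `ray.dataframe` itself. A minimal sketch of the new convention; `make_example_frame` is an illustrative helper, not part of the patch:

    # Old convention removed by this commit:
    #   import pandas as pd
    #   df = pd.DataFrame({"a": [1, 2]})

    # New convention used throughout ray.dataframe:
    import pandas

    def make_example_frame():
        # Calling pandas through its unaliased module name keeps the short
        # "pd" name free for `import ray.dataframe as pd` in user code/tests.
        return pandas.DataFrame({"a": [1, 2]})

    print(make_example_frame())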
@@ -2,7 +2,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import pandas as pd
+import pandas
 # TODO: In the future `set_option` or similar needs to run on every node
 # in order to keep all pandas instances across nodes consistent
 from pandas import (eval, unique, value_counts, cut, to_numeric, factorize,
@@ -12,11 +12,11 @@ from pandas import (eval, unique, value_counts, cut, to_numeric, factorize,
                     set_option, NaT, PeriodIndex, Categorical)
 import threading
 
-pd_version = pd.__version__
-pd_major = int(pd_version.split(".")[0])
-pd_minor = int(pd_version.split(".")[1])
+pandas_version = pandas.__version__
+pandas_major = int(pandas_version.split(".")[0])
+pandas_minor = int(pandas_version.split(".")[1])
 
-if pd_major == 0 and pd_minor != 22:
+if pandas_major == 0 and pandas_minor != 22:
     raise Exception("In order to use Pandas on Ray, your pandas version must "
                     "be 0.22. You can run 'pip install pandas==0.22'")
 
[File diff suppressed because it is too large]
@@ -2,7 +2,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import pandas as pd
+import pandas
 import numpy as np
 import pandas.core.groupby
 from pandas.core.dtypes.common import is_list_like
@@ -12,6 +12,7 @@ import ray
 
 from .utils import _inherit_docstrings, _reindex_helper
 from .concat import concat
+from .index_metadata import _IndexMetadata
 
 
 @_inherit_docstrings(pandas.core.groupby.DataFrameGroupBy,
@@ -31,12 +32,13 @@ class DataFrameGroupBy(object):
 
         if axis == 0:
             partitions = [column for column in df._block_partitions.T]
-            self._index_grouped = pd.Series(self._index, index=self._index)\
+            self._index_grouped = \
+                pandas.Series(self._index, index=self._index) \
                 .groupby(by=by, sort=sort)
         else:
             partitions = [row for row in df._block_partitions]
             self._index_grouped = \
-                pd.Series(self._columns, index=self._columns) \
+                pandas.Series(self._columns, index=self._columns) \
                 .groupby(by=by, sort=sort)
 
         self._keys_and_values = [(k, v)
@@ -127,7 +129,7 @@ class DataFrameGroupBy(object):
 
     @property
     def groups(self):
-        return {k: pd.Index(v) for k, v in self._keys_and_values}
+        return {k: pandas.Index(v) for k, v in self._keys_and_values}
 
     def min(self, **kwargs):
         return self._apply_agg_function(lambda df: df.min(axis=self._axis,
@@ -194,7 +196,7 @@ class DataFrameGroupBy(object):
 
         result = [func(v) for k, v in self._iter]
         if self._axis == 0:
-            if isinstance(result[0], pd.Series):
+            if isinstance(result[0], pandas.Series):
                 # Applied an aggregation function
                 new_df = concat(result, axis=1).T
                 new_df.columns = self._columns
@@ -208,8 +210,11 @@
                     num_return_vals=len(new_df._block_partitions))
                     for block in new_df._block_partitions.T]).T
                 new_df.index = self._index
+                new_df._row_metadata = \
+                    _IndexMetadata(new_df._block_partitions[:, 0],
+                                   index=new_df.index, axis=0)
         else:
-            if isinstance(result[0], pd.Series):
+            if isinstance(result[0], pandas.Series):
                 # Applied an aggregation function
                 new_df = concat(result, axis=1)
                 new_df.columns = [k for k, v in self._iter]
@@ -223,6 +228,9 @@
                     num_return_vals=new_df._block_partitions.shape[1])
                     for block in new_df._block_partitions])
                 new_df.columns = self._columns
+                new_df._col_metadata = \
+                    _IndexMetadata(new_df._block_partitions[0, :],
+                                   index=new_df.columns, axis=1)
         return new_df
 
     @property
@@ -392,6 +400,9 @@
                 num_return_vals=len(new_df._block_partitions))
                 for block in new_df._block_partitions.T]).T
             new_df.index = sorted_index
+            new_df._row_metadata = \
+                _IndexMetadata(new_df._block_partitions[:, 0],
+                               index=new_df.index, axis=0)
 
         return new_df
 
@@ -447,6 +458,9 @@
                 num_return_vals=len(new_df._block_partitions))
                 for block in new_df._block_partitions.T]).T
             new_df.index = sorted_index
+            new_df._row_metadata = \
+                _IndexMetadata(new_df._block_partitions[:, 0],
+                               index=new_df.index, axis=0)
 
         return new_df
 
@@ -516,6 +530,9 @@
                 num_return_vals=len(new_df._block_partitions))
                 for block in new_df._block_partitions.T]).T
             new_df.index = self._index
+            new_df._row_metadata = \
+                _IndexMetadata(new_df._block_partitions[:, 0],
+                               index=new_df.index, axis=0)
         else:
             new_df._block_partitions = np.array([_reindex_helper._submit(
                 args=tuple([new_df.columns, self._columns, 0,
@@ -524,6 +541,9 @@
                 num_return_vals=new_df._block_partitions.shape[1])
                 for block in new_df._block_partitions])
             new_df.columns = self._columns
+            new_df._col_metadata = \
+                _IndexMetadata(new_df._block_partitions[0, :],
+                               index=new_df.columns, axis=1)
 
         return new_df
 
@@ -531,7 +551,7 @@
 @ray.remote
 def groupby(by, axis, level, as_index, sort, group_keys, squeeze, *df):
 
-    df = pd.concat(df, axis=axis)
+    df = pandas.concat(df, axis=axis)
 
     return [v for k, v in df.groupby(by=by,
                                      axis=axis,
@@ -1,11 +1,12 @@
-import pandas as pd
+import pandas
 import numpy as np
 import ray
 
 from .utils import (
     _build_row_lengths,
     _build_col_widths,
-    _build_coord_df)
+    _build_coord_df,
+    _check_empty)
 
 from pandas.core.indexing import convert_to_index_sliceable
 
@@ -32,12 +33,12 @@ class _IndexMetadata(object):
 
         Args:
             dfs ([ObjectID]): ObjectIDs of dataframe partitions
-            index (pd.Index): Index of the Ray DataFrame.
+            index (pandas.Index): Index of the Ray DataFrame.
             axis: Axis of partition (0=row partitions, 1=column partitions)
 
         Returns:
-            A IndexMetadata backed by the specified pd.Index, partitioned off
-            specified partitions
+            A IndexMetadata backed by the specified pandas.Index, partitioned
+            off specified partitions
         """
         assert (lengths_oid is None) == (coord_df_oid is None), \
             "Must pass both or neither of lengths_oid and coord_df_oid"
@@ -48,6 +49,9 @@ class _IndexMetadata(object):
             else:
                 lengths_oid = _build_col_widths.remote(dfs)
             coord_df_oid = _build_coord_df.remote(lengths_oid, index)
+            self._empty = _check_empty.remote(dfs)
+        else:
+            self._empty = True
 
         self._lengths = lengths_oid
         self._coord_df = coord_df_oid
@@ -115,7 +119,7 @@ class _IndexMetadata(object):
         This design is more straightforward than caching indexes on setting the
         coord_df to an OID due to the possibility of an OID-to-OID change.
         """
-        new_index = pd.DataFrame(index=new_index).index
+        new_index = pandas.DataFrame(index=new_index).index
         assert len(new_index) == len(self)
 
         self._index_cache = new_index
@@ -138,7 +142,7 @@ class _IndexMetadata(object):
             The Index object in _index_cache.
         """
         if self._index_cache_validator is None:
-            self._index_cache_validator = pd.RangeIndex(len(self))
+            self._index_cache_validator = pandas.RangeIndex(len(self))
         elif isinstance(self._index_cache_validator,
                         ray.ObjectID):
            self._index_cache_validator = ray.get(self._index_cache_validator)
@@ -157,6 +161,16 @@ class _IndexMetadata(object):
     # cache to accept ObjectIDs and ray.get them when needed.
     _index_cache = property(_get_index_cache, _set_index_cache)
 
+    def _get_empty(self):
+        if isinstance(self._empty_cache, ray.ObjectID):
+            self._empty_cache = ray.get(self._empty_cache)
+        return self._empty_cache
+
+    def _set_empty(self, empty):
+        self._empty_cache = empty
+
+    _empty = property(_get_empty, _set_empty)
+
     def coords_of(self, key):
         """Returns the coordinates (partition, index_within_partition) of the
         provided key in the index. Can be called on its own or implicitly
@@ -170,9 +184,9 @@ class _IndexMetadata(object):
 
         Returns:
             Pandas object with the keys specified. If key is a single object
-            it will be a pd.Series with items `partition` and
+            it will be a pandas.Series with items `partition` and
             `index_within_partition`, and if key is a slice or if the key is
-            duplicate it will be a pd.DataFrame with said items as columns.
+            duplicate it will be a pandas.DataFrame with said items as columns.
         """
         return self._coord_df.loc[key]
 
@@ -191,7 +205,7 @@ class _IndexMetadata(object):
                                'index_within_partition']
 
     def __len__(self):
-        return sum(self._lengths)
+        return int(sum(self._lengths))
 
     def reset_partition_coords(self, partitions=None):
         partitions = np.array(partitions)
@@ -200,7 +214,7 @@ class _IndexMetadata(object):
             partition_mask = (self._coord_df['partition'] == partition)
             # Since we are replacing columns with RangeIndex inside the
             # partition, we have to make sure that our reference to it is
-            # updated as well.
+            # upandasated as well.
             try:
                 self._coord_df.loc[partition_mask,
                                    'index_within_partition'] = np.arange(
@@ -263,7 +277,7 @@ class _IndexMetadata(object):
         # TODO: Determine if there's a better way to do a row-index insert in
        # pandas, because this is very annoying/unsure of efficiency
         # Create new coord entry to insert
-        coord_to_insert = pd.DataFrame(
+        coord_to_insert = pandas.DataFrame(
             {'partition': partition,
              'index_within_partition': index_within_partition},
             index=[key])
@@ -329,9 +343,9 @@ class _IndexMetadata(object):
         Returns:
             Pandas object with the keys specified. If key is a single object
-            it will be a pd.Series with items `partition` and
+            it will be a pandas.Series with items `partition` and
             `index_within_partition`, and if key is a slice or if the key is
-            duplicate it will be a pd.DataFrame with said items as columns.
+            duplicate it will be a pandas.DataFrame with said items as columns.
         """
         return self.coords_of(key)
 
@@ -355,26 +369,37 @@ class _IndexMetadata(object):
         """
         dropped = self.coords_of(labels)
 
-        # Update first lengths to prevent possible length inconsistencies
-        if isinstance(dropped, pd.DataFrame):
+        # Upandasate first lengths to prevent possible length inconsistencies
+        if isinstance(dropped, pandas.DataFrame):
             try:
                 drop_per_part = dropped.groupby(["partition"]).size()\
-                    .reindex(index=pd.RangeIndex(len(self._lengths)),
+                    .reindex(index=pandas.RangeIndex(len(self._lengths)),
                              fill_value=0)
             except ValueError:
                 # Copy the arrow sealed dataframe so we can mutate it.
                 dropped = dropped.copy()
                 drop_per_part = dropped.groupby(["partition"]).size()\
-                    .reindex(index=pd.RangeIndex(len(self._lengths)),
+                    .reindex(index=pandas.RangeIndex(len(self._lengths)),
                              fill_value=0)
-        elif isinstance(dropped, pd.Series):
+        elif isinstance(dropped, pandas.Series):
             drop_per_part = np.zeros_like(self._lengths)
             drop_per_part[dropped["partition"]] = 1
         else:
             raise AssertionError("Unrecognized result from `coords_of`")
-        self._lengths = self._lengths - drop_per_part
-
-        self._coord_df = self._coord_df.drop(labels, errors=errors)
+        self._lengths = self._lengths - np.array(drop_per_part)
+
+        new_coord_df = self._coord_df.drop(labels, errors=errors)
+
+        num_dropped = 0
+        for i, length in enumerate(self._lengths):
+            if length == 0:
+                num_dropped += 1
+            if num_dropped > 0:
+                new_coord_df['partition'][new_coord_df['partition'] == i] \
+                    -= num_dropped
+
+        self._coord_df = new_coord_df
         return dropped
 
     def rename_index(self, mapper):
@@ -13,7 +13,7 @@ _LocIndexer and _iLocIndexer is responsible for indexer specific logic and
 An illustration is available at
 https://github.com/ray-project/ray/pull/1955#issuecomment-386781826
 """
-import pandas as pd
+import pandas
 import numpy as np
 import ray
 from warnings import warn
@@ -96,7 +96,7 @@ def _is_enlargement(locator, coord_df):
     """
     if is_list_like(locator) and not is_slice(
             locator) and len(locator) > 0 and not is_boolean_array(locator):
-        n_diff_elems = len(pd.Index(locator).difference(coord_df.index))
+        n_diff_elems = len(pandas.Index(locator).difference(coord_df.index))
         is_enlargement_boolean = n_diff_elems > 0
         return is_enlargement_boolean
     return False
@@ -140,8 +140,8 @@ class _Location_Indexer_Base():
     def __getitem__(self, row_lookup, col_lookup, ndim):
         """
         Args:
-            row_lookup: A pd dataframe, a partial view from row_coord_df
-            col_lookup: A pd dataframe, a partial view from col_coord_df
+            row_lookup: A pandas dataframe, a partial view from row_coord_df
+            col_lookup: A pandas dataframe, a partial view from col_coord_df
             ndim: the dimension of returned data
         """
         if ndim == 2:
@@ -152,7 +152,7 @@ class _Location_Indexer_Base():
         result = ray.get(_blocks_to_col.remote(*extracted)).squeeze()
 
         if is_scalar(result):
-            result = pd.Series(result)
+            result = pandas.Series(result)
 
         scaler_axis = row_lookup if len(row_lookup) == 1 else col_lookup
         series_name = scaler_axis.iloc[0].name
@@ -213,8 +213,8 @@ class _Location_Indexer_Base():
     def __setitem__(self, row_lookup, col_lookup, item):
         """
         Args:
-            row_lookup: A pd dataframe, a partial view from row_coord_df
-            col_lookup: A pd dataframe, a partial view from col_coord_df
+            row_lookup: A pandas dataframe, a partial view from row_coord_df
+            col_lookup: A pandas dataframe, a partial view from col_coord_df
             item: The new item needs to be set. It can be any shape that's
                 broadcastable to the product of the lookup tables.
         """
@@ -348,14 +348,14 @@ class _Loc_Indexer(_Location_Indexer_Base):
             [self.block_oids, nan_blks], axis=0 if row_based_bool else 1)
 
         # 3. Prepare metadata to return
-        nan_coord_df = pd.DataFrame(data=[{
+        nan_coord_df = pandas.DataFrame(data=[{
             '': name,
             'partition': blk_part_n_row if row_based_bool else blk_part_n_col,
             'index_within_partition': i
         } for name, i in zip(nan_labels, np.arange(num_nan_labels))
         ]).set_index('')
 
-        coord_df = pd.concat([major_meta._coord_df, nan_coord_df])
+        coord_df = pandas.concat([major_meta._coord_df, nan_coord_df])
         coord_df = coord_df.loc[locator]  # Re-index that allows duplicates
 
         lens = major_meta._lengths
@@ -370,7 +370,7 @@ class _Loc_Indexer(_Location_Indexer_Base):
         Returns:
             nan_labels: The labels needs to be added
         """
-        locator_as_index = pd.Index(locator)
+        locator_as_index = pandas.Index(locator)
 
         nan_labels = locator_as_index.difference(base_index)
         common_labels = locator_as_index.intersection(base_index)
@@ -7,9 +7,9 @@ from io import BytesIO
 import os
 import re
 import warnings
+import pandas
 
 from pyarrow.parquet import ParquetFile
-import pandas as pd
 from pandas.io.common import _infer_compression  # don't depend on internal API
 
 
@@ -62,23 +62,23 @@ def _read_parquet_row_group(path, columns, row_group_id, kwargs={}):
 
 
 @ray.remote
-def _split_df(pd_df, chunksize):
-    """Split a pd_df into partitions.
+def _split_df(pandas_df, chunksize):
+    """Split a pandas_df into partitions.
 
     Returns:
         remote_df_ids ([ObjectID])
     """
     dataframes = []
 
-    while len(pd_df) > chunksize:
-        t_df = pd_df[:chunksize]
+    while len(pandas_df) > chunksize:
+        t_df = pandas_df[:chunksize]
         t_df.reset_index(drop=True)
         top = ray.put(t_df)
         dataframes.append(top)
-        pd_df = pd_df[chunksize:]
+        pandas_df = pandas_df[chunksize:]
     else:
-        pd_df = pd_df.reset_index(drop=True)
-        dataframes.append(ray.put(pd_df))
+        pandas_df = pandas_df.reset_index(drop=True)
+        dataframes.append(ray.put(pandas_df))
 
     return dataframes
 
@@ -122,7 +122,7 @@ def _get_firstline(file_path):
 
 
 def _infer_column(first_line, kwargs={}):
-    return pd.read_csv(BytesIO(first_line), **kwargs).columns
+    return pandas.read_csv(BytesIO(first_line), **kwargs).columns
 
 
 @ray.remote
@@ -133,11 +133,11 @@ def _read_csv_with_offset(fn, start, end, kwargs={}, header=b''):
     bio.seek(start)
     to_read = header + bio.read(end - start)
     bio.close()
-    pd_df = pd.read_csv(BytesIO(to_read), **kwargs)
-    index = pd_df.index
+    pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
+    index = pandas_df.index
     # Partitions must have RangeIndex
-    pd_df.index = pd.RangeIndex(0, len(pd_df))
-    return pd_df, index
+    pandas_df.index = pandas.RangeIndex(0, len(pandas_df))
+    return pandas_df, index
 
 
 @ray.remote
@@ -271,11 +271,11 @@ def read_csv(filepath_or_buffer,
         warnings.warn("Defaulting to Pandas implementation",
                       PendingDeprecationWarning)
 
-        pd_obj = pd.read_csv(filepath_or_buffer, **kwargs)
-        if isinstance(pd_obj, pd.DataFrame):
-            return from_pandas(pd_obj, get_npartitions())
+        pandas_obj = pandas.read_csv(filepath_or_buffer, **kwargs)
+        if isinstance(pandas_obj, pandas.DataFrame):
+            return from_pandas(pandas_obj, get_npartitions())
 
-        return pd_obj
+        return pandas_obj
 
     filepath = filepath_or_buffer
 
@@ -332,10 +332,11 @@ def read_json(path_or_buf=None,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_json(path_or_buf, orient, typ, dtype,
-                              convert_axes, convert_dates, keep_default_dates,
-                              numpy, precise_float, date_unit, encoding,
-                              lines, chunksize, compression)
+    port_frame = pandas.read_json(path_or_buf, orient, typ, dtype,
+                                  convert_axes, convert_dates,
+                                  keep_default_dates, numpy, precise_float,
+                                  date_unit, encoding, lines, chunksize,
+                                  compression)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -360,10 +361,10 @@ def read_html(io,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_html(io, match, flavor, header, index_col,
-                              skiprows, attrs, parse_dates, tupleize_cols,
-                              thousands, encoding, decimal, converters,
-                              na_values, keep_default_na)
+    port_frame = pandas.read_html(io, match, flavor, header, index_col,
+                                  skiprows, attrs, parse_dates, tupleize_cols,
+                                  thousands, encoding, decimal, converters,
+                                  na_values, keep_default_na)
     ray_frame = from_pandas(port_frame[0], get_npartitions())
 
     return ray_frame
@@ -374,7 +375,7 @@ def read_clipboard(sep=r'\s+'):
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_clipboard(sep)
+    port_frame = pandas.read_clipboard(sep)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -403,11 +404,11 @@ def read_excel(io,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_excel(io, sheet_name, header, skiprows, skip_footer,
-                               index_col, names, usecols, parse_dates,
-                               date_parser, na_values, thousands,
-                               convert_float, converters, dtype, true_values,
-                               false_values, engine, squeeze)
+    port_frame = pandas.read_excel(io, sheet_name, header, skiprows,
+                                   skip_footer, index_col, names, usecols,
+                                   parse_dates, date_parser, na_values,
+                                   thousands, convert_float, converters, dtype,
+                                   true_values, false_values, engine, squeeze)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -420,7 +421,7 @@ def read_hdf(path_or_buf,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_hdf(path_or_buf, key, mode)
+    port_frame = pandas.read_hdf(path_or_buf, key, mode)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
    return ray_frame
@@ -432,7 +433,7 @@ def read_feather(path,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_feather(path)
+    port_frame = pandas.read_feather(path)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -445,7 +446,7 @@ def read_msgpack(path_or_buf,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_msgpack(path_or_buf, encoding, iterator)
+    port_frame = pandas.read_msgpack(path_or_buf, encoding, iterator)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -466,10 +467,10 @@ def read_stata(filepath_or_buffer,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_stata(filepath_or_buffer, convert_dates,
-                               convert_categoricals, encoding, index_col,
-                               convert_missing, preserve_dtypes, columns,
-                               order_categoricals, chunksize, iterator)
+    port_frame = pandas.read_stata(filepath_or_buffer, convert_dates,
+                                   convert_categoricals, encoding, index_col,
+                                   convert_missing, preserve_dtypes, columns,
+                                   order_categoricals, chunksize, iterator)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -485,8 +486,8 @@ def read_sas(filepath_or_buffer,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_sas(filepath_or_buffer, format, index, encoding,
-                             chunksize, iterator)
+    port_frame = pandas.read_sas(filepath_or_buffer, format, index, encoding,
+                                 chunksize, iterator)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -498,7 +499,7 @@ def read_pickle(path,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_pickle(path, compression)
+    port_frame = pandas.read_pickle(path, compression)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -516,8 +517,8 @@ def read_sql(sql,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_sql(sql, con, index_col, coerce_float, params,
-                             parse_dates, columns, chunksize)
+    port_frame = pandas.read_sql(sql, con, index_col, coerce_float, params,
+                                 parse_dates, columns, chunksize)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -5,17 +5,17 @@ from __future__ import print_function
 import inspect
 
 
-def code_gen(pd_obj, ray_obj, path):
+def code_gen(pandas_obj, ray_obj, path):
     """Generate code skeleton for methods not in Ray
 
     Args:
-        pd_obj: The pandas object to generate code from.
+        pandas_obj: The pandas object to generate code from.
         ray_obj: The ray object to diff against.
         path: Path to output the file to.
     """
 
     with open(path, "w") as outfile:
-        funcs = pandas_ray_diff(pd_obj, ray_obj)
+        funcs = pandas_ray_diff(pandas_obj, ray_obj)
 
         for func in funcs:
             if func[0] == "_" and func[1] != "_":
@@ -24,9 +24,10 @@ def code_gen(pd_obj, ray_obj, path):
                 # let's not mess with these
                 continue
             try:
-                outfile.write("\ndef " + func +
-                              str(inspect.signature(getattr(pd_obj, func))) +
-                              ":\n")
+                outfile.write(
+                    "\ndef " + func +
+                    str(inspect.signature(getattr(pandas_obj, func))) + ":\n")
+
             except TypeError:
                 outfile.write("\n@property")
                 outfile.write("\ndef " + func + "(self):\n")
@@ -73,21 +74,21 @@ def code_gen_test(ray_obj, path, name):
         outfile.write(")\n")
 
 
-def pandas_ray_diff(pd_obj, ray_obj):
+def pandas_ray_diff(pandas_obj, ray_obj):
     """Gets the diff of the methods in the Pandas and Ray objects.
 
     Args:
-        pd_obj: The Pandas object to diff.
+        pandas_obj: The Pandas object to diff.
         ray_obj: The Ray object to diff.
 
     Returns:
         A list of method names that are different between the two.
     """
-    pd_funcs = dir(pd_obj)
+    pandas_funcs = dir(pandas_obj)
     ray_funcs = dir(ray_obj)
 
-    pd_funcs = set(filter(lambda f: f[0] != "_" or f[1] == "_",
-                          pd_funcs))
+    pandas_funcs = set(filter(lambda f: f[0] != "_" or f[1] == "_",
+                              pandas_funcs))
 
-    diff = [x for x in pd_funcs if x not in set(ray_funcs)]
+    diff = [x for x in pandas_funcs if x not in set(ray_funcs)]
     return diff
@@ -3,7 +3,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-import pandas as pd
+import pandas
 
 from .utils import _inherit_docstrings
 
@@ -14,7 +14,8 @@ def na_op():
     raise NotImplementedError("Not Yet implemented.")
 
 
-@_inherit_docstrings(pd.Series, excluded=[pd.Series, pd.Series.__init__])
+@_inherit_docstrings(pandas.Series, excluded=[pandas.Series,
+                                              pandas.Series.__init__])
 class Series(object):
 
     def __init__(self, series_oids):
@@ -832,7 +833,7 @@ class Series(object):
     def unstack(self, level=-1, fill_value=None):
         raise NotImplementedError("Not Yet implemented.")
 
-    def update(self, other):
+    def upandasate(self, other):
         raise NotImplementedError("Not Yet implemented.")
 
     def valid(self, inplace=False, **kwargs):
@@ -3,12 +3,12 @@ from __future__ import division
 from __future__ import print_function
 
 import pytest
-import ray.dataframe as rdf
+import ray.dataframe as pd
 
 
 @pytest.fixture
 def create_test_series():
-    return rdf.Series(None)
+    return pd.Series(None)
 
 
 def test_T():
@@ -3,7 +3,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import pandas as pd
+import pandas
 import numpy as np
 import ray
 
@@ -119,14 +119,14 @@ def _get_nan_block_id(n_row=1, n_col=1, transpose=False):
     shape = (n_row, n_col)
     if shape not in _NAN_BLOCKS:
         arr = np.tile(np.array(np.NaN), shape)
-        _NAN_BLOCKS[shape] = ray.put(pd.DataFrame(data=arr))
+        _NAN_BLOCKS[shape] = ray.put(pandas.DataFrame(data=arr))
     return _NAN_BLOCKS[shape]
 
 
 def _get_lengths(df):
     """Gets the length of the dataframe.
     Args:
-        df: A remote pd.DataFrame object.
+        df: A remote pandas.DataFrame object.
     Returns:
         Returns an integer length of the dataframe object. If the attempt
         fails, returns 0 as the length.
@@ -142,7 +142,7 @@ def _get_lengths(df):
 def _get_widths(df):
     """Gets the width (number of columns) of the dataframe.
     Args:
-        df: A remote pd.DataFrame object.
+        df: A remote pandas.DataFrame object.
     Returns:
         Returns an integer width of the dataframe object. If the attempt
         fails, returns 0 as the length.
@@ -155,6 +155,11 @@ def _get_widths(df):
         return 0
 
 
+def _get_empty(df):
+    """Return True if the DataFrame is empty"""
+    return df.empty
+
+
 def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None):
     """Partitions a Pandas DataFrame object.
     Args:
@@ -178,10 +183,10 @@ def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None):
     row_partitions = []
     while len(temp_df) > row_chunksize:
         t_df = temp_df[:row_chunksize]
-        # reset_index here because we want a pd.RangeIndex
+        # reset_index here because we want a pandas.RangeIndex
         # within the partitions. It is smaller and sometimes faster.
         t_df.reset_index(drop=True, inplace=True)
-        t_df.columns = pd.RangeIndex(0, len(t_df.columns))
+        t_df.columns = pandas.RangeIndex(0, len(t_df.columns))
         top = ray.put(t_df)
         row_partitions.append(top)
         temp_df = temp_df[row_chunksize:]
@@ -190,7 +195,7 @@ def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None):
         # This call is necessary to prevent modifying original df
         temp_df = temp_df[:]
         temp_df.reset_index(drop=True, inplace=True)
-        temp_df.columns = pd.RangeIndex(0, len(temp_df.columns))
+        temp_df.columns = pandas.RangeIndex(0, len(temp_df.columns))
         row_partitions.append(ray.put(temp_df))
 
     return row_partitions
@@ -223,10 +228,10 @@ def to_pandas(df):
     Returns:
         A new pandas DataFrame.
     """
-    pd_df = pd.concat(ray.get(df._row_partitions), copy=False)
-    pd_df.index = df.index
-    pd_df.columns = df.columns
-    return pd_df
+    pandas_df = pandas.concat(ray.get(df._row_partitions), copy=False)
+    pandas_df.index = df.index
+    pandas_df.columns = df.columns
+    return pandas_df
 
 
 @ray.remote
@@ -342,17 +347,27 @@ def _build_row_lengths(df_row):
 @ray.remote
 def _build_coord_df(lengths, index):
     """Build the coordinate dataframe over all partitions."""
-    coords = np.vstack([np.column_stack((np.full(l, i), np.arange(l)))
-                        for i, l in enumerate(lengths)])
-
+    filtered_lengths = [x for x in lengths if x > 0]
+    coords = None
+    if len(filtered_lengths) > 0:
+        coords = np.vstack([np.column_stack((np.full(l, i), np.arange(l)))
+                            for i, l in enumerate(filtered_lengths)])
     col_names = ("partition", "index_within_partition")
-    return pd.DataFrame(coords, index=index, columns=col_names)
+    return pandas.DataFrame(coords, index=index, columns=col_names)
 
 
+@ray.remote
+def _check_empty(dfs):
+    """Check if all partitions are empty"""
+    return all(ray.get([_deploy_func.remote(_get_empty, d) for d in dfs]))
+
+
 def _create_block_partitions(partitions, axis=0, length=None):
 
     if length is not None and length != 0 and get_npartitions() > length:
         npartitions = length
+    elif length == 0:
+        npartitions = 1
     else:
         npartitions = get_npartitions()
 
@@ -385,8 +400,8 @@ def create_blocks_helper(df, npartitions, axis):
         if df.shape[axis ^ 1] % npartitions == 0 \
         else df.shape[axis ^ 1] // npartitions + 1
 
-    # if not isinstance(df.columns, pd.RangeIndex):
-    #     df.columns = pd.RangeIndex(0, len(df.columns))
+    # if not isinstance(df.columns, pandas.RangeIndex):
+    #     df.columns = pandas.RangeIndex(0, len(df.columns))
 
     blocks = [df.iloc[:, i * block_size: (i + 1) * block_size]
               if axis == 0
@@ -394,7 +409,7 @@ def create_blocks_helper(df, npartitions, axis):
               for i in range(npartitions)]
 
     for block in blocks:
-        block.columns = pd.RangeIndex(0, len(block.columns))
+        block.columns = pandas.RangeIndex(0, len(block.columns))
         block.reset_index(inplace=True, drop=True)
     return blocks
 
@@ -403,20 +418,20 @@ def create_blocks_helper(df, npartitions, axis):
 @ray.remote
 def _blocks_to_col(*partition):
     if len(partition):
-        return pd.concat(partition, axis=0, copy=False)\
+        return pandas.concat(partition, axis=0, copy=False)\
             .reset_index(drop=True)
     else:
-        return pd.Series()
+        return pandas.Series()
 
 
 @memoize
 @ray.remote
 def _blocks_to_row(*partition):
-    row_part = pd.concat(partition, axis=1, copy=False)\
+    row_part = pandas.concat(partition, axis=1, copy=False)\
         .reset_index(drop=True)
     # Because our block partitions contain different indices (for the
     # columns), this change is needed to ensure correctness.
-    row_part.columns = pd.RangeIndex(0, len(row_part.columns))
+    row_part.columns = pandas.RangeIndex(0, len(row_part.columns))
     return row_part
 
 
@@ -468,7 +483,7 @@ def _reindex_helper(old_index, new_index, axis, npartitions, *df):
     Returns:
         A new set of blocks made up of DataFrames.
     """
-    df = pd.concat(df, axis=axis ^ 1)
+    df = pandas.concat(df, axis=axis ^ 1)
     if axis == 1:
         df.index = old_index
     elif axis == 0:
@@ -497,12 +512,12 @@ def _co_op_helper(func, left_columns, right_columns, left_df_len, left_idx,
     Returns:
         A new set of blocks for the partitioned DataFrame.
     """
-    left = pd.concat(zipped[:left_df_len], axis=1, copy=False).copy()
+    left = pandas.concat(zipped[:left_df_len], axis=1, copy=False).copy()
     left.columns = left_columns
     if left_idx is not None:
         left.index = left_idx
 
-    right = pd.concat(zipped[left_df_len:], axis=1, copy=False).copy()
+    right = pandas.concat(zipped[left_df_len:], axis=1, copy=False).copy()
     right.columns = right_columns
 
     new_rows = func(left, right)
@@ -546,7 +561,7 @@ def _match_partitioning(column_partition, lengths, index):
     column_partition.index = index
     for length in lengths:
         if len(column_partition) == 0:
-            partitioned_list.append(pd.DataFrame(columns=columns))
+            partitioned_list.append(pandas.DataFrame(columns=columns))
             continue
 
         partitioned_list.append(column_partition.iloc[:length, :])
@@ -570,4 +585,4 @@ def fix_blocks_dimensions(blocks, axis):
 @ray.remote
 def _compile_remote_dtypes(*column_of_blocks):
     small_dfs = [df.loc[0:0] for df in column_of_blocks]
-    return pd.concat(small_dfs, copy=False).dtypes
+    return pandas.concat(small_dfs, copy=False).dtypes
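The zero-length-partition handling in `_build_coord_df` above can be exercised without Ray. A standalone sketch, assuming only numpy and pandas, with the `@ray.remote` decorator dropped so it runs locally; the sample lengths are hypothetical:

    import numpy as np
    import pandas

    def build_coord_frame(lengths, index):
        # Zero-length partitions are filtered out so np.vstack never receives
        # an empty block; coords stays None when every partition is empty.
        filtered_lengths = [length for length in lengths if length > 0]
        coords = None
        if len(filtered_lengths) > 0:
            coords = np.vstack([np.column_stack((np.full(l, i), np.arange(l)))
                                for i, l in enumerate(filtered_lengths)])
        col_names = ("partition", "index_within_partition")
        return pandas.DataFrame(coords, index=index, columns=col_names)

    # Two non-empty partitions of two rows each, with an empty one in between:
    # the coordinate frame still lines up with the four rows of the index.
    print(build_coord_frame([2, 0, 2], index=list("abcd")))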