Mirror of https://github.com/vale981/ray (synced 2025-03-06 02:21:39 -05:00)
[Dataframe] Change pandas and ray.dataframe imports (#1942)
* fixing zero length partitions
* fixing bugs to fully handle zero len parts
* resolve comments
* renaming imports
Parent: fa0ade2bc5
Commit: 8560993b46
10 changed files with 408 additions and 332 deletions
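The rename itself is mechanical: modules stop aliasing the library as `pd` and call it through its full name, which (as the test changes below suggest) frees the short `pd` alias for `ray.dataframe` itself. A minimal sketch of the new convention; `make_example_frame` is an illustrative helper, not part of the patch:

    # Old convention removed by this commit:
    #   import pandas as pd
    #   df = pd.DataFrame({"a": [1, 2]})

    # New convention used throughout ray.dataframe:
    import pandas

    def make_example_frame():
        # Calling pandas through its unaliased module name keeps the short
        # "pd" name free for `import ray.dataframe as pd` in user code/tests.
        return pandas.DataFrame({"a": [1, 2]})

    print(make_example_frame())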
@@ -2,7 +2,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import pandas as pd
+import pandas
 # TODO: In the future `set_option` or similar needs to run on every node
 # in order to keep all pandas instances across nodes consistent
 from pandas import (eval, unique, value_counts, cut, to_numeric, factorize,
@@ -12,11 +12,11 @@ from pandas import (eval, unique, value_counts, cut, to_numeric, factorize,
                     set_option, NaT, PeriodIndex, Categorical)
 import threading
 
-pd_version = pd.__version__
-pd_major = int(pd_version.split(".")[0])
-pd_minor = int(pd_version.split(".")[1])
+pandas_version = pandas.__version__
+pandas_major = int(pandas_version.split(".")[0])
+pandas_minor = int(pandas_version.split(".")[1])
 
-if pd_major == 0 and pd_minor != 22:
+if pandas_major == 0 and pandas_minor != 22:
     raise Exception("In order to use Pandas on Ray, your pandas version must "
                     "be 0.22. You can run 'pip install pandas==0.22'")
 
[File diff suppressed because it is too large]
@@ -2,7 +2,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import pandas as pd
+import pandas
 import numpy as np
 import pandas.core.groupby
 from pandas.core.dtypes.common import is_list_like
@@ -12,6 +12,7 @@ import ray
 
 from .utils import _inherit_docstrings, _reindex_helper
 from .concat import concat
+from .index_metadata import _IndexMetadata
 
 
 @_inherit_docstrings(pandas.core.groupby.DataFrameGroupBy,
@@ -31,12 +32,13 @@ class DataFrameGroupBy(object):
 
         if axis == 0:
             partitions = [column for column in df._block_partitions.T]
-            self._index_grouped = pd.Series(self._index, index=self._index)\
+            self._index_grouped = \
+                pandas.Series(self._index, index=self._index) \
                 .groupby(by=by, sort=sort)
         else:
             partitions = [row for row in df._block_partitions]
             self._index_grouped = \
-                pd.Series(self._columns, index=self._columns) \
+                pandas.Series(self._columns, index=self._columns) \
                 .groupby(by=by, sort=sort)
 
         self._keys_and_values = [(k, v)
@@ -127,7 +129,7 @@ class DataFrameGroupBy(object):
 
     @property
     def groups(self):
-        return {k: pd.Index(v) for k, v in self._keys_and_values}
+        return {k: pandas.Index(v) for k, v in self._keys_and_values}
 
     def min(self, **kwargs):
         return self._apply_agg_function(lambda df: df.min(axis=self._axis,
@@ -194,7 +196,7 @@ class DataFrameGroupBy(object):
 
         result = [func(v) for k, v in self._iter]
         if self._axis == 0:
-            if isinstance(result[0], pd.Series):
+            if isinstance(result[0], pandas.Series):
                 # Applied an aggregation function
                 new_df = concat(result, axis=1).T
                 new_df.columns = self._columns
@@ -208,8 +210,11 @@
                     num_return_vals=len(new_df._block_partitions))
                     for block in new_df._block_partitions.T]).T
                 new_df.index = self._index
+                new_df._row_metadata = \
+                    _IndexMetadata(new_df._block_partitions[:, 0],
+                                   index=new_df.index, axis=0)
         else:
-            if isinstance(result[0], pd.Series):
+            if isinstance(result[0], pandas.Series):
                 # Applied an aggregation function
                 new_df = concat(result, axis=1)
                 new_df.columns = [k for k, v in self._iter]
@@ -223,6 +228,9 @@
                     num_return_vals=new_df._block_partitions.shape[1])
                     for block in new_df._block_partitions])
                 new_df.columns = self._columns
+                new_df._col_metadata = \
+                    _IndexMetadata(new_df._block_partitions[0, :],
+                                   index=new_df.columns, axis=1)
         return new_df
 
     @property
@@ -392,6 +400,9 @@
                 num_return_vals=len(new_df._block_partitions))
                 for block in new_df._block_partitions.T]).T
             new_df.index = sorted_index
+            new_df._row_metadata = \
+                _IndexMetadata(new_df._block_partitions[:, 0],
+                               index=new_df.index, axis=0)
 
         return new_df
 
@@ -447,6 +458,9 @@
                 num_return_vals=len(new_df._block_partitions))
                 for block in new_df._block_partitions.T]).T
             new_df.index = sorted_index
+            new_df._row_metadata = \
+                _IndexMetadata(new_df._block_partitions[:, 0],
+                               index=new_df.index, axis=0)
 
         return new_df
 
@@ -516,6 +530,9 @@
                 num_return_vals=len(new_df._block_partitions))
                 for block in new_df._block_partitions.T]).T
             new_df.index = self._index
+            new_df._row_metadata = \
+                _IndexMetadata(new_df._block_partitions[:, 0],
+                               index=new_df.index, axis=0)
         else:
             new_df._block_partitions = np.array([_reindex_helper._submit(
                 args=tuple([new_df.columns, self._columns, 0,
@@ -524,6 +541,9 @@
                 num_return_vals=new_df._block_partitions.shape[1])
                 for block in new_df._block_partitions])
             new_df.columns = self._columns
+            new_df._col_metadata = \
+                _IndexMetadata(new_df._block_partitions[0, :],
+                               index=new_df.columns, axis=1)
 
         return new_df
 
@@ -531,7 +551,7 @@
 @ray.remote
 def groupby(by, axis, level, as_index, sort, group_keys, squeeze, *df):
 
-    df = pd.concat(df, axis=axis)
+    df = pandas.concat(df, axis=axis)
 
     return [v for k, v in df.groupby(by=by,
                                      axis=axis,
@@ -1,11 +1,12 @@
-import pandas as pd
+import pandas
 import numpy as np
 import ray
 
 from .utils import (
     _build_row_lengths,
     _build_col_widths,
-    _build_coord_df)
+    _build_coord_df,
+    _check_empty)
 
 from pandas.core.indexing import convert_to_index_sliceable
 
@@ -32,12 +33,12 @@ class _IndexMetadata(object):
 
         Args:
             dfs ([ObjectID]): ObjectIDs of dataframe partitions
-            index (pd.Index): Index of the Ray DataFrame.
+            index (pandas.Index): Index of the Ray DataFrame.
             axis: Axis of partition (0=row partitions, 1=column partitions)
 
         Returns:
-            A IndexMetadata backed by the specified pd.Index, partitioned off
-            specified partitions
+            A IndexMetadata backed by the specified pandas.Index, partitioned
+            off specified partitions
         """
         assert (lengths_oid is None) == (coord_df_oid is None), \
             "Must pass both or neither of lengths_oid and coord_df_oid"
@@ -48,6 +49,9 @@ class _IndexMetadata(object):
             else:
                 lengths_oid = _build_col_widths.remote(dfs)
             coord_df_oid = _build_coord_df.remote(lengths_oid, index)
+            self._empty = _check_empty.remote(dfs)
+        else:
+            self._empty = True
 
         self._lengths = lengths_oid
         self._coord_df = coord_df_oid
@@ -115,7 +119,7 @@ class _IndexMetadata(object):
         This design is more straightforward than caching indexes on setting the
         coord_df to an OID due to the possibility of an OID-to-OID change.
         """
-        new_index = pd.DataFrame(index=new_index).index
+        new_index = pandas.DataFrame(index=new_index).index
         assert len(new_index) == len(self)
 
         self._index_cache = new_index
@@ -138,7 +142,7 @@ class _IndexMetadata(object):
             The Index object in _index_cache.
         """
         if self._index_cache_validator is None:
-            self._index_cache_validator = pd.RangeIndex(len(self))
+            self._index_cache_validator = pandas.RangeIndex(len(self))
         elif isinstance(self._index_cache_validator,
                         ray.ObjectID):
            self._index_cache_validator = ray.get(self._index_cache_validator)
@@ -157,6 +161,16 @@ class _IndexMetadata(object):
     # cache to accept ObjectIDs and ray.get them when needed.
     _index_cache = property(_get_index_cache, _set_index_cache)
 
+    def _get_empty(self):
+        if isinstance(self._empty_cache, ray.ObjectID):
+            self._empty_cache = ray.get(self._empty_cache)
+        return self._empty_cache
+
+    def _set_empty(self, empty):
+        self._empty_cache = empty
+
+    _empty = property(_get_empty, _set_empty)
+
     def coords_of(self, key):
         """Returns the coordinates (partition, index_within_partition) of the
         provided key in the index. Can be called on its own or implicitly
@@ -170,9 +184,9 @@ class _IndexMetadata(object):
 
         Returns:
             Pandas object with the keys specified. If key is a single object
-            it will be a pd.Series with items `partition` and
+            it will be a pandas.Series with items `partition` and
             `index_within_partition`, and if key is a slice or if the key is
-            duplicate it will be a pd.DataFrame with said items as columns.
+            duplicate it will be a pandas.DataFrame with said items as columns.
         """
         return self._coord_df.loc[key]
 
@@ -191,7 +205,7 @@ class _IndexMetadata(object):
                                'index_within_partition']
 
     def __len__(self):
-        return sum(self._lengths)
+        return int(sum(self._lengths))
 
     def reset_partition_coords(self, partitions=None):
         partitions = np.array(partitions)
@@ -200,7 +214,7 @@ class _IndexMetadata(object):
             partition_mask = (self._coord_df['partition'] == partition)
             # Since we are replacing columns with RangeIndex inside the
             # partition, we have to make sure that our reference to it is
-            # updated as well.
+            # upandasated as well.
             try:
                 self._coord_df.loc[partition_mask,
                                    'index_within_partition'] = np.arange(
@@ -263,7 +277,7 @@ class _IndexMetadata(object):
         # TODO: Determine if there's a better way to do a row-index insert in
        # pandas, because this is very annoying/unsure of efficiency
         # Create new coord entry to insert
-        coord_to_insert = pd.DataFrame(
+        coord_to_insert = pandas.DataFrame(
             {'partition': partition,
              'index_within_partition': index_within_partition},
             index=[key])
@@ -329,9 +343,9 @@ class _IndexMetadata(object):
         Returns:
             Pandas object with the keys specified. If key is a single object
-            it will be a pd.Series with items `partition` and
+            it will be a pandas.Series with items `partition` and
             `index_within_partition`, and if key is a slice or if the key is
-            duplicate it will be a pd.DataFrame with said items as columns.
+            duplicate it will be a pandas.DataFrame with said items as columns.
         """
         return self.coords_of(key)
 
@@ -355,26 +369,37 @@ class _IndexMetadata(object):
         """
         dropped = self.coords_of(labels)
 
-        # Update first lengths to prevent possible length inconsistencies
-        if isinstance(dropped, pd.DataFrame):
+        # Upandasate first lengths to prevent possible length inconsistencies
+        if isinstance(dropped, pandas.DataFrame):
             try:
                 drop_per_part = dropped.groupby(["partition"]).size()\
-                    .reindex(index=pd.RangeIndex(len(self._lengths)),
+                    .reindex(index=pandas.RangeIndex(len(self._lengths)),
                              fill_value=0)
             except ValueError:
                 # Copy the arrow sealed dataframe so we can mutate it.
                 dropped = dropped.copy()
                 drop_per_part = dropped.groupby(["partition"]).size()\
-                    .reindex(index=pd.RangeIndex(len(self._lengths)),
+                    .reindex(index=pandas.RangeIndex(len(self._lengths)),
                              fill_value=0)
-        elif isinstance(dropped, pd.Series):
+        elif isinstance(dropped, pandas.Series):
             drop_per_part = np.zeros_like(self._lengths)
             drop_per_part[dropped["partition"]] = 1
         else:
             raise AssertionError("Unrecognized result from `coords_of`")
-        self._lengths = self._lengths - drop_per_part
-
-        self._coord_df = self._coord_df.drop(labels, errors=errors)
+        self._lengths = self._lengths - np.array(drop_per_part)
+
+        new_coord_df = self._coord_df.drop(labels, errors=errors)
+
+        num_dropped = 0
+        for i, length in enumerate(self._lengths):
+            if length == 0:
+                num_dropped += 1
+            if num_dropped > 0:
+                new_coord_df['partition'][new_coord_df['partition'] == i] \
+                    -= num_dropped
+
+        self._coord_df = new_coord_df
         return dropped
 
     def rename_index(self, mapper):
@@ -13,7 +13,7 @@ _LocIndexer and _iLocIndexer is responsible for indexer specific logic and
 An illustration is available at
 https://github.com/ray-project/ray/pull/1955#issuecomment-386781826
 """
-import pandas as pd
+import pandas
 import numpy as np
 import ray
 from warnings import warn
@@ -96,7 +96,7 @@ def _is_enlargement(locator, coord_df):
     """
     if is_list_like(locator) and not is_slice(
             locator) and len(locator) > 0 and not is_boolean_array(locator):
-        n_diff_elems = len(pd.Index(locator).difference(coord_df.index))
+        n_diff_elems = len(pandas.Index(locator).difference(coord_df.index))
         is_enlargement_boolean = n_diff_elems > 0
         return is_enlargement_boolean
     return False
@@ -140,8 +140,8 @@ class _Location_Indexer_Base():
     def __getitem__(self, row_lookup, col_lookup, ndim):
         """
         Args:
-            row_lookup: A pd dataframe, a partial view from row_coord_df
-            col_lookup: A pd dataframe, a partial view from col_coord_df
+            row_lookup: A pandas dataframe, a partial view from row_coord_df
+            col_lookup: A pandas dataframe, a partial view from col_coord_df
             ndim: the dimension of returned data
         """
         if ndim == 2:
@@ -152,7 +152,7 @@ class _Location_Indexer_Base():
         result = ray.get(_blocks_to_col.remote(*extracted)).squeeze()
 
         if is_scalar(result):
-            result = pd.Series(result)
+            result = pandas.Series(result)
 
         scaler_axis = row_lookup if len(row_lookup) == 1 else col_lookup
         series_name = scaler_axis.iloc[0].name
@@ -213,8 +213,8 @@ class _Location_Indexer_Base():
     def __setitem__(self, row_lookup, col_lookup, item):
         """
         Args:
-            row_lookup: A pd dataframe, a partial view from row_coord_df
-            col_lookup: A pd dataframe, a partial view from col_coord_df
+            row_lookup: A pandas dataframe, a partial view from row_coord_df
+            col_lookup: A pandas dataframe, a partial view from col_coord_df
             item: The new item needs to be set. It can be any shape that's
                 broadcastable to the product of the lookup tables.
         """
@@ -348,14 +348,14 @@ class _Loc_Indexer(_Location_Indexer_Base):
             [self.block_oids, nan_blks], axis=0 if row_based_bool else 1)
 
         # 3. Prepare metadata to return
-        nan_coord_df = pd.DataFrame(data=[{
+        nan_coord_df = pandas.DataFrame(data=[{
             '': name,
             'partition': blk_part_n_row if row_based_bool else blk_part_n_col,
             'index_within_partition': i
         } for name, i in zip(nan_labels, np.arange(num_nan_labels))
         ]).set_index('')
 
-        coord_df = pd.concat([major_meta._coord_df, nan_coord_df])
+        coord_df = pandas.concat([major_meta._coord_df, nan_coord_df])
         coord_df = coord_df.loc[locator]  # Re-index that allows duplicates
 
         lens = major_meta._lengths
@@ -370,7 +370,7 @@ class _Loc_Indexer(_Location_Indexer_Base):
         Returns:
             nan_labels: The labels needs to be added
         """
-        locator_as_index = pd.Index(locator)
+        locator_as_index = pandas.Index(locator)
 
         nan_labels = locator_as_index.difference(base_index)
         common_labels = locator_as_index.intersection(base_index)
@@ -7,9 +7,9 @@ from io import BytesIO
 import os
 import re
 import warnings
+import pandas
 
 from pyarrow.parquet import ParquetFile
-import pandas as pd
 from pandas.io.common import _infer_compression  # don't depend on internal API
 
 
@@ -62,23 +62,23 @@ def _read_parquet_row_group(path, columns, row_group_id, kwargs={}):
 
 
 @ray.remote
-def _split_df(pd_df, chunksize):
-    """Split a pd_df into partitions.
+def _split_df(pandas_df, chunksize):
+    """Split a pandas_df into partitions.
 
     Returns:
         remote_df_ids ([ObjectID])
     """
     dataframes = []
 
-    while len(pd_df) > chunksize:
-        t_df = pd_df[:chunksize]
+    while len(pandas_df) > chunksize:
+        t_df = pandas_df[:chunksize]
         t_df.reset_index(drop=True)
         top = ray.put(t_df)
         dataframes.append(top)
-        pd_df = pd_df[chunksize:]
+        pandas_df = pandas_df[chunksize:]
     else:
-        pd_df = pd_df.reset_index(drop=True)
-        dataframes.append(ray.put(pd_df))
+        pandas_df = pandas_df.reset_index(drop=True)
+        dataframes.append(ray.put(pandas_df))
 
     return dataframes
 
@@ -122,7 +122,7 @@ def _get_firstline(file_path):
 
 
 def _infer_column(first_line, kwargs={}):
-    return pd.read_csv(BytesIO(first_line), **kwargs).columns
+    return pandas.read_csv(BytesIO(first_line), **kwargs).columns
 
 
 @ray.remote
@@ -133,11 +133,11 @@ def _read_csv_with_offset(fn, start, end, kwargs={}, header=b''):
     bio.seek(start)
     to_read = header + bio.read(end - start)
     bio.close()
-    pd_df = pd.read_csv(BytesIO(to_read), **kwargs)
-    index = pd_df.index
+    pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
+    index = pandas_df.index
     # Partitions must have RangeIndex
-    pd_df.index = pd.RangeIndex(0, len(pd_df))
-    return pd_df, index
+    pandas_df.index = pandas.RangeIndex(0, len(pandas_df))
+    return pandas_df, index
 
 
 @ray.remote
@@ -271,11 +271,11 @@ def read_csv(filepath_or_buffer,
         warnings.warn("Defaulting to Pandas implementation",
                       PendingDeprecationWarning)
 
-        pd_obj = pd.read_csv(filepath_or_buffer, **kwargs)
-        if isinstance(pd_obj, pd.DataFrame):
-            return from_pandas(pd_obj, get_npartitions())
+        pandas_obj = pandas.read_csv(filepath_or_buffer, **kwargs)
+        if isinstance(pandas_obj, pandas.DataFrame):
+            return from_pandas(pandas_obj, get_npartitions())
 
-        return pd_obj
+        return pandas_obj
 
     filepath = filepath_or_buffer
 
@@ -332,10 +332,11 @@ def read_json(path_or_buf=None,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_json(path_or_buf, orient, typ, dtype,
-                              convert_axes, convert_dates, keep_default_dates,
-                              numpy, precise_float, date_unit, encoding,
-                              lines, chunksize, compression)
+    port_frame = pandas.read_json(path_or_buf, orient, typ, dtype,
+                                  convert_axes, convert_dates,
+                                  keep_default_dates, numpy, precise_float,
+                                  date_unit, encoding, lines, chunksize,
+                                  compression)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -360,10 +361,10 @@ def read_html(io,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_html(io, match, flavor, header, index_col,
-                              skiprows, attrs, parse_dates, tupleize_cols,
-                              thousands, encoding, decimal, converters,
-                              na_values, keep_default_na)
+    port_frame = pandas.read_html(io, match, flavor, header, index_col,
+                                  skiprows, attrs, parse_dates, tupleize_cols,
+                                  thousands, encoding, decimal, converters,
+                                  na_values, keep_default_na)
     ray_frame = from_pandas(port_frame[0], get_npartitions())
 
     return ray_frame
@@ -374,7 +375,7 @@ def read_clipboard(sep=r'\s+'):
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_clipboard(sep)
+    port_frame = pandas.read_clipboard(sep)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -403,11 +404,11 @@ def read_excel(io,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_excel(io, sheet_name, header, skiprows, skip_footer,
-                               index_col, names, usecols, parse_dates,
-                               date_parser, na_values, thousands,
-                               convert_float, converters, dtype, true_values,
-                               false_values, engine, squeeze)
+    port_frame = pandas.read_excel(io, sheet_name, header, skiprows,
+                                   skip_footer, index_col, names, usecols,
+                                   parse_dates, date_parser, na_values,
+                                   thousands, convert_float, converters, dtype,
+                                   true_values, false_values, engine, squeeze)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -420,7 +421,7 @@ def read_hdf(path_or_buf,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_hdf(path_or_buf, key, mode)
+    port_frame = pandas.read_hdf(path_or_buf, key, mode)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
    return ray_frame
@@ -432,7 +433,7 @@ def read_feather(path,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_feather(path)
+    port_frame = pandas.read_feather(path)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -445,7 +446,7 @@ def read_msgpack(path_or_buf,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_msgpack(path_or_buf, encoding, iterator)
+    port_frame = pandas.read_msgpack(path_or_buf, encoding, iterator)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -466,10 +467,10 @@ def read_stata(filepath_or_buffer,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_stata(filepath_or_buffer, convert_dates,
-                               convert_categoricals, encoding, index_col,
-                               convert_missing, preserve_dtypes, columns,
-                               order_categoricals, chunksize, iterator)
+    port_frame = pandas.read_stata(filepath_or_buffer, convert_dates,
+                                   convert_categoricals, encoding, index_col,
+                                   convert_missing, preserve_dtypes, columns,
+                                   order_categoricals, chunksize, iterator)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -485,8 +486,8 @@ def read_sas(filepath_or_buffer,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_sas(filepath_or_buffer, format, index, encoding,
-                             chunksize, iterator)
+    port_frame = pandas.read_sas(filepath_or_buffer, format, index, encoding,
+                                 chunksize, iterator)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -498,7 +499,7 @@ def read_pickle(path,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_pickle(path, compression)
+    port_frame = pandas.read_pickle(path, compression)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -516,8 +517,8 @@ def read_sql(sql,
     warnings.warn("Defaulting to Pandas implementation",
                   PendingDeprecationWarning)
 
-    port_frame = pd.read_sql(sql, con, index_col, coerce_float, params,
-                             parse_dates, columns, chunksize)
+    port_frame = pandas.read_sql(sql, con, index_col, coerce_float, params,
+                                 parse_dates, columns, chunksize)
     ray_frame = from_pandas(port_frame, get_npartitions())
 
     return ray_frame
@@ -5,17 +5,17 @@ from __future__ import print_function
 import inspect
 
 
-def code_gen(pd_obj, ray_obj, path):
+def code_gen(pandas_obj, ray_obj, path):
     """Generate code skeleton for methods not in Ray
 
     Args:
-        pd_obj: The pandas object to generate code from.
+        pandas_obj: The pandas object to generate code from.
         ray_obj: The ray object to diff against.
         path: Path to output the file to.
     """
 
     with open(path, "w") as outfile:
-        funcs = pandas_ray_diff(pd_obj, ray_obj)
+        funcs = pandas_ray_diff(pandas_obj, ray_obj)
 
         for func in funcs:
             if func[0] == "_" and func[1] != "_":
@@ -24,9 +24,10 @@ def code_gen(pd_obj, ray_obj, path):
                 # let's not mess with these
                 continue
             try:
-                outfile.write("\ndef " + func +
-                              str(inspect.signature(getattr(pd_obj, func))) +
-                              ":\n")
+                outfile.write(
+                    "\ndef " + func +
+                    str(inspect.signature(getattr(pandas_obj, func))) + ":\n")
+
             except TypeError:
                 outfile.write("\n@property")
                 outfile.write("\ndef " + func + "(self):\n")
@@ -73,21 +74,21 @@ def code_gen_test(ray_obj, path, name):
         outfile.write(")\n")
 
 
-def pandas_ray_diff(pd_obj, ray_obj):
+def pandas_ray_diff(pandas_obj, ray_obj):
     """Gets the diff of the methods in the Pandas and Ray objects.
 
     Args:
-        pd_obj: The Pandas object to diff.
+        pandas_obj: The Pandas object to diff.
         ray_obj: The Ray object to diff.
 
     Returns:
         A list of method names that are different between the two.
     """
-    pd_funcs = dir(pd_obj)
+    pandas_funcs = dir(pandas_obj)
     ray_funcs = dir(ray_obj)
 
-    pd_funcs = set(filter(lambda f: f[0] != "_" or f[1] == "_",
-                          pd_funcs))
+    pandas_funcs = set(filter(lambda f: f[0] != "_" or f[1] == "_",
+                              pandas_funcs))
 
-    diff = [x for x in pd_funcs if x not in set(ray_funcs)]
+    diff = [x for x in pandas_funcs if x not in set(ray_funcs)]
     return diff
@@ -3,7 +3,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-import pandas as pd
+import pandas
 
 from .utils import _inherit_docstrings
 
@@ -14,7 +14,8 @@ def na_op():
     raise NotImplementedError("Not Yet implemented.")
 
 
-@_inherit_docstrings(pd.Series, excluded=[pd.Series, pd.Series.__init__])
+@_inherit_docstrings(pandas.Series, excluded=[pandas.Series,
+                                              pandas.Series.__init__])
 class Series(object):
 
     def __init__(self, series_oids):
@@ -832,7 +833,7 @@ class Series(object):
     def unstack(self, level=-1, fill_value=None):
         raise NotImplementedError("Not Yet implemented.")
 
-    def update(self, other):
+    def upandasate(self, other):
         raise NotImplementedError("Not Yet implemented.")
 
     def valid(self, inplace=False, **kwargs):
@@ -3,12 +3,12 @@ from __future__ import division
 from __future__ import print_function
 
 import pytest
-import ray.dataframe as rdf
+import ray.dataframe as pd
 
 
 @pytest.fixture
 def create_test_series():
-    return rdf.Series(None)
+    return pd.Series(None)
 
 
 def test_T():
@@ -3,7 +3,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import pandas as pd
+import pandas
 import numpy as np
 import ray
 
@@ -119,14 +119,14 @@ def _get_nan_block_id(n_row=1, n_col=1, transpose=False):
     shape = (n_row, n_col)
     if shape not in _NAN_BLOCKS:
         arr = np.tile(np.array(np.NaN), shape)
-        _NAN_BLOCKS[shape] = ray.put(pd.DataFrame(data=arr))
+        _NAN_BLOCKS[shape] = ray.put(pandas.DataFrame(data=arr))
     return _NAN_BLOCKS[shape]
 
 
 def _get_lengths(df):
     """Gets the length of the dataframe.
     Args:
-        df: A remote pd.DataFrame object.
+        df: A remote pandas.DataFrame object.
     Returns:
         Returns an integer length of the dataframe object. If the attempt
         fails, returns 0 as the length.
@@ -142,7 +142,7 @@ def _get_lengths(df):
 def _get_widths(df):
     """Gets the width (number of columns) of the dataframe.
     Args:
-        df: A remote pd.DataFrame object.
+        df: A remote pandas.DataFrame object.
     Returns:
         Returns an integer width of the dataframe object. If the attempt
         fails, returns 0 as the length.
@@ -155,6 +155,11 @@ def _get_widths(df):
         return 0
 
 
+def _get_empty(df):
+    """Return True if the DataFrame is empty"""
+    return df.empty
+
+
 def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None):
     """Partitions a Pandas DataFrame object.
     Args:
@@ -178,10 +183,10 @@ def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None):
     row_partitions = []
     while len(temp_df) > row_chunksize:
         t_df = temp_df[:row_chunksize]
-        # reset_index here because we want a pd.RangeIndex
+        # reset_index here because we want a pandas.RangeIndex
         # within the partitions. It is smaller and sometimes faster.
         t_df.reset_index(drop=True, inplace=True)
-        t_df.columns = pd.RangeIndex(0, len(t_df.columns))
+        t_df.columns = pandas.RangeIndex(0, len(t_df.columns))
         top = ray.put(t_df)
         row_partitions.append(top)
         temp_df = temp_df[row_chunksize:]
@@ -190,7 +195,7 @@ def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None):
         # This call is necessary to prevent modifying original df
         temp_df = temp_df[:]
         temp_df.reset_index(drop=True, inplace=True)
-        temp_df.columns = pd.RangeIndex(0, len(temp_df.columns))
+        temp_df.columns = pandas.RangeIndex(0, len(temp_df.columns))
         row_partitions.append(ray.put(temp_df))
 
     return row_partitions
@@ -223,10 +228,10 @@ def to_pandas(df):
     Returns:
         A new pandas DataFrame.
     """
-    pd_df = pd.concat(ray.get(df._row_partitions), copy=False)
-    pd_df.index = df.index
-    pd_df.columns = df.columns
-    return pd_df
+    pandas_df = pandas.concat(ray.get(df._row_partitions), copy=False)
+    pandas_df.index = df.index
+    pandas_df.columns = df.columns
+    return pandas_df
 
 
 @ray.remote
@@ -342,17 +347,27 @@ def _build_row_lengths(df_row):
 @ray.remote
 def _build_coord_df(lengths, index):
     """Build the coordinate dataframe over all partitions."""
-    coords = np.vstack([np.column_stack((np.full(l, i), np.arange(l)))
-                        for i, l in enumerate(lengths)])
-
+    filtered_lengths = [x for x in lengths if x > 0]
+    coords = None
+    if len(filtered_lengths) > 0:
+        coords = np.vstack([np.column_stack((np.full(l, i), np.arange(l)))
+                            for i, l in enumerate(filtered_lengths)])
     col_names = ("partition", "index_within_partition")
-    return pd.DataFrame(coords, index=index, columns=col_names)
+    return pandas.DataFrame(coords, index=index, columns=col_names)
 
 
+@ray.remote
+def _check_empty(dfs):
+    """Check if all partitions are empty"""
+    return all(ray.get([_deploy_func.remote(_get_empty, d) for d in dfs]))
+
+
 def _create_block_partitions(partitions, axis=0, length=None):
 
     if length is not None and length != 0 and get_npartitions() > length:
         npartitions = length
+    elif length == 0:
+        npartitions = 1
     else:
         npartitions = get_npartitions()
 
@@ -385,8 +400,8 @@ def create_blocks_helper(df, npartitions, axis):
         if df.shape[axis ^ 1] % npartitions == 0 \
         else df.shape[axis ^ 1] // npartitions + 1
 
-    # if not isinstance(df.columns, pd.RangeIndex):
-    #     df.columns = pd.RangeIndex(0, len(df.columns))
+    # if not isinstance(df.columns, pandas.RangeIndex):
+    #     df.columns = pandas.RangeIndex(0, len(df.columns))
 
     blocks = [df.iloc[:, i * block_size: (i + 1) * block_size]
               if axis == 0
@@ -394,7 +409,7 @@ def create_blocks_helper(df, npartitions, axis):
               for i in range(npartitions)]
 
     for block in blocks:
-        block.columns = pd.RangeIndex(0, len(block.columns))
+        block.columns = pandas.RangeIndex(0, len(block.columns))
         block.reset_index(inplace=True, drop=True)
     return blocks
 
@@ -403,20 +418,20 @@ def create_blocks_helper(df, npartitions, axis):
 @ray.remote
 def _blocks_to_col(*partition):
     if len(partition):
-        return pd.concat(partition, axis=0, copy=False)\
+        return pandas.concat(partition, axis=0, copy=False)\
             .reset_index(drop=True)
     else:
-        return pd.Series()
+        return pandas.Series()
 
 
 @memoize
 @ray.remote
 def _blocks_to_row(*partition):
-    row_part = pd.concat(partition, axis=1, copy=False)\
+    row_part = pandas.concat(partition, axis=1, copy=False)\
         .reset_index(drop=True)
     # Because our block partitions contain different indices (for the
     # columns), this change is needed to ensure correctness.
-    row_part.columns = pd.RangeIndex(0, len(row_part.columns))
+    row_part.columns = pandas.RangeIndex(0, len(row_part.columns))
     return row_part
 
 
@@ -468,7 +483,7 @@ def _reindex_helper(old_index, new_index, axis, npartitions, *df):
     Returns:
         A new set of blocks made up of DataFrames.
     """
-    df = pd.concat(df, axis=axis ^ 1)
+    df = pandas.concat(df, axis=axis ^ 1)
     if axis == 1:
         df.index = old_index
     elif axis == 0:
@@ -497,12 +512,12 @@ def _co_op_helper(func, left_columns, right_columns, left_df_len, left_idx,
     Returns:
         A new set of blocks for the partitioned DataFrame.
     """
-    left = pd.concat(zipped[:left_df_len], axis=1, copy=False).copy()
+    left = pandas.concat(zipped[:left_df_len], axis=1, copy=False).copy()
     left.columns = left_columns
     if left_idx is not None:
         left.index = left_idx
 
-    right = pd.concat(zipped[left_df_len:], axis=1, copy=False).copy()
+    right = pandas.concat(zipped[left_df_len:], axis=1, copy=False).copy()
     right.columns = right_columns
 
     new_rows = func(left, right)
@@ -546,7 +561,7 @@ def _match_partitioning(column_partition, lengths, index):
     column_partition.index = index
     for length in lengths:
         if len(column_partition) == 0:
-            partitioned_list.append(pd.DataFrame(columns=columns))
+            partitioned_list.append(pandas.DataFrame(columns=columns))
             continue
 
         partitioned_list.append(column_partition.iloc[:length, :])
@@ -570,4 +585,4 @@ def fix_blocks_dimensions(blocks, axis):
 @ray.remote
 def _compile_remote_dtypes(*column_of_blocks):
     small_dfs = [df.loc[0:0] for df in column_of_blocks]
-    return pd.concat(small_dfs, copy=False).dtypes
+    return pandas.concat(small_dfs, copy=False).dtypes
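The zero-length-partition handling in `_build_coord_df` above can be exercised without Ray. A standalone sketch, assuming only numpy and pandas, with the `@ray.remote` decorator dropped so it runs locally; the sample lengths are hypothetical:

    import numpy as np
    import pandas

    def build_coord_frame(lengths, index):
        # Zero-length partitions are filtered out so np.vstack never receives
        # an empty block; coords stays None when every partition is empty.
        filtered_lengths = [length for length in lengths if length > 0]
        coords = None
        if len(filtered_lengths) > 0:
            coords = np.vstack([np.column_stack((np.full(l, i), np.arange(l)))
                                for i, l in enumerate(filtered_lengths)])
        col_names = ("partition", "index_within_partition")
        return pandas.DataFrame(coords, index=index, columns=col_names)

    # Two non-empty partitions of two rows each, with an empty one in between:
    # the coordinate frame still lines up with the four rows of the index.
    print(build_coord_frame([2, 0, 2], index=list("abcd")))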