From 74cca3b28414e05e8302f3559e866a0ccf4d5b4c Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Sat, 26 May 2018 11:24:01 -0700 Subject: [PATCH] [DataFrame] Fixing the code formatting of the tests (#2123) * Fixing the code formatting of the tests * Fixing tests and removing from_pandas * Addressing comment * Addressing comments * Fix lint --- python/ray/dataframe/test/test_dataframe.py | 1340 +++++++++---------- 1 file changed, 606 insertions(+), 734 deletions(-) diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index 537110b6b..6edf121ac 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -4,177 +4,45 @@ from __future__ import print_function import pytest import numpy as np -import pandas as pd +import pandas import pandas.util.testing as tm -import ray.dataframe as rdf -from ray.dataframe.utils import ( - from_pandas, - to_pandas) - from pandas.tests.frame.common import TestData +import ray.dataframe as pd +from ray.dataframe.utils import to_pandas @pytest.fixture def ray_df_equals_pandas(ray_df, pandas_df): - return to_pandas(ray_df).sort_index().equals(pandas_df.sort_index()) + return to_pandas(ray_df).equals(pandas_df) @pytest.fixture def ray_series_equals_pandas(ray_df, pandas_df): - return ray_df.sort_index().equals(pandas_df.sort_index()) + return ray_df.equals(pandas_df) @pytest.fixture def ray_df_equals(ray_df1, ray_df2): - return to_pandas(ray_df1).sort_index().equals( - to_pandas(ray_df2).sort_index() - ) - - -@pytest.fixture -def test_roundtrip(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df, pandas_df)) - - -@pytest.fixture -def test_index(ray_df, pandas_df): - assert(ray_df.index.equals(pandas_df.index)) - ray_df_cp = ray_df.copy() - pandas_df_cp = pandas_df.copy() - - ray_df_cp.index = [str(i) for i in ray_df_cp.index] - pandas_df_cp.index = [str(i) for i in pandas_df_cp.index] - assert(ray_df_cp.index.sort_values().equals(pandas_df_cp.index)) - - -@pytest.fixture -def test_size(ray_df, pandas_df): - assert(ray_df.size == pandas_df.size) - - -@pytest.fixture -def test_ndim(ray_df, pandas_df): - assert(ray_df.ndim == pandas_df.ndim) - - -@pytest.fixture -def test_ftypes(ray_df, pandas_df): - assert(ray_df.ftypes.equals(pandas_df.ftypes)) - - -@pytest.fixture -def test_dtypes(ray_df, pandas_df): - assert(ray_df.dtypes.equals(pandas_df.dtypes)) - - -@pytest.fixture -def test_values(ray_df, pandas_df): - np.testing.assert_equal(ray_df.values, pandas_df.values) - - -@pytest.fixture -def test_axes(ray_df, pandas_df): - for ray_axis, pd_axis in zip(ray_df.axes, pandas_df.axes): - assert (np.array_equal(ray_axis, pd_axis)) - - -@pytest.fixture -def test_shape(ray_df, pandas_df): - assert(ray_df.shape == pandas_df.shape) - - -@pytest.fixture -def test_add_prefix(ray_df, pandas_df): - test_prefix = "TEST" - new_ray_df = ray_df.add_prefix(test_prefix) - new_pandas_df = pandas_df.add_prefix(test_prefix) - assert(new_ray_df.columns.equals(new_pandas_df.columns)) - - -@pytest.fixture -def test_add_suffix(ray_df, pandas_df): - test_suffix = "TEST" - new_ray_df = ray_df.add_suffix(test_suffix) - new_pandas_df = pandas_df.add_suffix(test_suffix) - - assert(new_ray_df.columns.equals(new_pandas_df.columns)) - - -@pytest.fixture -def test_applymap(ray_df, pandas_df, testfunc): - new_ray_df = ray_df.applymap(testfunc) - new_pandas_df = pandas_df.applymap(testfunc) - - assert(ray_df_equals_pandas(new_ray_df, new_pandas_df)) - - -@pytest.fixture -def test_copy(ray_df): - new_ray_df = ray_df.copy() - - assert new_ray_df is not ray_df - assert np.array_equal(new_ray_df._block_partitions, - ray_df._block_partitions) - - -@pytest.fixture -def test_sum(ray_df, pandas_df): - assert(ray_df.sum().sort_index().equals(pandas_df.sum().sort_index())) - - -@pytest.fixture -def test_abs(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df.abs(), pandas_df.abs())) - - -@pytest.fixture -def test_keys(ray_df, pandas_df): - assert(ray_df.keys().equals(pandas_df.keys())) - - -@pytest.fixture -def test_transpose(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df.T, pandas_df.T)) - assert(ray_df_equals_pandas(ray_df.transpose(), pandas_df.transpose())) - - -@pytest.fixture -def test_get(ray_df, pandas_df, key): - assert(ray_df.get(key).equals(pandas_df.get(key))) - assert ray_df.get( - key, default='default').equals( - pandas_df.get(key, default='default')) - - -@pytest.fixture -def test_get_dtype_counts(ray_df, pandas_df): - assert(ray_df.get_dtype_counts().equals(pandas_df.get_dtype_counts())) - - -@pytest.fixture -def test_get_ftype_counts(ray_df, pandas_df): - assert(ray_df.get_ftype_counts().equals(pandas_df.get_ftype_counts())) + return to_pandas(ray_df1).equals(to_pandas(ray_df2)) @pytest.fixture def create_test_dataframe(): - df = pd.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0]}) - - return from_pandas(df, 2) + return pd.DataFrame({'col1': [0, 1, 2, 3], + 'col2': [4, 5, 6, 7], + 'col3': [8, 9, 10, 11], + 'col4': [12, 13, 14, 15], + 'col5': [0, 0, 0, 0]}) def test_int_dataframe(): - - pandas_df = pd.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0]}) - ray_df = from_pandas(pandas_df, 2) + frame_data = {'col1': [0, 1, 2, 3], + 'col2': [4, 5, 6, 7], + 'col3': [8, 9, 10, 11], + 'col4': [12, 13, 14, 15], + 'col5': [0, 0, 0, 0]} + pandas_df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) testfuncs = [lambda x: x + 1, lambda x: str(x), @@ -195,7 +63,6 @@ def test_int_dataframe(): 'like': 'col'} test_filter(ray_df, pandas_df, filter_by) - test_roundtrip(ray_df, pandas_df) test_index(ray_df, pandas_df) test_size(ray_df, pandas_df) test_ndim(ray_df, pandas_df) @@ -224,9 +91,12 @@ def test_int_dataframe(): test_var(ray_df, pandas_df) test_std(ray_df, pandas_df) test_median(ray_df, pandas_df) - test_quantile(ray_df, pandas_df, .25) - test_quantile(ray_df, pandas_df, .5) - test_quantile(ray_df, pandas_df, .75) + + quantiles = [0.25, 0.5, 0.75, 0.66, 0.01] + + for q in quantiles: + test_quantile(ray_df, pandas_df, q) + test_describe(ray_df, pandas_df) test_diff(ray_df, pandas_df) test_rank(ray_df, pandas_df) @@ -336,14 +206,14 @@ def test_int_dataframe(): def test_float_dataframe(): + frame_data = {'col1': [0.0, 1.0, 2.0, 3.0], + 'col2': [4.0, 5.0, 6.0, 7.0], + 'col3': [8.0, 9.0, 10.0, 11.0], + 'col4': [12.0, 13.0, 14.0, 15.0], + 'col5': [0.0, 0.0, 0.0, 0.0]} - pandas_df = pd.DataFrame({'col1': [0.0, 1.0, 2.0, 3.0], - 'col2': [4.0, 5.0, 6.0, 7.0], - 'col3': [8.0, 9.0, 10.0, 11.0], - 'col4': [12.0, 13.0, 14.0, 15.0], - 'col5': [0.0, 0.0, 0.0, 0.0]}) - - ray_df = from_pandas(pandas_df, 3) + pandas_df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) testfuncs = [lambda x: x + 1, lambda x: str(x), @@ -364,7 +234,6 @@ def test_float_dataframe(): 'like': 'col'} test_filter(ray_df, pandas_df, filter_by) - test_roundtrip(ray_df, pandas_df) test_index(ray_df, pandas_df) test_size(ray_df, pandas_df) test_ndim(ray_df, pandas_df) @@ -394,9 +263,12 @@ def test_float_dataframe(): # test_var(ray_df, pandas_df) test_std(ray_df, pandas_df) test_median(ray_df, pandas_df) - test_quantile(ray_df, pandas_df, .25) - test_quantile(ray_df, pandas_df, .5) - test_quantile(ray_df, pandas_df, .75) + + quantiles = [0.25, 0.5, 0.75, 0.66, 0.01] + + for q in quantiles: + test_quantile(ray_df, pandas_df, q) + test_describe(ray_df, pandas_df) test_diff(ray_df, pandas_df) test_rank(ray_df, pandas_df) @@ -507,13 +379,13 @@ def test_float_dataframe(): def test_mixed_dtype_dataframe(): - pandas_df = pd.DataFrame({ - 'col1': [1, 2, 3, 4], - 'col2': [4, 5, 6, 7], - 'col3': [8.0, 9.4, 10.1, 11.3], - 'col4': ['a', 'b', 'c', 'd']}) + frame_data = {'col1': [1, 2, 3, 4], + 'col2': [4, 5, 6, 7], + 'col3': [8.0, 9.4, 10.1, 11.3], + 'col4': ['a', 'b', 'c', 'd']} - ray_df = from_pandas(pandas_df, 2) + pandas_df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) testfuncs = [lambda x: x + x, lambda x: str(x), @@ -533,7 +405,6 @@ def test_mixed_dtype_dataframe(): 'like': 'col'} test_filter(ray_df, pandas_df, filter_by) - test_roundtrip(ray_df, pandas_df) test_index(ray_df, pandas_df) test_size(ray_df, pandas_df) test_ndim(ray_df, pandas_df) @@ -565,9 +436,12 @@ def test_mixed_dtype_dataframe(): # test_var(ray_df, pandas_df) test_std(ray_df, pandas_df) test_median(ray_df, pandas_df) - test_quantile(ray_df, pandas_df, .25) - test_quantile(ray_df, pandas_df, .5) - test_quantile(ray_df, pandas_df, .75) + + quantiles = [0.25, 0.5, 0.75, 0.66, 0.01] + + for q in quantiles: + test_quantile(ray_df, pandas_df, q) + test_describe(ray_df, pandas_df) # TODO Reolve once Pandas-20962 is resolved. @@ -674,13 +548,13 @@ def test_mixed_dtype_dataframe(): def test_nan_dataframe(): - pandas_df = pd.DataFrame({ - 'col1': [1, 2, 3, np.nan], - 'col2': [4, 5, np.nan, 7], - 'col3': [8, np.nan, 10, 11], - 'col4': [np.nan, 13, 14, 15]}) + frame_data = {'col1': [1, 2, 3, np.nan], + 'col2': [4, 5, np.nan, 7], + 'col3': [8, np.nan, 10, 11], + 'col4': [np.nan, 13, 14, 15]} - ray_df = from_pandas(pandas_df, 2) + pandas_df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) testfuncs = [lambda x: x + x, lambda x: str(x), @@ -700,7 +574,6 @@ def test_nan_dataframe(): 'like': 'col'} test_filter(ray_df, pandas_df, filter_by) - test_roundtrip(ray_df, pandas_df) test_index(ray_df, pandas_df) test_size(ray_df, pandas_df) test_ndim(ray_df, pandas_df) @@ -727,9 +600,12 @@ def test_nan_dataframe(): test_var(ray_df, pandas_df) test_std(ray_df, pandas_df) test_median(ray_df, pandas_df) - test_quantile(ray_df, pandas_df, .25) - test_quantile(ray_df, pandas_df, .5) - test_quantile(ray_df, pandas_df, .75) + + quantiles = [0.25, 0.5, 0.75, 0.66, 0.01] + + for q in quantiles: + test_quantile(ray_df, pandas_df, q) + test_describe(ray_df, pandas_df) test_diff(ray_df, pandas_df) test_rank(ray_df, pandas_df) @@ -839,15 +715,12 @@ def test_nan_dataframe(): def test_dense_nan_df(): - ray_df = rdf.DataFrame([[np.nan, 2, np.nan, 0], - [3, 4, np.nan, 1], - [np.nan, np.nan, np.nan, 5]], - columns=list('ABCD')) + frame_data = [[np.nan, 2, np.nan, 0], + [3, 4, np.nan, 1], + [np.nan, np.nan, np.nan, 5]] + ray_df = pd.DataFrame(frame_data, columns=list('ABCD')) - pd_df = pd.DataFrame([[np.nan, 2, np.nan, 0], - [3, 4, np.nan, 1], - [np.nan, np.nan, np.nan, 5]], - columns=list('ABCD')) + pd_df = pandas.DataFrame(frame_data, columns=list('ABCD')) column_subsets = [list('AD'), list('BC'), list('CD')] row_subsets = [[0, 1], [0, 1, 2], [2, 0]] @@ -862,11 +735,11 @@ def test_dense_nan_df(): @pytest.fixture def test_inter_df_math(op, simple=False): - ray_df = rdf.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} - pandas_df = pd.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + ray_df = pd.DataFrame(frame_data) + pandas_df = pandas.DataFrame(frame_data) assert ray_df_equals_pandas(getattr(ray_df, op)(ray_df), getattr(pandas_df, op)(pandas_df)) @@ -875,8 +748,9 @@ def test_inter_df_math(op, simple=False): assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), getattr(pandas_df, op)(4.0)) - ray_df2 = rdf.DataFrame({"A": [0, 2], "col1": [0, 19], "col2": [1, 1]}) - pandas_df2 = pd.DataFrame({"A": [0, 2], "col1": [0, 19], "col2": [1, 1]}) + frame_data = {"A": [0, 2], "col1": [0, 19], "col2": [1, 1]} + ray_df2 = pd.DataFrame(frame_data) + pandas_df2 = pandas.DataFrame(frame_data) assert ray_df_equals_pandas(getattr(ray_df, op)(ray_df2), getattr(pandas_df, op)(pandas_df2)) @@ -893,11 +767,11 @@ def test_inter_df_math(op, simple=False): @pytest.fixture def test_comparison_inter_ops(op): - ray_df = rdf.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} - pandas_df = pd.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + ray_df = pd.DataFrame(frame_data) + pandas_df = pandas.DataFrame(frame_data) assert ray_df_equals_pandas(getattr(ray_df, op)(ray_df), getattr(pandas_df, op)(pandas_df)) @@ -906,8 +780,10 @@ def test_comparison_inter_ops(op): assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), getattr(pandas_df, op)(4.0)) - ray_df2 = rdf.DataFrame({"A": [0, 2], "col1": [0, 19], "col2": [1, 1]}) - pandas_df2 = pd.DataFrame({"A": [0, 2], "col1": [0, 19], "col2": [1, 1]}) + frame_data = {"A": [0, 2], "col1": [0, 19], "col2": [1, 1]} + + ray_df2 = pd.DataFrame(frame_data) + pandas_df2 = pandas.DataFrame(frame_data) assert ray_df_equals_pandas(getattr(ray_df2, op)(ray_df2), getattr(pandas_df2, op)(pandas_df2)) @@ -915,11 +791,11 @@ def test_comparison_inter_ops(op): @pytest.fixture def test_inter_df_math_right_ops(op): - ray_df = rdf.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} - pandas_df = pd.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + ray_df = pd.DataFrame(frame_data) + pandas_df = pandas.DataFrame(frame_data) assert ray_df_equals_pandas(getattr(ray_df, op)(4), getattr(pandas_df, op)(4)) @@ -927,6 +803,125 @@ def test_inter_df_math_right_ops(op): getattr(pandas_df, op)(4.0)) +@pytest.fixture +def test_index(ray_df, pandas_df): + assert ray_df.index.equals(pandas_df.index) + ray_df_cp = ray_df.copy() + pandas_df_cp = pandas_df.copy() + + ray_df_cp.index = [str(i) for i in ray_df_cp.index] + pandas_df_cp.index = [str(i) for i in pandas_df_cp.index] + assert ray_df_cp.index.equals(pandas_df_cp.index) + + +@pytest.fixture +def test_size(ray_df, pandas_df): + assert ray_df.size == pandas_df.size + + +@pytest.fixture +def test_ndim(ray_df, pandas_df): + assert ray_df.ndim == pandas_df.ndim + + +@pytest.fixture +def test_ftypes(ray_df, pandas_df): + assert ray_df.ftypes.equals(pandas_df.ftypes) + + +@pytest.fixture +def test_dtypes(ray_df, pandas_df): + assert ray_df.dtypes.equals(pandas_df.dtypes) + + +@pytest.fixture +def test_values(ray_df, pandas_df): + np.testing.assert_equal(ray_df.values, pandas_df.values) + + +@pytest.fixture +def test_axes(ray_df, pandas_df): + for ray_axis, pd_axis in zip(ray_df.axes, pandas_df.axes): + assert (np.array_equal(ray_axis, pd_axis)) + + +@pytest.fixture +def test_shape(ray_df, pandas_df): + assert ray_df.shape == pandas_df.shape + + +@pytest.fixture +def test_add_prefix(ray_df, pandas_df): + test_prefix = "TEST" + new_ray_df = ray_df.add_prefix(test_prefix) + new_pandas_df = pandas_df.add_prefix(test_prefix) + assert new_ray_df.columns.equals(new_pandas_df.columns) + + +@pytest.fixture +def test_add_suffix(ray_df, pandas_df): + test_suffix = "TEST" + new_ray_df = ray_df.add_suffix(test_suffix) + new_pandas_df = pandas_df.add_suffix(test_suffix) + + assert new_ray_df.columns.equals(new_pandas_df.columns) + + +@pytest.fixture +def test_applymap(ray_df, pandas_df, testfunc): + new_ray_df = ray_df.applymap(testfunc) + new_pandas_df = pandas_df.applymap(testfunc) + + assert ray_df_equals_pandas(new_ray_df, new_pandas_df) + + +@pytest.fixture +def test_copy(ray_df): + new_ray_df = ray_df.copy() + + assert new_ray_df is not ray_df + assert np.array_equal(new_ray_df._block_partitions, + ray_df._block_partitions) + + +@pytest.fixture +def test_sum(ray_df, pandas_df): + assert ray_df.sum().equals(pandas_df.sum()) + + +@pytest.fixture +def test_abs(ray_df, pandas_df): + assert ray_df_equals_pandas(ray_df.abs(), pandas_df.abs()) + + +@pytest.fixture +def test_keys(ray_df, pandas_df): + assert ray_df.keys().equals(pandas_df.keys()) + + +@pytest.fixture +def test_transpose(ray_df, pandas_df): + assert ray_df_equals_pandas(ray_df.T, pandas_df.T) + assert ray_df_equals_pandas(ray_df.transpose(), pandas_df.transpose()) + + +@pytest.fixture +def test_get(ray_df, pandas_df, key): + assert ray_df.get(key).equals(pandas_df.get(key)) + assert ray_df.get( + key, default='default').equals(pandas_df.get(key, default='default')) + + +@pytest.fixture +def test_get_dtype_counts(ray_df, pandas_df): + assert ray_df.get_dtype_counts().equals(pandas_df.get_dtype_counts()) + + +@pytest.fixture +def test_get_ftype_counts(ray_df, pandas_df): + assert ray_df.get_ftype_counts().equals(pandas_df.get_ftype_counts()) + + def test_add(): test_inter_df_math("add", simple=False) @@ -935,7 +930,7 @@ def test_add(): def test_agg(ray_df, pandas_df, func, axis): ray_result = ray_df.agg(func, axis) pandas_result = pandas_df.agg(func, axis) - if isinstance(ray_result, rdf.DataFrame): + if isinstance(ray_result, pd.DataFrame): assert ray_df_equals_pandas(ray_result, pandas_result) else: assert ray_result.equals(pandas_result) @@ -945,7 +940,7 @@ def test_agg(ray_df, pandas_df, func, axis): def test_aggregate(ray_df, pandas_df, func, axis): ray_result = ray_df.aggregate(func, axis) pandas_result = pandas_df.aggregate(func, axis) - if isinstance(ray_result, rdf.DataFrame): + if isinstance(ray_result, pd.DataFrame): assert ray_df_equals_pandas(ray_result, pandas_result) else: assert ray_result.equals(pandas_result) @@ -971,15 +966,16 @@ def test_any(ray_df, pd_df): def test_append(): - ray_df = rdf.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} - pandas_df = pd.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + ray_df = pd.DataFrame(frame_data) + pandas_df = pandas.DataFrame(frame_data) - ray_df2 = rdf.DataFrame({"col5": [0], "col6": [1]}) + frame_data2 = {"col5": [0], "col6": [1]} - pandas_df2 = pd.DataFrame({"col5": [0], "col6": [1]}) + ray_df2 = pd.DataFrame(frame_data2) + pandas_df2 = pandas.DataFrame(frame_data2) assert ray_df_equals_pandas(ray_df.append(ray_df2), pandas_df.append(pandas_df2)) @@ -992,7 +988,7 @@ def test_append(): def test_apply(ray_df, pandas_df, func, axis): ray_result = ray_df.apply(func, axis) pandas_result = pandas_df.apply(func, axis) - if isinstance(ray_result, rdf.DataFrame): + if isinstance(ray_result, pd.DataFrame): assert ray_df_equals_pandas(ray_result, pandas_result) else: assert ray_result.equals(pandas_result) @@ -1007,7 +1003,7 @@ def test_as_blocks(): def test_as_matrix(): test_data = TestData() - frame = rdf.DataFrame(test_data.frame) + frame = pd.DataFrame(test_data.frame) mat = frame.as_matrix() frame_columns = frame.columns @@ -1020,15 +1016,15 @@ def test_as_matrix(): assert value == frame[col][i] # mixed type - mat = rdf.DataFrame(test_data.mixed_frame).as_matrix(['foo', 'A']) + mat = pd.DataFrame(test_data.mixed_frame).as_matrix(['foo', 'A']) assert mat[0, 0] == 'bar' - df = rdf.DataFrame({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]}) + df = pd.DataFrame({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]}) mat = df.as_matrix() assert mat[0, 0] == 1j # single block corner case - mat = rdf.DataFrame(test_data.frame).as_matrix(['A', 'B']) + mat = pd.DataFrame(test_data.frame).as_matrix(['A', 'B']) expected = test_data.frame.reindex(columns=['A', 'B']).values tm.assert_almost_equal(mat, expected) @@ -1056,27 +1052,27 @@ def test_assign(): def test_astype(): td = TestData() - ray_df_frame = from_pandas(td.frame, 2) - our_df_casted = ray_df_frame.astype(np.int32) - expected_df_casted = pd.DataFrame(td.frame.values.astype(np.int32), - index=td.frame.index, - columns=td.frame.columns) + ray_df = pd.DataFrame(td.frame) + our_df_casted = ray_df.astype(np.int32) + expected_df_casted = pandas.DataFrame(td.frame.values.astype(np.int32), + index=td.frame.index, + columns=td.frame.columns) - assert(ray_df_equals_pandas(our_df_casted, expected_df_casted)) + assert ray_df_equals_pandas(our_df_casted, expected_df_casted) - our_df_casted = ray_df_frame.astype(np.float64) - expected_df_casted = pd.DataFrame(td.frame.values.astype(np.float64), - index=td.frame.index, - columns=td.frame.columns) + our_df_casted = ray_df.astype(np.float64) + expected_df_casted = pandas.DataFrame(td.frame.values.astype(np.float64), + index=td.frame.index, + columns=td.frame.columns) - assert(ray_df_equals_pandas(our_df_casted, expected_df_casted)) + assert ray_df_equals_pandas(our_df_casted, expected_df_casted) - our_df_casted = ray_df_frame.astype(str) - expected_df_casted = pd.DataFrame(td.frame.values.astype(str), - index=td.frame.index, - columns=td.frame.columns) + our_df_casted = ray_df.astype(str) + expected_df_casted = pandas.DataFrame(td.frame.values.astype(str), + index=td.frame.index, + columns=td.frame.columns) - assert(ray_df_equals_pandas(our_df_casted, expected_df_casted)) + assert ray_df_equals_pandas(our_df_casted, expected_df_casted) def test_at_time(): @@ -1094,15 +1090,12 @@ def test_between_time(): @pytest.fixture -def test_bfill(num_partitions=2): +def test_bfill(): test_data = TestData() test_data.tsframe['A'][:5] = np.nan test_data.tsframe['A'][-5:] = np.nan - ray_df = from_pandas(test_data.tsframe, num_partitions) - assert ray_df_equals_pandas( - ray_df.bfill(), - test_data.tsframe.bfill() - ) + ray_df = pd.DataFrame(test_data.tsframe) + assert ray_df_equals_pandas(ray_df.bfill(), test_data.tsframe.bfill()) @pytest.fixture @@ -1111,8 +1104,8 @@ def test_bool(ray_df, pd_df): ray_df.bool() pd_df.bool() - single_bool_pd_df = pd.DataFrame([True]) - single_bool_ray_df = from_pandas(single_bool_pd_df, 1) + single_bool_pd_df = pandas.DataFrame([True]) + single_bool_ray_df = pd.DataFrame([True]) assert single_bool_pd_df.bool() == single_bool_ray_df.bool() @@ -1209,35 +1202,35 @@ def test_cov(): @pytest.fixture def test_cummax(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df.cummax(), pandas_df.cummax())) + assert ray_df_equals_pandas(ray_df.cummax(), pandas_df.cummax()) @pytest.fixture def test_cummin(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df.cummin(), pandas_df.cummin())) + assert ray_df_equals_pandas(ray_df.cummin(), pandas_df.cummin()) @pytest.fixture def test_cumprod(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df.cumprod(), pandas_df.cumprod())) + assert ray_df_equals_pandas(ray_df.cumprod(), pandas_df.cumprod()) @pytest.fixture def test_cumsum(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df.cumsum(), pandas_df.cumsum())) + assert ray_df_equals_pandas(ray_df.cumsum(), pandas_df.cumsum()) @pytest.fixture def test_describe(ray_df, pandas_df): - assert(ray_df.describe().equals(pandas_df.describe())) + assert ray_df.describe().equals(pandas_df.describe()) @pytest.fixture def test_diff(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df.diff(), pandas_df.diff())) - assert(ray_df_equals_pandas(ray_df.diff(axis=1), pandas_df.diff(axis=1))) - assert(ray_df_equals_pandas(ray_df.diff(periods=1), - pandas_df.diff(periods=1))) + assert ray_df_equals_pandas(ray_df.diff(), pandas_df.diff()) + assert ray_df_equals_pandas(ray_df.diff(axis=1), pandas_df.diff(axis=1)) + assert ray_df_equals_pandas(ray_df.diff(periods=1), + pandas_df.diff(periods=1)) def test_div(): @@ -1256,9 +1249,9 @@ def test_dot(): def test_drop(): - ray_df = create_test_dataframe() - simple = pd.DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}) - ray_simple = from_pandas(simple, 2) + frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]} + simple = pandas.DataFrame(frame_data) + ray_simple = pd.DataFrame(frame_data) assert ray_df_equals_pandas(ray_simple.drop("A", axis=1), simple[['B']]) assert ray_df_equals_pandas(ray_simple.drop(["A", "B"], axis='columns'), simple[[]]) @@ -1282,58 +1275,59 @@ def test_drop(): errors='ignore'), simple[['B']]) - # non-unique - wheee! - nu_df = pd.DataFrame(pd.compat.lzip(range(3), range(-3, 1), list('abc')), - columns=['a', 'a', 'b']) - ray_nu_df = from_pandas(nu_df, 3) + # non-unique + nu_df = pandas.DataFrame(pandas.compat.lzip(range(3), range(-3, 1), + list('abc')), + columns=['a', 'a', 'b']) + ray_nu_df = pd.DataFrame(nu_df) assert ray_df_equals_pandas(ray_nu_df.drop('a', axis=1), nu_df[['b']]) assert ray_df_equals_pandas(ray_nu_df.drop('b', axis='columns'), nu_df['a']) - assert ray_df_equals_pandas(ray_nu_df.drop([]), nu_df) # GH 16398 + assert ray_df_equals_pandas(ray_nu_df.drop([]), nu_df) - nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X'])) + nu_df = nu_df.set_index(pandas.Index(['X', 'Y', 'X'])) nu_df.columns = list('abc') - ray_nu_df = from_pandas(nu_df, 3) + ray_nu_df = pd.DataFrame(nu_df) assert ray_df_equals_pandas(ray_nu_df.drop('X', axis='rows'), nu_df.loc[["Y"], :]) assert ray_df_equals_pandas(ray_nu_df.drop(['X', 'Y'], axis=0), nu_df.loc[[], :]) # inplace cache issue - # GH 5628 - df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc')) - ray_df = from_pandas(df, 2) + frame_data = np.random.randn(10, 3) + df = pandas.DataFrame(frame_data, columns=list('abc')) + ray_df = pd.DataFrame(frame_data, columns=list('abc')) expected = df[~(df.b > 0)] ray_df.drop(labels=df[df.b > 0].index, inplace=True) assert ray_df_equals_pandas(ray_df, expected) def test_drop_api_equivalence(): - # equivalence of the labels/axis and index/columns API's (GH12392) - df = pd.DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=['a', 'b', 'c'], - columns=['d', 'e', 'f']) - ray_df = from_pandas(df, 3) + # equivalence of the labels/axis and index/columns API's + frame_data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]] - res1 = ray_df.drop('a') - res2 = ray_df.drop(index='a') - assert ray_df_equals(res1, res2) + ray_df = pd.DataFrame(frame_data, index=['a', 'b', 'c'], + columns=['d', 'e', 'f']) - res1 = ray_df.drop('d', 1) - res2 = ray_df.drop(columns='d') - assert ray_df_equals(res1, res2) + ray_df1 = ray_df.drop('a') + ray_df2 = ray_df.drop(index='a') + assert ray_df_equals(ray_df1, ray_df2) - res1 = ray_df.drop(labels='e', axis=1) - res2 = ray_df.drop(columns='e') - assert ray_df_equals(res1, res2) + ray_df1 = ray_df.drop('d', 1) + ray_df2 = ray_df.drop(columns='d') + assert ray_df_equals(ray_df1, ray_df2) - res1 = ray_df.drop(['a'], axis=0) - res2 = ray_df.drop(index=['a']) - assert ray_df_equals(res1, res2) + ray_df1 = ray_df.drop(labels='e', axis=1) + ray_df2 = ray_df.drop(columns='e') + assert ray_df_equals(ray_df1, ray_df2) - res1 = ray_df.drop(['a'], axis=0).drop(['d'], axis=1) - res2 = ray_df.drop(index=['a'], columns=['d']) - assert ray_df_equals(res1, res2) + ray_df1 = ray_df.drop(['a'], axis=0) + ray_df2 = ray_df.drop(index=['a']) + assert ray_df_equals(ray_df1, ray_df2) + + ray_df1 = ray_df.drop(['a'], axis=0).drop(['d'], axis=1) + ray_df2 = ray_df.drop(index=['a'], columns=['d']) + assert ray_df_equals(ray_df1, ray_df2) with pytest.raises(ValueError): ray_df.drop(labels='a', index='b') @@ -1385,14 +1379,10 @@ def test_dropna_inplace(ray_df, pd_df): @pytest.fixture def test_dropna_multiple_axes(ray_df, pd_df): - assert ray_df_equals_pandas( - ray_df.dropna(how='all', axis=[0, 1]), - pd_df.dropna(how='all', axis=[0, 1]) - ) - assert ray_df_equals_pandas( - ray_df.dropna(how='all', axis=(0, 1)), - pd_df.dropna(how='all', axis=(0, 1)) - ) + assert ray_df_equals_pandas(ray_df.dropna(how='all', axis=[0, 1]), + pd_df.dropna(how='all', axis=[0, 1])) + assert ray_df_equals_pandas(ray_df.dropna(how='all', axis=(0, 1)), + pd_df.dropna(how='all', axis=(0, 1))) @pytest.fixture @@ -1417,26 +1407,20 @@ def test_dropna_multiple_axes_inplace(ray_df, pd_df): @pytest.fixture def test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets): for subset in column_subsets: - assert ray_df_equals_pandas( - ray_df.dropna(how='all', subset=subset), - pd_df.dropna(how='all', subset=subset) - ) + assert ray_df_equals_pandas(ray_df.dropna(how='all', subset=subset), + pd_df.dropna(how='all', subset=subset)) - assert ray_df_equals_pandas( - ray_df.dropna(how='any', subset=subset), - pd_df.dropna(how='any', subset=subset) - ) + assert ray_df_equals_pandas(ray_df.dropna(how='any', subset=subset), + pd_df.dropna(how='any', subset=subset)) for subset in row_subsets: assert ray_df_equals_pandas( ray_df.dropna(how='all', axis=1, subset=subset), - pd_df.dropna(how='all', axis=1, subset=subset) - ) + pd_df.dropna(how='all', axis=1, subset=subset)) assert ray_df_equals_pandas( ray_df.dropna(how='any', axis=1, subset=subset), - pd_df.dropna(how='any', axis=1, subset=subset) - ) + pd_df.dropna(how='any', axis=1, subset=subset)) @pytest.fixture @@ -1460,28 +1444,25 @@ def test_eq(): def test_equals(): - pandas_df1 = pd.DataFrame({'col1': [2.9, 3, 3, 3], - 'col2': [2, 3, 4, 1]}) - ray_df1 = from_pandas(pandas_df1, 2) - ray_df2 = from_pandas(pandas_df1, 3) + frame_data = {'col1': [2.9, 3, 3, 3], 'col2': [2, 3, 4, 1]} + ray_df1 = pd.DataFrame(frame_data) + ray_df2 = pd.DataFrame(frame_data) assert ray_df1.equals(ray_df2) - pandas_df2 = pd.DataFrame({'col1': [2.9, 3, 3, 3], - 'col2': [2, 3, 5, 1]}) - ray_df3 = from_pandas(pandas_df2, 4) + frame_data = {'col1': [2.9, 3, 3, 3], 'col2': [2, 3, 5, 1]} + ray_df3 = pd.DataFrame(frame_data) assert not ray_df3.equals(ray_df1) assert not ray_df3.equals(ray_df2) def test_eval_df_use_case(): - df = pd.DataFrame({'a': np.random.randn(10), - 'b': np.random.randn(10)}) - ray_df = from_pandas(df, 2) + frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)} + df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) df.eval("e = arctan2(sin(a), b)", - engine='python', - parser='pandas', inplace=True) + engine='python', parser='pandas', inplace=True) ray_df.eval("e = arctan2(sin(a), b)", engine='python', parser='pandas', inplace=True) @@ -1490,15 +1471,13 @@ def test_eval_df_use_case(): def test_eval_df_arithmetic_subexpression(): - df = pd.DataFrame({'a': np.random.randn(10), - 'b': np.random.randn(10)}) - ray_df = from_pandas(df, 2) + frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)} + df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) df.eval("not_e = sin(a + b)", - engine='python', - parser='pandas', inplace=True) + engine='python', parser='pandas', inplace=True) ray_df.eval("not_e = sin(a + b)", - engine='python', - parser='pandas', inplace=True) + engine='python', parser='pandas', inplace=True) # TODO: Use a series equality validator. assert ray_df_equals_pandas(ray_df, df) @@ -1518,16 +1497,13 @@ def test_expanding(): @pytest.fixture -def test_ffill(num_partitions=2): +def test_ffill(): test_data = TestData() test_data.tsframe['A'][:5] = np.nan test_data.tsframe['A'][-5:] = np.nan - ray_df = from_pandas(test_data.tsframe, num_partitions) + ray_df = pd.DataFrame(test_data.tsframe) - assert ray_df_equals_pandas( - ray_df.ffill(), - test_data.tsframe.ffill() - ) + assert ray_df_equals_pandas(ray_df.ffill(), test_data.tsframe.ffill()) def test_fillna(): @@ -1554,19 +1530,18 @@ def test_fillna(): @pytest.fixture -def test_fillna_sanity(num_partitions=2): +def test_fillna_sanity(): test_data = TestData() tf = test_data.tsframe tf.loc[tf.index[:5], 'A'] = np.nan tf.loc[tf.index[-5:], 'A'] = np.nan zero_filled = test_data.tsframe.fillna(0) - ray_df = from_pandas(test_data.tsframe, num_partitions).fillna(0) + ray_df = pd.DataFrame(test_data.tsframe).fillna(0) assert ray_df_equals_pandas(ray_df, zero_filled) padded = test_data.tsframe.fillna(method='pad') - ray_df = from_pandas(test_data.tsframe, - num_partitions).fillna(method='pad') + ray_df = pd.DataFrame(test_data.tsframe).fillna(method='pad') assert ray_df_equals_pandas(ray_df, padded) # mixed type @@ -1575,158 +1550,135 @@ def test_fillna_sanity(num_partitions=2): mf.loc[mf.index[-10:], 'A'] = np.nan result = test_data.mixed_frame.fillna(value=0) - ray_df = from_pandas(test_data.mixed_frame, - num_partitions).fillna(value=0) + ray_df = pd.DataFrame(test_data.mixed_frame).fillna(value=0) assert ray_df_equals_pandas(ray_df, result) result = test_data.mixed_frame.fillna(method='pad') - ray_df = from_pandas(test_data.mixed_frame, - num_partitions).fillna(method='pad') + ray_df = pd.DataFrame(test_data.mixed_frame).fillna(method='pad') assert ray_df_equals_pandas(ray_df, result) pytest.raises(ValueError, test_data.tsframe.fillna) - pytest.raises(ValueError, from_pandas(test_data.tsframe, - num_partitions).fillna) + pytest.raises(ValueError, pd.DataFrame(test_data.tsframe).fillna) with pytest.raises(ValueError): - from_pandas(test_data.tsframe, num_partitions).fillna( - 5, method='ffill' - ) + pd.DataFrame(test_data.tsframe).fillna(5, method='ffill') # mixed numeric (but no float16) mf = test_data.mixed_float.reindex(columns=['A', 'B', 'D']) mf.loc[mf.index[-10:], 'A'] = np.nan result = mf.fillna(value=0) - ray_df = from_pandas(mf, num_partitions).fillna(value=0) + ray_df = pd.DataFrame(mf).fillna(value=0) assert ray_df_equals_pandas(ray_df, result) result = mf.fillna(method='pad') - ray_df = from_pandas(mf, num_partitions).fillna(method='pad') + ray_df = pd.DataFrame(mf).fillna(method='pad') assert ray_df_equals_pandas(ray_df, result) # TODO: Use this when Arrow issue resolves: # (https://issues.apache.org/jira/browse/ARROW-2122) - # empty frame (GH #2778) + # empty frame # df = DataFrame(columns=['x']) # for m in ['pad', 'backfill']: # df.x.fillna(method=m, inplace=True) # df.x.fillna(method=m) - # with different dtype (GH3386) - df = pd.DataFrame([['a', 'a', np.nan, 'a'], [ - 'b', 'b', np.nan, 'b'], ['c', 'c', np.nan, 'c']]) + # with different dtype + frame_data = [['a', 'a', np.nan, 'a'], + ['b', 'b', np.nan, 'b'], + ['c', 'c', np.nan, 'c']] + df = pandas.DataFrame(frame_data) result = df.fillna({2: 'foo'}) - ray_df = from_pandas(df, num_partitions).fillna({2: 'foo'}) + ray_df = pd.DataFrame(frame_data).fillna({2: 'foo'}) assert ray_df_equals_pandas(ray_df, result) - ray_df = from_pandas(df, num_partitions) + ray_df = pd.DataFrame(df) df.fillna({2: 'foo'}, inplace=True) ray_df.fillna({2: 'foo'}, inplace=True) assert ray_df_equals_pandas(ray_df, result) - # limit and value - df = pd.DataFrame(np.random.randn(10, 3)) - df.iloc[2:7, 0] = np.nan - df.iloc[3:5, 2] = np.nan - - # result = df.fillna(999, limit=1) - # ray_df = from_pandas(df, num_partitions).fillna(999, limit=1) - - # assert ray_df_equals_pandas(ray_df, result) - - # with datelike - # GH 6344 - df = pd.DataFrame({ - 'Date': [pd.NaT, pd.Timestamp("2014-1-1")], - 'Date2': [pd.Timestamp("2013-1-1"), pd.NaT] - }) + frame_data = {'Date': [pandas.NaT, pandas.Timestamp("2014-1-1")], + 'Date2': [pandas.Timestamp("2013-1-1"), pandas.NaT]} + df = pandas.DataFrame(frame_data) result = df.fillna(value={'Date': df['Date2']}) - ray_df = from_pandas(df, num_partitions).fillna( - value={'Date': df['Date2']} - ) + ray_df = pd.DataFrame(frame_data).fillna(value={'Date': df['Date2']}) assert ray_df_equals_pandas(ray_df, result) # TODO: Use this when Arrow issue resolves: # (https://issues.apache.org/jira/browse/ARROW-2122) # with timezone - # GH 15855 """ - df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), - pd.NaT]}) - ray_df = from_pandas(df, num_partitions) + frame_data = {'A': [pandas.Timestamp('2012-11-11 00:00:00+01:00'), + pandas.NaT]} + df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) assert ray_df_equals_pandas(ray_df.fillna(method='pad'), df.fillna(method='pad')) - df = pd.DataFrame({'A': [pd.NaT, - pd.Timestamp('2012-11-11 00:00:00+01:00')]}) - ray_df = from_pandas(df, num_partitions).fillna(method='bfill') + frame_data = {'A': [pandas.NaT, + pandas.Timestamp('2012-11-11 00:00:00+01:00')]} + df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data).fillna(method='bfill') assert ray_df_equals_pandas(ray_df, df.fillna(method='bfill')) """ @pytest.fixture -def test_fillna_downcast(num_partitions=2): - # GH 15277 +def test_fillna_downcast(): # infer int64 from float64 - df = pd.DataFrame({'a': [1., np.nan]}) + frame_data = {'a': [1., np.nan]} + df = pandas.DataFrame(frame_data) result = df.fillna(0, downcast='infer') - ray_df = from_pandas(df, num_partitions).fillna(0, downcast='infer') + ray_df = pd.DataFrame(frame_data).fillna(0, downcast='infer') assert ray_df_equals_pandas(ray_df, result) # infer int64 from float64 when fillna value is a dict - df = pd.DataFrame({'a': [1., np.nan]}) + df = pandas.DataFrame(frame_data) result = df.fillna({'a': 0}, downcast='infer') - ray_df = from_pandas(df, num_partitions).fillna( - {'a': 0}, downcast='infer' - ) + ray_df = pd.DataFrame(frame_data).fillna({'a': 0}, downcast='infer') assert ray_df_equals_pandas(ray_df, result) @pytest.fixture -def test_ffill2(num_partitions=2): +def test_ffill2(): test_data = TestData() test_data.tsframe['A'][:5] = np.nan test_data.tsframe['A'][-5:] = np.nan - ray_df = from_pandas(test_data.tsframe, num_partitions) - assert ray_df_equals_pandas( - ray_df.fillna(method='ffill'), - test_data.tsframe.fillna(method='ffill') - ) + ray_df = pd.DataFrame(test_data.tsframe) + assert ray_df_equals_pandas(ray_df.fillna(method='ffill'), + test_data.tsframe.fillna(method='ffill')) @pytest.fixture -def test_bfill2(num_partitions=2): +def test_bfill2(): test_data = TestData() test_data.tsframe['A'][:5] = np.nan test_data.tsframe['A'][-5:] = np.nan - ray_df = from_pandas(test_data.tsframe, num_partitions) - assert ray_df_equals_pandas( - ray_df.fillna(method='bfill'), - test_data.tsframe.fillna(method='bfill') - ) + ray_df = pd.DataFrame(test_data.tsframe) + assert ray_df_equals_pandas(ray_df.fillna(method='bfill'), + test_data.tsframe.fillna(method='bfill')) @pytest.fixture -def test_fillna_inplace(num_partitions=2): - df = pd.DataFrame(np.random.randn(10, 4)) +def test_fillna_inplace(): + frame_data = np.random.randn(10, 4) + df = pandas.DataFrame(frame_data) df[1][:4] = np.nan df[3][-4:] = np.nan - ray_df = from_pandas(df, num_partitions) + ray_df = pd.DataFrame(df) df.fillna(value=0, inplace=True) assert not ray_df_equals_pandas(ray_df, df) ray_df.fillna(value=0, inplace=True) assert ray_df_equals_pandas(ray_df, df) - ray_df = from_pandas(df, num_partitions).fillna(value={0: 0}, - inplace=True) + ray_df = pd.DataFrame(df).fillna(value={0: 0}, inplace=True) assert ray_df is None df[1][:4] = np.nan df[3][-4:] = np.nan - ray_df = from_pandas(df, num_partitions) + ray_df = pd.DataFrame(df) df.fillna(method='ffill', inplace=True) assert not ray_df_equals_pandas(ray_df, df) @@ -1736,161 +1688,130 @@ def test_fillna_inplace(num_partitions=2): @pytest.fixture -def test_frame_fillna_limit(num_partitions=2): +def test_frame_fillna_limit(): index = np.arange(10) - df = pd.DataFrame(np.random.randn(10, 4), index=index) + frame_data = np.random.randn(10, 4) + df = pandas.DataFrame(frame_data, index=index) expected = df[:2].reindex(index) expected = expected.fillna(method='pad', limit=5) - ray_df = from_pandas(df[:2].reindex(index), num_partitions).fillna( - method='pad', limit=5 - ) + ray_df = pd.DataFrame(df[:2].reindex(index)).fillna(method='pad', limit=5) assert ray_df_equals_pandas(ray_df, expected) expected = df[-2:].reindex(index) expected = expected.fillna(method='backfill', limit=5) - ray_df = from_pandas(df[-2:].reindex(index), num_partitions).fillna( - method='backfill', limit=5 - ) + ray_df = pd.DataFrame(df[-2:].reindex(index)).fillna(method='backfill', + limit=5) assert ray_df_equals_pandas(ray_df, expected) @pytest.fixture -def test_frame_pad_backfill_limit(num_partitions=2): +def test_frame_pad_backfill_limit(): index = np.arange(10) - df = pd.DataFrame(np.random.randn(10, 4), index=index) + frame_data = np.random.randn(10, 4) + df = pandas.DataFrame(frame_data, index=index) result = df[:2].reindex(index) - ray_df = from_pandas(result, num_partitions) - assert ray_df_equals_pandas( - ray_df.fillna(method='pad', limit=5), - result.fillna(method='pad', limit=5) - ) + ray_df = pd.DataFrame(result) + assert ray_df_equals_pandas(ray_df.fillna(method='pad', limit=5), + result.fillna(method='pad', limit=5)) result = df[-2:].reindex(index) - ray_df = from_pandas(result, num_partitions) - assert ray_df_equals_pandas( - ray_df.fillna(method='backfill', limit=5), - result.fillna(method='backfill', limit=5) - ) + ray_df = pd.DataFrame(result) + assert ray_df_equals_pandas(ray_df.fillna(method='backfill', limit=5), + result.fillna(method='backfill', limit=5)) @pytest.fixture -def test_fillna_dtype_conversion(num_partitions=2): +def test_fillna_dtype_conversion(): # make sure that fillna on an empty frame works - df = pd.DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) + df = pandas.DataFrame(index=range(3), columns=['A', 'B'], dtype='float64') + ray_df = pd.DataFrame(index=range(3), columns=['A', 'B'], dtype='float64') + assert ray_df_equals_pandas(ray_df.fillna('nan'), df.fillna('nan')) - # empty block - df = pd.DataFrame(index=range(3), columns=['A', 'B'], dtype='float64') - ray_df = from_pandas(df, num_partitions) - assert ray_df_equals_pandas( - ray_df.fillna('nan'), - df.fillna('nan') - ) - - # equiv of replace - df = pd.DataFrame({'A': [1, np.nan], 'B': [1., 2.]}) - ray_df = from_pandas(df, num_partitions) + frame_data = {'A': [1, np.nan], 'B': [1., 2.]} + df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) for v in ['', 1, np.nan, 1.0]: - assert ray_df_equals_pandas( - ray_df.fillna(v), - df.fillna(v) - ) + assert ray_df_equals_pandas(ray_df.fillna(v), df.fillna(v)) @pytest.fixture -def test_fillna_skip_certain_blocks(num_partitions=2): +def test_fillna_skip_certain_blocks(): # don't try to fill boolean, int blocks - df = pd.DataFrame(np.random.randn(10, 4).astype(int)) - ray_df = from_pandas(df, num_partitions) + frame_data = np.random.randn(10, 4).astype(int) + df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) - # it works! - assert ray_df_equals_pandas( - ray_df.fillna(np.nan), - df.fillna(np.nan) - ) + assert ray_df_equals_pandas(ray_df.fillna(np.nan), df.fillna(np.nan)) @pytest.fixture -def test_fillna_dict_series(num_partitions=2): - df = pd.DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan], - 'b': [1, 2, 3, np.nan, np.nan], - 'c': [np.nan, 1, 2, 3, 4]}) - ray_df = from_pandas(df, num_partitions) +def test_fillna_dict_series(): + frame_data = {'a': [np.nan, 1, 2, np.nan, np.nan], + 'b': [1, 2, 3, np.nan, np.nan], + 'c': [np.nan, 1, 2, 3, 4]} + df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) - assert ray_df_equals_pandas( - ray_df.fillna({'a': 0, 'b': 5}), - df.fillna({'a': 0, 'b': 5}) - ) + assert ray_df_equals_pandas(ray_df.fillna({'a': 0, 'b': 5}), + df.fillna({'a': 0, 'b': 5})) - # it works - assert ray_df_equals_pandas( - ray_df.fillna({'a': 0, 'b': 5, 'd': 7}), - df.fillna({'a': 0, 'b': 5, 'd': 7}) - ) + assert ray_df_equals_pandas(ray_df.fillna({'a': 0, 'b': 5, 'd': 7}), + df.fillna({'a': 0, 'b': 5, 'd': 7})) # Series treated same as dict - assert ray_df_equals_pandas( - ray_df.fillna(df.max()), - df.fillna(df.max()) - ) + assert ray_df_equals_pandas(ray_df.fillna(df.max()), df.fillna(df.max())) @pytest.fixture -def test_fillna_dataframe(num_partitions=2): - # GH 8377 - df = pd.DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan], - 'b': [1, 2, 3, np.nan, np.nan], - 'c': [np.nan, 1, 2, 3, 4]}, - index=list('VWXYZ')) - ray_df = from_pandas(df, num_partitions) +def test_fillna_dataframe(): + frame_data = {'a': [np.nan, 1, 2, np.nan, np.nan], + 'b': [1, 2, 3, np.nan, np.nan], + 'c': [np.nan, 1, 2, 3, 4]} + df = pandas.DataFrame(frame_data, index=list('VWXYZ')) + ray_df = pd.DataFrame(frame_data, index=list('VWXYZ')) # df2 may have different index and columns - df2 = pd.DataFrame({'a': [np.nan, 10, 20, 30, 40], - 'b': [50, 60, 70, 80, 90], - 'foo': ['bar'] * 5}, - index=list('VWXuZ')) + df2 = pandas.DataFrame({'a': [np.nan, 10, 20, 30, 40], + 'b': [50, 60, 70, 80, 90], + 'foo': ['bar'] * 5}, + index=list('VWXuZ')) # only those columns and indices which are shared get filled - assert ray_df_equals_pandas( - ray_df.fillna(df2), - df.fillna(df2) - ) + assert ray_df_equals_pandas(ray_df.fillna(df2), df.fillna(df2)) @pytest.fixture -def test_fillna_columns(num_partitions=2): - df = pd.DataFrame(np.random.randn(10, 10)) +def test_fillna_columns(): + frame_data = np.random.randn(10, 10) + df = pandas.DataFrame(frame_data) df.values[:, ::2] = np.nan - ray_df = from_pandas(df, num_partitions) + ray_df = pd.DataFrame(df) - assert ray_df_equals_pandas( - ray_df.fillna(method='ffill', axis=1), - df.fillna(method='ffill', axis=1) - ) + assert ray_df_equals_pandas(ray_df.fillna(method='ffill', axis=1), + df.fillna(method='ffill', axis=1)) df.insert(6, 'foo', 5) - ray_df = from_pandas(df, num_partitions) - assert ray_df_equals_pandas( - ray_df.fillna(method='ffill', axis=1), - df.fillna(method='ffill', axis=1) - ) + ray_df = pd.DataFrame(df) + assert ray_df_equals_pandas(ray_df.fillna(method='ffill', axis=1), + df.fillna(method='ffill', axis=1)) @pytest.fixture -def test_fillna_invalid_method(num_partitions=2): +def test_fillna_invalid_method(): test_data = TestData() - ray_df = from_pandas(test_data.frame, num_partitions) + ray_df = pd.DataFrame(test_data.frame) with tm.assert_raises_regex(ValueError, 'ffil'): ray_df.fillna(method='ffil') @pytest.fixture -def test_fillna_invalid_value(num_partitions=2): +def test_fillna_invalid_value(): test_data = TestData() - ray_df = from_pandas(test_data.frame, num_partitions) + ray_df = pd.DataFrame(test_data.frame) # list pytest.raises(TypeError, ray_df.fillna, [1, 2]) # tuple @@ -1901,45 +1822,36 @@ def test_fillna_invalid_value(num_partitions=2): @pytest.fixture -def test_fillna_col_reordering(num_partitions=2): +def test_fillna_col_reordering(): cols = ["COL." + str(i) for i in range(5, 0, -1)] data = np.random.rand(20, 5) - df = pd.DataFrame(index=range(20), columns=cols, data=data) - ray_df = from_pandas(df, num_partitions) - assert ray_df_equals_pandas( - ray_df.fillna(method='ffill'), - df.fillna(method='ffill') - ) + df = pandas.DataFrame(index=range(20), columns=cols, data=data) + ray_df = pd.DataFrame(index=range(20), columns=cols, data=data) + assert ray_df_equals_pandas(ray_df.fillna(method='ffill'), + df.fillna(method='ffill')) """ TODO: Use this when Arrow issue resolves: (https://issues.apache.org/jira/browse/ARROW-2122) @pytest.fixture -def test_fillna_datetime_columns(num_partitions=2): - # GH 7095 - df = pd.DataFrame({'A': [-1, -2, np.nan], - 'B': date_range('20130101', periods=3), - 'C': ['foo', 'bar', None], - 'D': ['foo2', 'bar2', None]}, - index=date_range('20130110', periods=3)) - ray_df = from_pandas(df, num_partitions) - assert ray_df_equals_pandas( - ray_df.fillna('?'), - df.fillna('?') - ) +def test_fillna_datetime_columns(): + frame_data = {'A': [-1, -2, np.nan], + 'B': date_range('20130101', periods=3), + 'C': ['foo', 'bar', None], + 'D': ['foo2', 'bar2', None]} + df = pandas.DataFrame(frame_data, index=date_range('20130110', periods=3)) + ray_df = pd.DataFrame(frame_data, index=date_range('20130110', periods=3)) + assert ray_df_equals_pandas(ray_df.fillna('?'), df.fillna('?')) - df = pd.DataFrame({'A': [-1, -2, np.nan], - 'B': [pd.Timestamp('2013-01-01'), - pd.Timestamp('2013-01-02'), pd.NaT], - 'C': ['foo', 'bar', None], - 'D': ['foo2', 'bar2', None]}, - index=date_range('20130110', periods=3)) - ray_df = from_pandas(df, num_partitions) - assert ray_df_equals_pandas( - ray_df.fillna('?'), - df.fillna('?') - ) + frame_data = {'A': [-1, -2, np.nan], + 'B': [pandas.Timestamp('2013-01-01'), + pandas.Timestamp('2013-01-02'), pandas.NaT], + 'C': ['foo', 'bar', None], + 'D': ['foo2', 'bar2', None]} + df = pandas.DataFrame(frame_data, index=date_range('20130110', periods=3)) + ray_df = pd.DataFrame(frame_data, index=date_range('20130110', periods=3)) + assert ray_df_equals_pandas(ray_df.fillna('?'), df.fillna('?')) """ @@ -1964,7 +1876,7 @@ def test_first(): @pytest.fixture def test_first_valid_index(ray_df, pandas_df): - assert(ray_df.first_valid_index() == (pandas_df.first_valid_index())) + assert ray_df.first_valid_index() == (pandas_df.first_valid_index()) def test_floordiv(): @@ -1973,22 +1885,22 @@ def test_floordiv(): def test_from_csv(): with pytest.raises(NotImplementedError): - rdf.DataFrame.from_csv(None) + pd.DataFrame.from_csv(None) def test_from_dict(): with pytest.raises(NotImplementedError): - rdf.DataFrame.from_dict(None) + pd.DataFrame.from_dict(None) def test_from_items(): with pytest.raises(NotImplementedError): - rdf.DataFrame.from_items(None) + pd.DataFrame.from_items(None) def test_from_records(): with pytest.raises(NotImplementedError): - rdf.DataFrame.from_records(None) + pd.DataFrame.from_records(None) def test_ge(): @@ -2028,13 +1940,13 @@ def test_hist(): @pytest.fixture def test_idxmax(ray_df, pandas_df): assert \ - ray_df.idxmax().sort_index().equals(pandas_df.idxmax().sort_index()) + ray_df.idxmax().equals(pandas_df.idxmax()) @pytest.fixture def test_idxmin(ray_df, pandas_df): assert \ - ray_df.idxmin().sort_index().equals(pandas_df.idxmin().sort_index()) + ray_df.idxmin().equals(pandas_df.idxmin()) def test_infer_objects(): @@ -2122,15 +2034,15 @@ def test_itertuples(ray_df, pandas_df): def test_join(): - ray_df = rdf.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} - pandas_df = pd.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + ray_df = pd.DataFrame(frame_data) + pandas_df = pandas.DataFrame(frame_data) - ray_df2 = rdf.DataFrame({"col5": [0], "col6": [1]}) - - pandas_df2 = pd.DataFrame({"col5": [0], "col6": [1]}) + frame_data2 = {"col5": [0], "col6": [1]} + ray_df2 = pd.DataFrame(frame_data2) + pandas_df2 = pandas.DataFrame(frame_data2) join_types = ["left", "right", "outer", "inner"] for how in join_types: @@ -2138,9 +2050,10 @@ def test_join(): pandas_join = pandas_df.join(pandas_df2, how=how) assert ray_df_equals_pandas(ray_join, pandas_join) - ray_df3 = rdf.DataFrame({"col7": [1, 2, 3, 5, 6, 7, 8]}) + frame_data3 = {"col7": [1, 2, 3, 5, 6, 7, 8]} - pandas_df3 = pd.DataFrame({"col7": [1, 2, 3, 5, 6, 7, 8]}) + ray_df3 = pd.DataFrame(frame_data3) + pandas_df3 = pandas.DataFrame(frame_data3) join_types = ["left", "outer", "inner"] for how in join_types: @@ -2172,7 +2085,7 @@ def test_last(): @pytest.fixture def test_last_valid_index(ray_df, pandas_df): - assert(ray_df.last_valid_index() == (pandas_df.last_valid_index())) + assert ray_df.last_valid_index() == (pandas_df.last_valid_index()) def test_le(): @@ -2206,8 +2119,8 @@ def test_mask(): @pytest.fixture def test_max(ray_df, pandas_df): - assert(ray_series_equals_pandas(ray_df.max(), pandas_df.max())) - assert(ray_series_equals_pandas(ray_df.max(axis=1), pandas_df.max(axis=1))) + assert ray_series_equals_pandas(ray_df.max(), pandas_df.max()) + assert ray_series_equals_pandas(ray_df.max(axis=1), pandas_df.max(axis=1)) @pytest.fixture @@ -2217,7 +2130,7 @@ def test_mean(ray_df, pandas_df): @pytest.fixture def test_median(ray_df, pandas_df): - assert(ray_df.median().equals(pandas_df.median())) + assert ray_df.median().equals(pandas_df.median()) def test_melt(): @@ -2229,22 +2142,22 @@ def test_melt(): @pytest.fixture def test_memory_usage(ray_df): - assert type(ray_df.memory_usage()) is pd.core.series.Series + assert type(ray_df.memory_usage()) is pandas.core.series.Series assert ray_df.memory_usage(index=True).at['Index'] is not None assert ray_df.memory_usage(deep=True).sum() >= \ ray_df.memory_usage(deep=False).sum() def test_merge(): - ray_df = rdf.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} - pandas_df = pd.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + ray_df = pd.DataFrame(frame_data) + pandas_df = pandas.DataFrame(frame_data) - ray_df2 = rdf.DataFrame({"col1": [0, 1, 2], "col2": [1, 5, 6]}) - - pandas_df2 = pd.DataFrame({"col1": [0, 1, 2], "col2": [1, 5, 6]}) + frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]} + ray_df2 = pd.DataFrame(frame_data2) + pandas_df2 = pandas.DataFrame(frame_data2) join_types = ["outer", "inner"] for how in join_types: @@ -2291,8 +2204,8 @@ def test_merge(): @pytest.fixture def test_min(ray_df, pandas_df): - assert(ray_series_equals_pandas(ray_df.min(), pandas_df.min())) - assert(ray_series_equals_pandas(ray_df.min(axis=1), pandas_df.min(axis=1))) + assert ray_series_equals_pandas(ray_df.min(), pandas_df.min()) + assert ray_series_equals_pandas(ray_df.min(axis=1), pandas_df.min(axis=1)) def test_mod(): @@ -2301,9 +2214,9 @@ def test_mod(): @pytest.fixture def test_mode(ray_df, pandas_df): - assert(ray_series_equals_pandas(ray_df.mode(), pandas_df.mode())) - assert(ray_series_equals_pandas(ray_df.mode(axis=1), - pandas_df.mode(axis=1))) + assert ray_series_equals_pandas(ray_df.mode(), pandas_df.mode()) + assert ray_series_equals_pandas(ray_df.mode(axis=1), + pandas_df.mode(axis=1)) def test_mul(): @@ -2327,12 +2240,12 @@ def test_nlargest(): @pytest.fixture def test_notna(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df.notna(), pandas_df.notna())) + assert ray_df_equals_pandas(ray_df.notna(), pandas_df.notna()) @pytest.fixture def test_notnull(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df.notnull(), pandas_df.notnull())) + assert ray_df_equals_pandas(ray_df.notnull(), pandas_df.notnull()) def test_nsmallest(): @@ -2344,10 +2257,9 @@ def test_nsmallest(): @pytest.fixture def test_nunique(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df.nunique(), - pandas_df.nunique())) - assert(ray_df_equals_pandas(ray_df.nunique(axis=1), - pandas_df.nunique(axis=1))) + assert ray_df_equals_pandas(ray_df.nunique(), pandas_df.nunique()) + assert ray_df_equals_pandas(ray_df.nunique(axis=1), + pandas_df.nunique(axis=1)) def test_pct_change(): @@ -2414,7 +2326,7 @@ def test_pop(ray_df, pandas_df): temp_pandas_df = pandas_df.copy() ray_popped = temp_ray_df.pop('col2') pandas_popped = temp_pandas_df.pop('col2') - assert ray_popped.sort_index().equals(pandas_popped.sort_index()) + assert ray_popped.equals(pandas_popped) assert ray_df_equals_pandas(temp_ray_df, temp_pandas_df) @@ -2424,17 +2336,17 @@ def test_pow(): @pytest.fixture def test_prod(ray_df, pandas_df): - assert(ray_df.prod().equals(pandas_df.prod())) + assert ray_df.prod().equals(pandas_df.prod()) @pytest.fixture def test_product(ray_df, pandas_df): - assert(ray_df.product().equals(pandas_df.product())) + assert ray_df.product().equals(pandas_df.product()) @pytest.fixture def test_quantile(ray_df, pandas_df, q): - assert(ray_df.quantile(q).equals(pandas_df.quantile(q))) + assert ray_df.quantile(q).equals(pandas_df.quantile(q)) @pytest.fixture @@ -2450,8 +2362,8 @@ def test_radd(): @pytest.fixture def test_rank(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df.rank(), pandas_df.rank())) - assert(ray_df_equals_pandas(ray_df.rank(axis=1), pandas_df.rank(axis=1))) + assert ray_df_equals_pandas(ray_df.rank(), pandas_df.rank()) + assert ray_df_equals_pandas(ray_df.rank(axis=1), pandas_df.rank(axis=1)) def test_rdiv(): @@ -2459,12 +2371,13 @@ def test_rdiv(): def test_reindex(): - pandas_df = pd.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0]}) - ray_df = from_pandas(pandas_df, 2) + frame_data = {'col1': [0, 1, 2, 3], + 'col2': [4, 5, 6, 7], + 'col3': [8, 9, 10, 11], + 'col4': [12, 13, 14, 15], + 'col5': [0, 0, 0, 0]} + pandas_df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) assert ray_df_equals_pandas( ray_df.reindex([0, 3, 2, 1]), pandas_df.reindex([0, 3, 2, 1])) @@ -2513,93 +2426,74 @@ def test_rename(): @pytest.fixture -def test_rename_sanity(num_partitions=2): +def test_rename_sanity(): test_data = TestData() - mapping = { - 'A': 'a', - 'B': 'b', - 'C': 'c', - 'D': 'd' - } + mapping = {'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd'} - ray_df = from_pandas(test_data.frame, num_partitions) - assert ray_df_equals_pandas( - ray_df.rename(columns=mapping), - test_data.frame.rename(columns=mapping) - ) + ray_df = pd.DataFrame(test_data.frame) + assert ray_df_equals_pandas(ray_df.rename(columns=mapping), + test_data.frame.rename(columns=mapping)) renamed2 = test_data.frame.rename(columns=str.lower) - assert ray_df_equals_pandas( - ray_df.rename(columns=str.lower), - renamed2 - ) + assert ray_df_equals_pandas(ray_df.rename(columns=str.lower), renamed2) - ray_df = from_pandas(renamed2, num_partitions) - assert ray_df_equals_pandas( - ray_df.rename(columns=str.upper), - renamed2.rename(columns=str.upper) - ) + ray_df = pd.DataFrame(renamed2) + assert ray_df_equals_pandas(ray_df.rename(columns=str.upper), + renamed2.rename(columns=str.upper)) # index - data = { - 'A': {'foo': 0, 'bar': 1} - } + data = {'A': {'foo': 0, 'bar': 1}} # gets sorted alphabetical - df = pd.DataFrame(data) - ray_df = from_pandas(df, num_partitions) + df = pandas.DataFrame(data) + ray_df = pd.DataFrame(data) tm.assert_index_equal( ray_df.rename(index={'foo': 'bar', 'bar': 'foo'}).index, - df.rename(index={'foo': 'bar', 'bar': 'foo'}).index - ) + df.rename(index={'foo': 'bar', 'bar': 'foo'}).index) - tm.assert_index_equal( - ray_df.rename(index=str.upper).index, - df.rename(index=str.upper).index - ) + tm.assert_index_equal(ray_df.rename(index=str.upper).index, + df.rename(index=str.upper).index) # have to pass something pytest.raises(TypeError, ray_df.rename) # partial columns renamed = test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'}) - ray_df = from_pandas(test_data.frame, num_partitions) + ray_df = pd.DataFrame(test_data.frame) tm.assert_index_equal( ray_df.rename(columns={'C': 'foo', 'D': 'bar'}).index, - test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'}).index - ) + test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'}).index) # TODO: Uncomment when transpose works # other axis # renamed = test_data.frame.T.rename(index={'C': 'foo', 'D': 'bar'}) # tm.assert_index_equal( # test_data.frame.T.rename(index={'C': 'foo', 'D': 'bar'}).index, - # ray_df.T.rename(index={'C': 'foo', 'D': 'bar'}).index - # ) + # ray_df.T.rename(index={'C': 'foo', 'D': 'bar'}).index) # index with name - index = pd.Index(['foo', 'bar'], name='name') - renamer = pd.DataFrame(data, index=index) + index = pandas.Index(['foo', 'bar'], name='name') + renamer = pandas.DataFrame(data, index=index) + ray_df = pd.DataFrame(data, index=index) - ray_df = from_pandas(renamer, num_partitions) renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'}) ray_renamed = ray_df.rename(index={'foo': 'bar', 'bar': 'foo'}) - tm.assert_index_equal( - renamed.index, ray_renamed.index - ) + tm.assert_index_equal(renamed.index, ray_renamed.index) assert renamed.index.name == ray_renamed.index.name @pytest.fixture -def test_rename_multiindex(num_partitions=2): +def test_rename_multiindex(): tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')] tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')] - index = pd.MultiIndex.from_tuples(tuples_index, names=['foo', 'bar']) - columns = pd.MultiIndex.from_tuples( + index = pandas.MultiIndex.from_tuples(tuples_index, names=['foo', 'bar']) + columns = pandas.MultiIndex.from_tuples( tuples_columns, names=['fizz', 'buzz']) - df = pd.DataFrame([(0, 0), (1, 1)], index=index, columns=columns) - ray_df = from_pandas(df, num_partitions) + + frame_data = [(0, 0), (1, 1)] + df = pandas.DataFrame(frame_data, index=index, columns=columns) + ray_df = pd.DataFrame(frame_data, index=index, columns=columns) # # without specifying level -> accross all levels @@ -2607,9 +2501,7 @@ def test_rename_multiindex(num_partitions=2): columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) ray_renamed = ray_df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) - tm.assert_index_equal( - renamed.index, ray_renamed.index - ) + tm.assert_index_equal(renamed.index, ray_renamed.index) renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) @@ -2618,7 +2510,7 @@ def test_rename_multiindex(num_partitions=2): assert renamed.columns.names == ray_renamed.columns.names # - # with specifying a level (GH13766) + # with specifying a level # dict renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, @@ -2632,8 +2524,7 @@ def test_rename_multiindex(num_partitions=2): level='fizz') tm.assert_index_equal(renamed.columns, ray_renamed.columns) - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level=1) + renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, level=1) ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, level=1) tm.assert_index_equal(renamed.columns, ray_renamed.columns) @@ -2660,49 +2551,43 @@ def test_rename_multiindex(num_partitions=2): tm.assert_index_equal(renamed.columns, ray_renamed.columns) # index - renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, - level=0) + renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, level=0) ray_renamed = ray_df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, level=0) tm.assert_index_equal(ray_renamed.index, renamed.index) @pytest.fixture -def test_rename_nocopy(num_partitions=2): +def test_rename_nocopy(): test_data = TestData().frame - ray_df = from_pandas(test_data, num_partitions) + ray_df = pd.DataFrame(test_data) ray_renamed = ray_df.rename(columns={'C': 'foo'}, copy=False) ray_renamed['foo'] = 1 assert (ray_df['C'] == 1).all() @pytest.fixture -def test_rename_inplace(num_partitions=2): +def test_rename_inplace(): test_data = TestData().frame - ray_df = from_pandas(test_data, num_partitions) + ray_df = pd.DataFrame(test_data) - assert ray_df_equals_pandas( - ray_df.rename(columns={'C': 'foo'}), - test_data.rename(columns={'C': 'foo'}) - ) + assert ray_df_equals_pandas(ray_df.rename(columns={'C': 'foo'}), + test_data.rename(columns={'C': 'foo'})) frame = test_data.copy() ray_frame = ray_df.copy() frame.rename(columns={'C': 'foo'}, inplace=True) ray_frame.rename(columns={'C': 'foo'}, inplace=True) - assert ray_df_equals_pandas( - ray_frame, - frame - ) + assert ray_df_equals_pandas(ray_frame, frame) @pytest.fixture -def test_rename_bug(num_partitions=2): - # GH 5344 +def test_rename_bug(): # rename set ref_locs, and set_index was not resetting - df = pd.DataFrame({0: ['foo', 'bar'], 1: ['bah', 'bas'], 2: [1, 2]}) - ray_df = from_pandas(df, num_partitions) + frame_data = {0: ['foo', 'bar'], 1: ['bah', 'bas'], 2: [1, 2]} + df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) df = df.rename(columns={0: 'a'}) df = df.rename(columns={1: 'b'}) # TODO: Uncomment when set_index is implemented @@ -2715,32 +2600,20 @@ def test_rename_bug(num_partitions=2): # ray_df = ray_df.set_index(['a', 'b']) # ray_df.columns = ['2001-01-01'] - assert ray_df_equals_pandas( - ray_df, - df - ) + assert ray_df_equals_pandas(ray_df, df) -def test_rename_axis(): - test_rename_axis_inplace() - - -@pytest.fixture -def test_rename_axis_inplace(num_partitions=2): +def test_rename_axis_inplace(): test_frame = TestData().frame - ray_df = from_pandas(test_frame, num_partitions) + ray_df = pd.DataFrame(test_frame) - # GH 15704 result = test_frame.copy() ray_result = ray_df.copy() no_return = result.rename_axis('foo', inplace=True) ray_no_return = ray_result.rename_axis('foo', inplace=True) assert no_return is ray_no_return - assert ray_df_equals_pandas( - ray_result, - result - ) + assert ray_df_equals_pandas(ray_result, result) result = test_frame.copy() ray_result = ray_df.copy() @@ -2748,10 +2621,7 @@ def test_rename_axis_inplace(num_partitions=2): ray_no_return = ray_result.rename_axis('bar', axis=1, inplace=True) assert no_return is ray_no_return - assert ray_df_equals_pandas( - ray_result, - result - ) + assert ray_df_equals_pandas(ray_result, result) def test_reorder_levels(): @@ -2845,25 +2715,27 @@ def test_select(): def test_select_dtypes(): - df = pd.DataFrame({'test1': list('abc'), - 'test2': np.arange(3, 6).astype('u1'), - 'test3': np.arange(8.0, 11.0, dtype='float64'), - 'test4': [True, False, True], - 'test5': pd.date_range('now', periods=3).values, - 'test6': list(range(5, 8))}) + frame_data = {'test1': list('abc'), + 'test2': np.arange(3, 6).astype('u1'), + 'test3': np.arange(8.0, 11.0, dtype='float64'), + 'test4': [True, False, True], + 'test5': pandas.date_range('now', periods=3).values, + 'test6': list(range(5, 8))} + df = pandas.DataFrame(frame_data) + rd = pd.DataFrame(frame_data) + include = np.float, 'integer' exclude = np.bool_, - rd = from_pandas(df, 2) r = rd.select_dtypes(include=include, exclude=exclude) e = df[["test2", "test3", "test6"]] - assert(ray_df_equals_pandas(r, e)) + assert ray_df_equals_pandas(r, e) try: - rdf.DataFrame().select_dtypes() - assert(False) + pd.DataFrame().select_dtypes() + assert False except ValueError: - assert(True) + assert True def test_sem(): @@ -2908,10 +2780,9 @@ def test_shift(): @pytest.fixture def test_skew(ray_df, pandas_df): - assert(ray_df_equals_pandas(ray_df.skew(), - pandas_df.skew())) - assert(ray_df_equals_pandas(ray_df.skew(axis=1), - pandas_df.skew(axis=1))) + assert ray_df_equals_pandas(ray_df.skew(), pandas_df.skew()) + assert ray_df_equals_pandas(ray_df.skew(axis=1), + pandas_df.skew(axis=1)) def test_slice_shift(): @@ -2922,11 +2793,12 @@ def test_slice_shift(): def test_sort_index(): - pandas_df = pd.DataFrame(np.random.randint(0, 100, size=(1000, 100))) - ray_df = rdf.DataFrame(pandas_df) + frame_data = np.random.randint(0, 100, size=(1000, 100)) + pandas_df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) - pandas_result = pandas_df.sort_index() - ray_result = ray_df.sort_index() + pandas_result = pandas_df + ray_result = ray_df assert ray_df_equals_pandas(ray_result, pandas_result) @@ -2937,8 +2809,9 @@ def test_sort_index(): def test_sort_values(): - pandas_df = pd.DataFrame(np.random.randint(0, 100, size=(1000, 100))) - ray_df = rdf.DataFrame(pandas_df) + frame_data = np.random.randint(0, 100, size=(1000, 100)) + pandas_df = pandas.DataFrame(frame_data) + ray_df = pd.DataFrame(frame_data) pandas_result = pandas_df.sort_values(by=1) ray_result = ray_df.sort_values(by=1) @@ -2984,7 +2857,7 @@ def test_stack(): @pytest.fixture def test_std(ray_df, pandas_df): - assert(ray_df.std().equals(pandas_df.std())) + assert ray_df.std().equals(pandas_df.std()) def test_sub(): @@ -3104,32 +2977,32 @@ def test_unstack(): def test_update(): - df = rdf.DataFrame([[1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3], - [1.5, np.nan, 3]]) + df = pd.DataFrame([[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3], + [1.5, np.nan, 3]]) - other = rdf.DataFrame([[3.6, 2., np.nan], - [np.nan, np.nan, 7]], index=[1, 3]) + other = pd.DataFrame([[3.6, 2., np.nan], + [np.nan, np.nan, 7]], index=[1, 3]) df.update(other) - expected = rdf.DataFrame([[1.5, np.nan, 3], - [3.6, 2, 3], - [1.5, np.nan, 3], - [1.5, np.nan, 7.]]) + expected = pd.DataFrame([[1.5, np.nan, 3], + [3.6, 2, 3], + [1.5, np.nan, 3], + [1.5, np.nan, 7.]]) assert ray_df_equals(df, expected) @pytest.fixture def test_var(ray_df, pandas_df): - assert(ray_df.var().equals(pandas_df.var())) + assert ray_df.var().equals(pandas_df.var()) def test_where(): - pandas_df = pd.DataFrame(np.random.randn(100, 10), - columns=list('abcdefghij')) - ray_df = rdf.DataFrame(pandas_df) + frame_data = np.random.randn(100, 10) + pandas_df = pandas.DataFrame(frame_data, columns=list('abcdefghij')) + ray_df = pd.DataFrame(frame_data, columns=list('abcdefghij')) pandas_cond_df = pandas_df % 5 < 2 ray_cond_df = ray_df % 5 < 2 @@ -3169,7 +3042,7 @@ def test_xs(): @pytest.fixture def test___getitem__(ray_df, pd_df): ray_col = ray_df.__getitem__('col1') - assert isinstance(ray_col, pd.Series) + assert isinstance(ray_col, pandas.Series) pd_col = pd_df['col1'] assert pd_col.equals(ray_col) @@ -3179,17 +3052,17 @@ def test___getattr__(): df = create_test_dataframe() col = df.__getattr__("col1") - assert isinstance(col, pd.Series) + assert isinstance(col, pandas.Series) col = getattr(df, "col1") - assert isinstance(col, pd.Series) + assert isinstance(col, pandas.Series) col = df.col1 - assert isinstance(col, pd.Series) + assert isinstance(col, pandas.Series) # Check that lookup in column doesn't override other attributes df2 = df.rename(index=str, columns={"col5": "columns"}) - assert isinstance(df2.columns, pd.Index) + assert isinstance(df2.columns, pandas.Index) def test___setitem__(): @@ -3201,7 +3074,7 @@ def test___setitem__(): @pytest.fixture def test___len__(ray_df, pandas_df): - assert((len(ray_df) == len(pandas_df))) + assert len(ray_df) == len(pandas_df) def test___unicode__(): @@ -3265,7 +3138,7 @@ def test___bool__(): @pytest.fixture def test___abs__(ray_df, pandas_df): - assert(ray_df_equals_pandas(abs(ray_df), abs(pandas_df))) + assert ray_df_equals_pandas(abs(ray_df), abs(pandas_df)) def test___round__(): @@ -3418,33 +3291,32 @@ def test_iloc(ray_df, pd_df): def test__doc__(): - assert rdf.DataFrame.__doc__ != pd.DataFrame.__doc__ - assert rdf.DataFrame.__init__ != pd.DataFrame.__init__ - for attr, obj in rdf.DataFrame.__dict__.items(): + assert pd.DataFrame.__doc__ != pandas.DataFrame.__doc__ + assert pd.DataFrame.__init__ != pandas.DataFrame.__init__ + for attr, obj in pd.DataFrame.__dict__.items(): if (callable(obj) or isinstance(obj, property)) \ and attr != "__init__": - pd_obj = getattr(pd.DataFrame, attr, None) + pd_obj = getattr(pandas.DataFrame, attr, None) if callable(pd_obj) or isinstance(pd_obj, property): assert obj.__doc__ == pd_obj.__doc__ def test_to_datetime(): - ray_df = rdf.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}) - pd_df = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}) + frame_data = {'year': [2015, 2016], + 'month': [2, 3], + 'day': [4, 5]} + ray_df = pd.DataFrame(frame_data) + pd_df = pandas.DataFrame(frame_data) - rdf.to_datetime(ray_df).equals(pd.to_datetime(pd_df)) + pd.to_datetime(ray_df).equals(pandas.to_datetime(pd_df)) def test_get_dummies(): - ray_df = rdf.DataFrame({'A': ['a', 'b', 'a'], - 'B': ['b', 'a', 'c'], - 'C': [1, 2, 3]}) - pd_df = pd.DataFrame({'A': ['a', 'b', 'a'], - 'B': ['b', 'a', 'c'], - 'C': [1, 2, 3]}) + frame_data = {'A': ['a', 'b', 'a'], + 'B': ['b', 'a', 'c'], + 'C': [1, 2, 3]} + ray_df = pd.DataFrame(frame_data) + pd_df = pandas.DataFrame(frame_data) - assert ray_df_equals_pandas(rdf.get_dummies(ray_df), pd.get_dummies(pd_df)) + assert ray_df_equals_pandas(pd.get_dummies(ray_df), + pandas.get_dummies(pd_df))