mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
Implement unicode serialization
This commit is contained in:
parent
a81dc0c541
commit
8e165d43d4
6 changed files with 85 additions and 52 deletions
|
@ -8,7 +8,9 @@ SequenceBuilder::SequenceBuilder(MemoryPool* pool)
|
|||
: pool_(pool), types_(pool), offsets_(pool),
|
||||
nones_(pool, std::make_shared<NullType>()),
|
||||
bools_(pool, std::make_shared<BooleanType>()),
|
||||
ints_(pool), strings_(pool, std::make_shared<StringType>()),
|
||||
ints_(pool),
|
||||
bytes_(pool, std::make_shared<BinaryType>()),
|
||||
strings_(pool, std::make_shared<StringType>()),
|
||||
floats_(pool), doubles_(pool),
|
||||
uint8_tensors_(std::make_shared<UInt8Type>(), pool),
|
||||
int8_tensors_(std::make_shared<Int8Type>(), pool),
|
||||
|
@ -31,46 +33,51 @@ SequenceBuilder::SequenceBuilder(MemoryPool* pool)
|
|||
RETURN_NOT_OK(types_.Append(TAG)); \
|
||||
RETURN_NOT_OK(nones_.AppendToBitmap(true));
|
||||
|
||||
Status SequenceBuilder::Append() {
|
||||
Status SequenceBuilder::AppendNone() {
|
||||
RETURN_NOT_OK(offsets_.Append(0));
|
||||
RETURN_NOT_OK(types_.Append(0));
|
||||
return nones_.AppendToBitmap(false);
|
||||
}
|
||||
|
||||
Status SequenceBuilder::Append(bool data) {
|
||||
Status SequenceBuilder::AppendBool(bool data) {
|
||||
UPDATE(bools_.length(), bool_tag);
|
||||
return bools_.Append(data);
|
||||
}
|
||||
|
||||
Status SequenceBuilder::Append(int64_t data) {
|
||||
Status SequenceBuilder::AppendInt64(int64_t data) {
|
||||
UPDATE(ints_.length(), int_tag);
|
||||
return ints_.Append(data);
|
||||
}
|
||||
|
||||
Status SequenceBuilder::Append(uint64_t data) {
|
||||
Status SequenceBuilder::AppendUInt64(uint64_t data) {
|
||||
UPDATE(ints_.length(), int_tag);
|
||||
return ints_.Append(data);
|
||||
}
|
||||
|
||||
Status SequenceBuilder::Append(const char* data, int32_t length) {
|
||||
Status SequenceBuilder::AppendBytes(const uint8_t* data, int32_t length) {
|
||||
UPDATE(bytes_.length(), bytes_tag);
|
||||
return bytes_.Append(data, length);
|
||||
}
|
||||
|
||||
Status SequenceBuilder::AppendString(const char* data, int32_t length) {
|
||||
UPDATE(strings_.length(), string_tag);
|
||||
return strings_.Append(data, length);
|
||||
}
|
||||
|
||||
Status SequenceBuilder::Append(float data) {
|
||||
Status SequenceBuilder::AppendFloat(float data) {
|
||||
UPDATE(floats_.length(), float_tag);
|
||||
return floats_.Append(data);
|
||||
}
|
||||
|
||||
Status SequenceBuilder::Append(double data) {
|
||||
Status SequenceBuilder::AppendDouble(double data) {
|
||||
UPDATE(doubles_.length(), double_tag);
|
||||
return doubles_.Append(data);
|
||||
}
|
||||
|
||||
#define DEF_TENSOR_APPEND(NAME, TYPE, TAG) \
|
||||
Status SequenceBuilder::Append(const std::vector<int64_t>& dims, TYPE* data) { \
|
||||
UPDATE(NAME.length(), TAG); \
|
||||
return NAME.Append(dims, data); \
|
||||
#define DEF_TENSOR_APPEND(NAME, TYPE, TAG) \
|
||||
Status SequenceBuilder::AppendTensor(const std::vector<int64_t>& dims, TYPE* data) { \
|
||||
UPDATE(NAME.length(), TAG); \
|
||||
return NAME.Append(dims, data); \
|
||||
}
|
||||
|
||||
DEF_TENSOR_APPEND(uint8_tensors_, uint8_t, uint8_tensor_tag);
|
||||
|
@ -136,6 +143,7 @@ std::shared_ptr<DenseUnionArray> SequenceBuilder::Finish(
|
|||
ADD_ELEMENT(bools_, bool_tag);
|
||||
ADD_ELEMENT(ints_, int_tag);
|
||||
ADD_ELEMENT(strings_, string_tag);
|
||||
ADD_ELEMENT(bytes_, bytes_tag);
|
||||
ADD_ELEMENT(floats_, float_tag);
|
||||
ADD_ELEMENT(doubles_, double_tag);
|
||||
|
||||
|
|
|
@ -15,25 +15,28 @@ class SequenceBuilder {
|
|||
SequenceBuilder(arrow::MemoryPool* pool = nullptr);
|
||||
|
||||
//! Appending a none to the sequence
|
||||
arrow::Status Append();
|
||||
arrow::Status AppendNone();
|
||||
|
||||
//! Appending a boolean to the sequence
|
||||
arrow::Status Append(bool data);
|
||||
arrow::Status AppendBool(bool data);
|
||||
|
||||
//! Appending an int64_t to the sequence
|
||||
arrow::Status Append(int64_t data);
|
||||
arrow::Status AppendInt64(int64_t data);
|
||||
|
||||
//! Appending an uint64_t to the sequence
|
||||
arrow::Status Append(uint64_t data);
|
||||
arrow::Status AppendUInt64(uint64_t data);
|
||||
|
||||
//! Append a list of bytes to the sequence
|
||||
arrow::Status AppendBytes(const uint8_t* data, int32_t length);
|
||||
|
||||
//! Appending a string to the sequence
|
||||
arrow::Status Append(const char* data, int32_t length);
|
||||
arrow::Status AppendString(const char* data, int32_t length);
|
||||
|
||||
//! Appending a float to the sequence
|
||||
arrow::Status Append(float data);
|
||||
arrow::Status AppendFloat(float data);
|
||||
|
||||
//! Appending a double to the sequence
|
||||
arrow::Status Append(double data);
|
||||
arrow::Status AppendDouble(double data);
|
||||
|
||||
/*! Appending a tensor to the sequence
|
||||
|
||||
|
@ -44,16 +47,16 @@ class SequenceBuilder {
|
|||
A pointer to the start of the data block. The length of the data block
|
||||
will be the product of the dimensions
|
||||
*/
|
||||
arrow::Status Append(const std::vector<int64_t>& dims, uint8_t* data);
|
||||
arrow::Status Append(const std::vector<int64_t>& dims, int8_t* data);
|
||||
arrow::Status Append(const std::vector<int64_t>& dims, uint16_t* data);
|
||||
arrow::Status Append(const std::vector<int64_t>& dims, int16_t* data);
|
||||
arrow::Status Append(const std::vector<int64_t>& dims, uint32_t* data);
|
||||
arrow::Status Append(const std::vector<int64_t>& dims, int32_t* data);
|
||||
arrow::Status Append(const std::vector<int64_t>& dims, uint64_t* data);
|
||||
arrow::Status Append(const std::vector<int64_t>& dims, int64_t* data);
|
||||
arrow::Status Append(const std::vector<int64_t>& dims, float* data);
|
||||
arrow::Status Append(const std::vector<int64_t>& dims, double* data);
|
||||
arrow::Status AppendTensor(const std::vector<int64_t>& dims, uint8_t* data);
|
||||
arrow::Status AppendTensor(const std::vector<int64_t>& dims, int8_t* data);
|
||||
arrow::Status AppendTensor(const std::vector<int64_t>& dims, uint16_t* data);
|
||||
arrow::Status AppendTensor(const std::vector<int64_t>& dims, int16_t* data);
|
||||
arrow::Status AppendTensor(const std::vector<int64_t>& dims, uint32_t* data);
|
||||
arrow::Status AppendTensor(const std::vector<int64_t>& dims, int32_t* data);
|
||||
arrow::Status AppendTensor(const std::vector<int64_t>& dims, uint64_t* data);
|
||||
arrow::Status AppendTensor(const std::vector<int64_t>& dims, int64_t* data);
|
||||
arrow::Status AppendTensor(const std::vector<int64_t>& dims, float* data);
|
||||
arrow::Status AppendTensor(const std::vector<int64_t>& dims, double* data);
|
||||
|
||||
/*! Add a sublist to the sequence. The data contained in the sublist will be
|
||||
specified in the "Finish" method.
|
||||
|
@ -90,6 +93,7 @@ class SequenceBuilder {
|
|||
arrow::NullArrayBuilder nones_;
|
||||
arrow::BooleanBuilder bools_;
|
||||
arrow::Int64Builder ints_;
|
||||
arrow::BinaryBuilder bytes_;
|
||||
arrow::StringBuilder strings_;
|
||||
arrow::FloatBuilder floats_;
|
||||
arrow::DoubleBuilder doubles_;
|
||||
|
@ -112,6 +116,7 @@ class SequenceBuilder {
|
|||
int8_t bool_tag = -1;
|
||||
int8_t int_tag = -1;
|
||||
int8_t string_tag = -1;
|
||||
int8_t bytes_tag = -1;
|
||||
int8_t float_tag = -1;
|
||||
int8_t double_tag = -1;
|
||||
|
||||
|
|
|
@ -65,34 +65,34 @@ Status SerializeArray(PyArrayObject* array, SequenceBuilder& builder) {
|
|||
auto data = PyArray_DATA(contiguous);
|
||||
switch (dtype) {
|
||||
case NPY_UINT8:
|
||||
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<uint8_t*>(data)));
|
||||
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<uint8_t*>(data)));
|
||||
break;
|
||||
case NPY_INT8:
|
||||
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<int8_t*>(data)));
|
||||
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<int8_t*>(data)));
|
||||
break;
|
||||
case NPY_UINT16:
|
||||
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<uint16_t*>(data)));
|
||||
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<uint16_t*>(data)));
|
||||
break;
|
||||
case NPY_INT16:
|
||||
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<int16_t*>(data)));
|
||||
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<int16_t*>(data)));
|
||||
break;
|
||||
case NPY_UINT32:
|
||||
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<uint32_t*>(data)));
|
||||
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<uint32_t*>(data)));
|
||||
break;
|
||||
case NPY_INT32:
|
||||
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<int32_t*>(data)));
|
||||
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<int32_t*>(data)));
|
||||
break;
|
||||
case NPY_UINT64:
|
||||
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<uint64_t*>(data)));
|
||||
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<uint64_t*>(data)));
|
||||
break;
|
||||
case NPY_INT64:
|
||||
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<int64_t*>(data)));
|
||||
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<int64_t*>(data)));
|
||||
break;
|
||||
case NPY_FLOAT:
|
||||
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<float*>(data)));
|
||||
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<float*>(data)));
|
||||
break;
|
||||
case NPY_DOUBLE:
|
||||
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<double*>(data)));
|
||||
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<double*>(data)));
|
||||
break;
|
||||
default:
|
||||
DCHECK(false) << "numpy data type not recognized: " << dtype;
|
||||
|
|
|
@ -15,10 +15,15 @@ PyObject* get_value(ArrayPtr arr, int32_t index, int32_t type) {
|
|||
return PyBool_FromLong(std::static_pointer_cast<BooleanArray>(arr)->Value(index));
|
||||
case Type::INT64:
|
||||
return PyInt_FromLong(std::static_pointer_cast<Int64Array>(arr)->Value(index));
|
||||
case Type::BINARY: {
|
||||
int32_t nchars;
|
||||
const uint8_t* str = std::static_pointer_cast<BinaryArray>(arr)->GetValue(index, &nchars);
|
||||
return PyString_FromStringAndSize(reinterpret_cast<const char*>(str), nchars);
|
||||
}
|
||||
case Type::STRING: {
|
||||
int32_t nchars;
|
||||
const uint8_t* str = std::static_pointer_cast<StringArray>(arr)->GetValue(index, &nchars);
|
||||
return PyString_FromStringAndSize(reinterpret_cast<const char*>(str), nchars);
|
||||
return PyUnicode_FromStringAndSize(reinterpret_cast<const char*>(str), nchars);
|
||||
}
|
||||
case Type::FLOAT:
|
||||
return PyFloat_FromDouble(std::static_pointer_cast<FloatArray>(arr)->Value(index));
|
||||
|
@ -50,20 +55,34 @@ Status append(PyObject* elem, SequenceBuilder& builder,
|
|||
std::vector<PyObject*>& subdicts) {
|
||||
// The bool case must precede the int case (PyInt_Check passes for bools)
|
||||
if (PyBool_Check(elem)) {
|
||||
RETURN_NOT_OK(builder.Append(elem == Py_True));
|
||||
RETURN_NOT_OK(builder.AppendBool(elem == Py_True));
|
||||
} else if (PyFloat_Check(elem)) {
|
||||
RETURN_NOT_OK(builder.Append(PyFloat_AS_DOUBLE(elem)));
|
||||
RETURN_NOT_OK(builder.AppendFloat(PyFloat_AS_DOUBLE(elem)));
|
||||
} else if (PyLong_Check(elem)) {
|
||||
int overflow = 0;
|
||||
int64_t data = PyLong_AsLongLongAndOverflow(elem, &overflow);
|
||||
RETURN_NOT_OK(builder.Append(data));
|
||||
RETURN_NOT_OK(builder.AppendInt64(data));
|
||||
if(overflow) {
|
||||
return Status::NotImplemented("long overflow");
|
||||
}
|
||||
} else if (PyInt_Check(elem)) {
|
||||
RETURN_NOT_OK(builder.Append(static_cast<int64_t>(PyInt_AS_LONG(elem))));
|
||||
RETURN_NOT_OK(builder.AppendInt64(static_cast<int64_t>(PyInt_AS_LONG(elem))));
|
||||
} else if (PyString_Check(elem)) {
|
||||
RETURN_NOT_OK(builder.Append(PyString_AS_STRING(elem), PyString_GET_SIZE(elem)));
|
||||
auto data = reinterpret_cast<uint8_t*>(PyString_AS_STRING(elem));
|
||||
auto size = PyString_GET_SIZE(elem);
|
||||
RETURN_NOT_OK(builder.AppendBytes(data, size));
|
||||
} else if (PyUnicode_Check(elem)) {
|
||||
Py_ssize_t size;
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
char* data = PyUnicode_AsUTF8AndSize(elem, &size); // TODO(pcm): Check if this is correct
|
||||
#else
|
||||
PyObject* str = PyUnicode_AsUTF8String(elem);
|
||||
char* data = PyString_AS_STRING(str);
|
||||
size = PyString_GET_SIZE(str);
|
||||
#endif
|
||||
Status s = builder.AppendString(data, size);
|
||||
Py_XDECREF(str);
|
||||
RETURN_NOT_OK(s);
|
||||
} else if (PyList_Check(elem)) {
|
||||
builder.AppendList(PyList_Size(elem));
|
||||
sublists.push_back(elem);
|
||||
|
@ -78,7 +97,7 @@ Status append(PyObject* elem, SequenceBuilder& builder,
|
|||
} else if (PyArray_Check(elem)) {
|
||||
RETURN_NOT_OK(SerializeArray((PyArrayObject*) elem, builder));
|
||||
} else if (elem == Py_None) {
|
||||
RETURN_NOT_OK(builder.Append());
|
||||
RETURN_NOT_OK(builder.AppendNone());
|
||||
} else {
|
||||
std::stringstream ss;
|
||||
ss << "data type of " << PyString_AS_STRING(PyObject_Repr(elem))
|
||||
|
|
|
@ -16,11 +16,11 @@ namespace numbuf {
|
|||
|
||||
arrow::Status AppendScalar(PyObject* obj, SequenceBuilder& builder) {
|
||||
if (PyArray_IsScalar(obj, Bool)) {
|
||||
return builder.Append(((PyBoolScalarObject *)obj)->obval != 0);
|
||||
return builder.AppendBool(((PyBoolScalarObject *)obj)->obval != 0);
|
||||
} else if (PyArray_IsScalar(obj, Float)) {
|
||||
return builder.Append(((PyFloatScalarObject *)obj)->obval);
|
||||
return builder.AppendFloat(((PyFloatScalarObject *)obj)->obval);
|
||||
} else if (PyArray_IsScalar(obj, Double)) {
|
||||
return builder.Append(((PyDoubleScalarObject *)obj)->obval);
|
||||
return builder.AppendDouble(((PyDoubleScalarObject *)obj)->obval);
|
||||
}
|
||||
int64_t value = 0;
|
||||
if (PyArray_IsScalar(obj, Byte)) {
|
||||
|
@ -46,7 +46,7 @@ arrow::Status AppendScalar(PyObject* obj, SequenceBuilder& builder) {
|
|||
} else {
|
||||
DCHECK(false) << "scalar type not recognized";
|
||||
}
|
||||
return builder.Append(value);
|
||||
return builder.AppendInt64(value);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
|
|
@ -3,8 +3,9 @@ import libnumbuf
|
|||
import numpy as np
|
||||
from numpy.testing import assert_equal
|
||||
|
||||
TEST_OBJECTS = [[1, "hello", 3.0], 42, 43L, "hello world", 42.0, 1L << 62,
|
||||
(1.0, "hi"), None, (None, None), ("hello", None),
|
||||
TEST_OBJECTS = [[1, "hello", 3.0], 42, 43L, "hello world", u"x", u"\u262F",
|
||||
42.0, 1L << 62, (1.0, "hi"),
|
||||
None, (None, None), ("hello", None),
|
||||
True, False, (True, False), "hello",
|
||||
{True: "hello", False: "world"},
|
||||
{"hello" : "world", 1: 42, 1.0: 45}, {},
|
||||
|
|
Loading…
Add table
Reference in a new issue