diff --git a/cpp/src/numbuf/sequence.cc b/cpp/src/numbuf/sequence.cc index ae250180f..a5e1cf8ca 100644 --- a/cpp/src/numbuf/sequence.cc +++ b/cpp/src/numbuf/sequence.cc @@ -8,7 +8,9 @@ SequenceBuilder::SequenceBuilder(MemoryPool* pool) : pool_(pool), types_(pool), offsets_(pool), nones_(pool, std::make_shared()), bools_(pool, std::make_shared()), - ints_(pool), strings_(pool, std::make_shared()), + ints_(pool), + bytes_(pool, std::make_shared()), + strings_(pool, std::make_shared()), floats_(pool), doubles_(pool), uint8_tensors_(std::make_shared(), pool), int8_tensors_(std::make_shared(), pool), @@ -31,46 +33,51 @@ SequenceBuilder::SequenceBuilder(MemoryPool* pool) RETURN_NOT_OK(types_.Append(TAG)); \ RETURN_NOT_OK(nones_.AppendToBitmap(true)); -Status SequenceBuilder::Append() { +Status SequenceBuilder::AppendNone() { RETURN_NOT_OK(offsets_.Append(0)); RETURN_NOT_OK(types_.Append(0)); return nones_.AppendToBitmap(false); } -Status SequenceBuilder::Append(bool data) { +Status SequenceBuilder::AppendBool(bool data) { UPDATE(bools_.length(), bool_tag); return bools_.Append(data); } -Status SequenceBuilder::Append(int64_t data) { +Status SequenceBuilder::AppendInt64(int64_t data) { UPDATE(ints_.length(), int_tag); return ints_.Append(data); } -Status SequenceBuilder::Append(uint64_t data) { +Status SequenceBuilder::AppendUInt64(uint64_t data) { UPDATE(ints_.length(), int_tag); return ints_.Append(data); } -Status SequenceBuilder::Append(const char* data, int32_t length) { +Status SequenceBuilder::AppendBytes(const uint8_t* data, int32_t length) { + UPDATE(bytes_.length(), bytes_tag); + return bytes_.Append(data, length); +} + +Status SequenceBuilder::AppendString(const char* data, int32_t length) { UPDATE(strings_.length(), string_tag); return strings_.Append(data, length); } -Status SequenceBuilder::Append(float data) { +Status SequenceBuilder::AppendFloat(float data) { UPDATE(floats_.length(), float_tag); return floats_.Append(data); } -Status SequenceBuilder::Append(double data) { +Status SequenceBuilder::AppendDouble(double data) { UPDATE(doubles_.length(), double_tag); return doubles_.Append(data); } -#define DEF_TENSOR_APPEND(NAME, TYPE, TAG) \ - Status SequenceBuilder::Append(const std::vector& dims, TYPE* data) { \ - UPDATE(NAME.length(), TAG); \ - return NAME.Append(dims, data); \ +#define DEF_TENSOR_APPEND(NAME, TYPE, TAG) \ + Status SequenceBuilder::AppendTensor(const std::vector& dims, TYPE* data) { \ + UPDATE(NAME.length(), TAG); \ + return NAME.Append(dims, data); \ } DEF_TENSOR_APPEND(uint8_tensors_, uint8_t, uint8_tensor_tag); @@ -136,6 +143,7 @@ std::shared_ptr SequenceBuilder::Finish( ADD_ELEMENT(bools_, bool_tag); ADD_ELEMENT(ints_, int_tag); ADD_ELEMENT(strings_, string_tag); + ADD_ELEMENT(bytes_, bytes_tag); ADD_ELEMENT(floats_, float_tag); ADD_ELEMENT(doubles_, double_tag); diff --git a/cpp/src/numbuf/sequence.h b/cpp/src/numbuf/sequence.h index f30ea00fe..7237dbd83 100644 --- a/cpp/src/numbuf/sequence.h +++ b/cpp/src/numbuf/sequence.h @@ -15,25 +15,28 @@ class SequenceBuilder { SequenceBuilder(arrow::MemoryPool* pool = nullptr); //! Appending a none to the sequence - arrow::Status Append(); + arrow::Status AppendNone(); //! Appending a boolean to the sequence - arrow::Status Append(bool data); + arrow::Status AppendBool(bool data); //! Appending an int64_t to the sequence - arrow::Status Append(int64_t data); + arrow::Status AppendInt64(int64_t data); //! Appending an uint64_t to the sequence - arrow::Status Append(uint64_t data); + arrow::Status AppendUInt64(uint64_t data); + + //! Append a list of bytes to the sequence + arrow::Status AppendBytes(const uint8_t* data, int32_t length); //! Appending a string to the sequence - arrow::Status Append(const char* data, int32_t length); + arrow::Status AppendString(const char* data, int32_t length); //! Appending a float to the sequence - arrow::Status Append(float data); + arrow::Status AppendFloat(float data); //! Appending a double to the sequence - arrow::Status Append(double data); + arrow::Status AppendDouble(double data); /*! Appending a tensor to the sequence @@ -44,16 +47,16 @@ class SequenceBuilder { A pointer to the start of the data block. The length of the data block will be the product of the dimensions */ - arrow::Status Append(const std::vector& dims, uint8_t* data); - arrow::Status Append(const std::vector& dims, int8_t* data); - arrow::Status Append(const std::vector& dims, uint16_t* data); - arrow::Status Append(const std::vector& dims, int16_t* data); - arrow::Status Append(const std::vector& dims, uint32_t* data); - arrow::Status Append(const std::vector& dims, int32_t* data); - arrow::Status Append(const std::vector& dims, uint64_t* data); - arrow::Status Append(const std::vector& dims, int64_t* data); - arrow::Status Append(const std::vector& dims, float* data); - arrow::Status Append(const std::vector& dims, double* data); + arrow::Status AppendTensor(const std::vector& dims, uint8_t* data); + arrow::Status AppendTensor(const std::vector& dims, int8_t* data); + arrow::Status AppendTensor(const std::vector& dims, uint16_t* data); + arrow::Status AppendTensor(const std::vector& dims, int16_t* data); + arrow::Status AppendTensor(const std::vector& dims, uint32_t* data); + arrow::Status AppendTensor(const std::vector& dims, int32_t* data); + arrow::Status AppendTensor(const std::vector& dims, uint64_t* data); + arrow::Status AppendTensor(const std::vector& dims, int64_t* data); + arrow::Status AppendTensor(const std::vector& dims, float* data); + arrow::Status AppendTensor(const std::vector& dims, double* data); /*! Add a sublist to the sequenc. The data contained in the sublist will be specified in the "Finish" method. @@ -90,6 +93,7 @@ class SequenceBuilder { arrow::NullArrayBuilder nones_; arrow::BooleanBuilder bools_; arrow::Int64Builder ints_; + arrow::BinaryBuilder bytes_; arrow::StringBuilder strings_; arrow::FloatBuilder floats_; arrow::DoubleBuilder doubles_; @@ -112,6 +116,7 @@ class SequenceBuilder { int8_t bool_tag = -1; int8_t int_tag = -1; int8_t string_tag = -1; + int8_t bytes_tag = -1; int8_t float_tag = -1; int8_t double_tag = -1; diff --git a/python/src/pynumbuf/adapters/numpy.cc b/python/src/pynumbuf/adapters/numpy.cc index 67537fd25..45c4149a4 100644 --- a/python/src/pynumbuf/adapters/numpy.cc +++ b/python/src/pynumbuf/adapters/numpy.cc @@ -65,34 +65,34 @@ Status SerializeArray(PyArrayObject* array, SequenceBuilder& builder) { auto data = PyArray_DATA(contiguous); switch (dtype) { case NPY_UINT8: - RETURN_NOT_OK(builder.Append(dims, reinterpret_cast(data))); + RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast(data))); break; case NPY_INT8: - RETURN_NOT_OK(builder.Append(dims, reinterpret_cast(data))); + RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast(data))); break; case NPY_UINT16: - RETURN_NOT_OK(builder.Append(dims, reinterpret_cast(data))); + RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast(data))); break; case NPY_INT16: - RETURN_NOT_OK(builder.Append(dims, reinterpret_cast(data))); + RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast(data))); break; case NPY_UINT32: - RETURN_NOT_OK(builder.Append(dims, reinterpret_cast(data))); + RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast(data))); break; case NPY_INT32: - RETURN_NOT_OK(builder.Append(dims, reinterpret_cast(data))); + RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast(data))); break; case NPY_UINT64: - RETURN_NOT_OK(builder.Append(dims, reinterpret_cast(data))); + RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast(data))); break; case NPY_INT64: - RETURN_NOT_OK(builder.Append(dims, reinterpret_cast(data))); + RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast(data))); break; case NPY_FLOAT: - RETURN_NOT_OK(builder.Append(dims, reinterpret_cast(data))); + RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast(data))); break; case NPY_DOUBLE: - RETURN_NOT_OK(builder.Append(dims, reinterpret_cast(data))); + RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast(data))); break; default: DCHECK(false) << "numpy data type not recognized: " << dtype; diff --git a/python/src/pynumbuf/adapters/python.cc b/python/src/pynumbuf/adapters/python.cc index 7c747c4d9..790fd5497 100644 --- a/python/src/pynumbuf/adapters/python.cc +++ b/python/src/pynumbuf/adapters/python.cc @@ -15,10 +15,15 @@ PyObject* get_value(ArrayPtr arr, int32_t index, int32_t type) { return PyBool_FromLong(std::static_pointer_cast(arr)->Value(index)); case Type::INT64: return PyInt_FromLong(std::static_pointer_cast(arr)->Value(index)); + case Type::BINARY: { + int32_t nchars; + const uint8_t* str = std::static_pointer_cast(arr)->GetValue(index, &nchars); + return PyString_FromStringAndSize(reinterpret_cast(str), nchars); + } case Type::STRING: { int32_t nchars; const uint8_t* str = std::static_pointer_cast(arr)->GetValue(index, &nchars); - return PyString_FromStringAndSize(reinterpret_cast(str), nchars); + return PyUnicode_FromStringAndSize(reinterpret_cast(str), nchars); } case Type::FLOAT: return PyFloat_FromDouble(std::static_pointer_cast(arr)->Value(index)); @@ -50,20 +55,34 @@ Status append(PyObject* elem, SequenceBuilder& builder, std::vector& subdicts) { // The bool case must precede the int case (PyInt_Check passes for bools) if (PyBool_Check(elem)) { - RETURN_NOT_OK(builder.Append(elem == Py_True)); + RETURN_NOT_OK(builder.AppendBool(elem == Py_True)); } else if (PyFloat_Check(elem)) { - RETURN_NOT_OK(builder.Append(PyFloat_AS_DOUBLE(elem))); + RETURN_NOT_OK(builder.AppendFloat(PyFloat_AS_DOUBLE(elem))); } else if (PyLong_Check(elem)) { int overflow = 0; int64_t data = PyLong_AsLongLongAndOverflow(elem, &overflow); - RETURN_NOT_OK(builder.Append(data)); + RETURN_NOT_OK(builder.AppendInt64(data)); if(overflow) { return Status::NotImplemented("long overflow"); } } else if (PyInt_Check(elem)) { - RETURN_NOT_OK(builder.Append(static_cast(PyInt_AS_LONG(elem)))); + RETURN_NOT_OK(builder.AppendInt64(static_cast(PyInt_AS_LONG(elem)))); } else if (PyString_Check(elem)) { - RETURN_NOT_OK(builder.Append(PyString_AS_STRING(elem), PyString_GET_SIZE(elem))); + auto data = reinterpret_cast(PyString_AS_STRING(elem)); + auto size = PyString_GET_SIZE(elem); + RETURN_NOT_OK(builder.AppendBytes(data, size)); + } else if (PyUnicode_Check(elem)) { + Py_ssize_t size; + #if PY_MAJOR_VERSION >= 3 + char* data = PyUnicode_AsUTF8AndSize(elem, &size); // TODO(pcm): Check if this is correct + #else + PyObject* str = PyUnicode_AsUTF8String(elem); + char* data = PyString_AS_STRING(str); + size = PyString_GET_SIZE(str); + #endif + Status s = builder.AppendString(data, size); + Py_XDECREF(str); + RETURN_NOT_OK(s); } else if (PyList_Check(elem)) { builder.AppendList(PyList_Size(elem)); sublists.push_back(elem); @@ -78,7 +97,7 @@ Status append(PyObject* elem, SequenceBuilder& builder, } else if (PyArray_Check(elem)) { RETURN_NOT_OK(SerializeArray((PyArrayObject*) elem, builder)); } else if (elem == Py_None) { - RETURN_NOT_OK(builder.Append()); + RETURN_NOT_OK(builder.AppendNone()); } else { std::stringstream ss; ss << "data type of " << PyString_AS_STRING(PyObject_Repr(elem)) diff --git a/python/src/pynumbuf/adapters/scalars.h b/python/src/pynumbuf/adapters/scalars.h index 928289252..e680b446f 100644 --- a/python/src/pynumbuf/adapters/scalars.h +++ b/python/src/pynumbuf/adapters/scalars.h @@ -16,11 +16,11 @@ namespace numbuf { arrow::Status AppendScalar(PyObject* obj, SequenceBuilder& builder) { if (PyArray_IsScalar(obj, Bool)) { - return builder.Append(((PyBoolScalarObject *)obj)->obval != 0); + return builder.AppendBool(((PyBoolScalarObject *)obj)->obval != 0); } else if (PyArray_IsScalar(obj, Float)) { - return builder.Append(((PyFloatScalarObject *)obj)->obval); + return builder.AppendFloat(((PyFloatScalarObject *)obj)->obval); } else if (PyArray_IsScalar(obj, Double)) { - return builder.Append(((PyDoubleScalarObject *)obj)->obval); + return builder.AppendDouble(((PyDoubleScalarObject *)obj)->obval); } int64_t value = 0; if (PyArray_IsScalar(obj, Byte)) { @@ -46,7 +46,7 @@ arrow::Status AppendScalar(PyObject* obj, SequenceBuilder& builder) { } else { DCHECK(false) << "scalar type not recognized"; } - return builder.Append(value); + return builder.AppendInt64(value); } } // namespace diff --git a/python/test/runtest.py b/python/test/runtest.py index 5c713cc14..14545117d 100644 --- a/python/test/runtest.py +++ b/python/test/runtest.py @@ -3,8 +3,9 @@ import libnumbuf import numpy as np from numpy.testing import assert_equal -TEST_OBJECTS = [[1, "hello", 3.0], 42, 43L, "hello world", 42.0, 1L << 62, - (1.0, "hi"), None, (None, None), ("hello", None), +TEST_OBJECTS = [[1, "hello", 3.0], 42, 43L, "hello world", u"x", u"\u262F", + 42.0, 1L << 62, (1.0, "hi"), + None, (None, None), ("hello", None), True, False, (True, False), "hello", {True: "hello", False: "world"}, {"hello" : "world", 1: 42, 1.0: 45}, {},