Mirror of https://github.com/vale981/ray, synced 2025-03-06 10:31:39 -05:00
Rebase numbuf to latest arrow (#23)
* rebase to latest arrow
* fix arrow linking
* finish rebase
* point arrow to pcmoritz's arrow
* fix
* fix macOS
* fix
parent c3db225cbe
commit c3ab68e88c
15 changed files with 163 additions and 86 deletions
.gitmodules (vendored, 2 lines changed)
@@ -1,3 +1,3 @@
 [submodule "thirdparty/arrow"]
 	path = thirdparty/arrow
-	url = https://github.com/ray-project/arrow.git
+	url = https://github.com/pcmoritz/arrow.git
CMakeLists.txt
@@ -4,6 +4,10 @@ project(numbuf)
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 
+# Make libnumbuf.so look for shared libraries in the folder libnumbuf.so is in
+set(CMAKE_INSTALL_RPATH "$ORIGIN/")
+set(CMAKE_MACOSX_RPATH 1)
+
 if(NOT APPLE)
   find_package(PythonInterp REQUIRED)
   find_package(PythonLibs REQUIRED)
@@ -53,8 +57,25 @@ endif()
 set(ARROW_DIR "${CMAKE_SOURCE_DIR}/thirdparty/arrow/" CACHE STRING
   "Path of the arrow source directory")
 
-set(ARROW_STATIC_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow.a" CACHE STRING
-  "Path to libarrow.a (needs to be changed if arrow is build in debug mode)")
+if (APPLE)
+  set(ARROW_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow.dylib" CACHE STRING
+    "Path to libarrow.dylib (needs to be changed if arrow is build in debug mode)")
+
+  set(ARROW_IO_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow_io.dylib" CACHE STRING
+    "Path to libarrow_io.dylib (needs to be changed if arrow is build in debug mode)")
+
+  set(ARROW_IPC_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow_ipc.dylib" CACHE STRING
+    "Path to libarrow_ipc.dylib (needs to be changed if arrow is build in debug mode)")
+else()
+  set(ARROW_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow.so" CACHE STRING
+    "Path to libarrow.so (needs to be changed if arrow is build in debug mode)")
+
+  set(ARROW_IO_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow_io.so" CACHE STRING
+    "Path to libarrow_io.so (needs to be changed if arrow is build in debug mode)")
+
+  set(ARROW_IPC_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow_ipc.so" CACHE STRING
+    "Path to libarrow_ipc.so (needs to be changed if arrow is build in debug mode)")
+endif()
 
 include_directories("${ARROW_DIR}/cpp/src/")
 include_directories("cpp/src/")
@@ -75,8 +96,18 @@ if(APPLE)
   add_custom_command(TARGET numbuf
     POST_BUILD COMMAND
     ${CMAKE_INSTALL_NAME_TOOL} -change ${PYTHON_SHARED_LIBRARY} ${PYTHON_LIBRARIES} libnumbuf.so)
+  add_custom_command(TARGET numbuf
+    POST_BUILD COMMAND
+    ${CMAKE_INSTALL_NAME_TOOL} -change "@rpath/libarrow.dylib" "@loader_path/libarrow.dylib" libnumbuf.so)
+  add_custom_command(TARGET numbuf
+    POST_BUILD COMMAND
+    ${CMAKE_INSTALL_NAME_TOOL} -change "@rpath/libarrow_io.dylib" "@loader_path/libarrow_io.dylib" libnumbuf.so)
+  add_custom_command(TARGET numbuf
+    POST_BUILD COMMAND
+    ${CMAKE_INSTALL_NAME_TOOL} -change "@rpath/libarrow_ipc.dylib" "@loader_path/libarrow_ipc.dylib" libnumbuf.so)
 endif(APPLE)
 
-target_link_libraries(numbuf ${ARROW_STATIC_LIB} ${PYTHON_LIBRARIES})
+target_link_libraries(numbuf ${ARROW_LIB} ${ARROW_IO_LIB} ${ARROW_IPC_LIB} ${PYTHON_LIBRARIES})
 
-install(TARGETS numbuf DESTINATION ${CMAKE_SOURCE_DIR})
+install(TARGETS numbuf DESTINATION ${CMAKE_SOURCE_DIR}/numbuf/)
+install(FILES ${ARROW_LIB} ${ARROW_IO_LIB} ${ARROW_IPC_LIB} DESTINATION ${CMAKE_SOURCE_DIR}/numbuf/)
build.sh (2 lines changed)
@@ -15,6 +15,6 @@ fi
 
 mkdir -p "$ROOT_DIR/build"
 pushd "$ROOT_DIR/build"
-cmake -DCMAKE_BUILD_TYPE=Release ..
+cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-g" -DCMAKE_CXX_FLAGS="-g" ..
 make install -j$PARALLEL
 popd
dict.cc
@@ -4,21 +4,24 @@ using namespace arrow;
 
 namespace numbuf {
 
-std::shared_ptr<arrow::StructArray> DictBuilder::Finish(
+Status DictBuilder::Finish(
     std::shared_ptr<Array> key_tuple_data,
     std::shared_ptr<Array> val_list_data,
     std::shared_ptr<Array> val_tuple_data,
-    std::shared_ptr<Array> val_dict_data) {
+    std::shared_ptr<Array> val_dict_data,
+    std::shared_ptr<arrow::Array>* out) {
   // lists and dicts can't be keys of dicts in Python, that is why for
   // the keys we do not need to collect sublists
-  auto keys = keys_.Finish(nullptr, key_tuple_data, nullptr);
-  auto vals = vals_.Finish(val_list_data, val_tuple_data, val_dict_data);
+  std::shared_ptr<Array> keys, vals;
+  RETURN_NOT_OK(keys_.Finish(nullptr, key_tuple_data, nullptr, &keys));
+  RETURN_NOT_OK(vals_.Finish(val_list_data, val_tuple_data, val_dict_data, &vals));
   auto keys_field = std::make_shared<Field>("keys", keys->type());
   auto vals_field = std::make_shared<Field>("vals", vals->type());
   auto type = std::make_shared<StructType>(std::vector<FieldPtr>({keys_field, vals_field}));
   std::vector<ArrayPtr> field_arrays({keys, vals});
   DCHECK(keys->length() == vals->length());
-  return std::make_shared<StructArray>(type, keys->length(), field_arrays);
+  out->reset(new StructArray(type, keys->length(), field_arrays));
+  return Status::OK();
 }
 
 }
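This hunk sets the pattern the rest of the rebase follows: Finish no longer returns the finished array directly; it reports errors through an arrow::Status and hands the result back through an out-parameter. A minimal self-contained sketch of that calling convention, with a stand-in Status type rather than Arrow's:

#include <iostream>
#include <memory>

// Stand-in for arrow::Status, just enough to show the convention.
struct Status {
  bool ok_ = true;
  static Status OK() { return Status{}; }
  bool ok() const { return ok_; }
};

// Error propagation in the style of the RETURN_NOT_OK macro used in the diff.
#define RETURN_NOT_OK(expr)      \
  do {                           \
    Status _s = (expr);          \
    if (!_s.ok()) return _s;     \
  } while (0)

struct Array { long long length = 0; };

// New-style Finish: result via out-parameter, errors via Status.
Status Finish(std::shared_ptr<Array>* out) {
  out->reset(new Array{42});
  return Status::OK();
}

Status BuildSomething() {
  std::shared_ptr<Array> result;
  RETURN_NOT_OK(Finish(&result));  // old style was: auto result = Finish();
  std::cout << result->length << std::endl;
  return Status::OK();
}

int main() { return BuildSomething().ok() ? 0 : 1; }

The same conversion is applied below to SequenceBuilder::Finish, TensorBuilder::Finish, and the serialization entry points.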
dict.h
@@ -33,11 +33,12 @@ public:
       List containing the data from nested dictionaries in the
       value list of the dictionary
   */
-  std::shared_ptr<arrow::StructArray> Finish(
+  arrow::Status Finish(
     std::shared_ptr<arrow::Array> key_tuple_data,
     std::shared_ptr<arrow::Array> val_list_data,
     std::shared_ptr<arrow::Array> val_tuple_data,
-    std::shared_ptr<arrow::Array> val_dict_data);
+    std::shared_ptr<arrow::Array> val_dict_data,
+    std::shared_ptr<arrow::Array>* out);
 
 private:
   SequenceBuilder keys_;
sequence.cc
@@ -5,13 +5,16 @@ using namespace arrow;
 namespace numbuf {
 
 SequenceBuilder::SequenceBuilder(MemoryPool* pool)
-  : pool_(pool), types_(pool), offsets_(pool),
+  : pool_(pool),
+    types_(pool, std::make_shared<Int8Type>()),
+    offsets_(pool, std::make_shared<Int32Type>()),
     nones_(pool, std::make_shared<NullType>()),
     bools_(pool, std::make_shared<BooleanType>()),
-    ints_(pool),
+    ints_(pool, std::make_shared<Int64Type>()),
    bytes_(pool, std::make_shared<BinaryType>()),
    strings_(pool, std::make_shared<StringType>()),
-    floats_(pool), doubles_(pool),
+    floats_(pool, std::make_shared<FloatType>()),
+    doubles_(pool, std::make_shared<DoubleType>()),
    uint8_tensors_(std::make_shared<UInt8Type>(), pool),
    int8_tensors_(std::make_shared<Int8Type>(), pool),
    uint16_tensors_(std::make_shared<UInt16Type>(), pool),
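The constructor change reflects an API change in the rebased Arrow: the primitive builders are now constructed with an explicit DataType instead of having it implied. A compilable sketch of the before and after, using stand-in types rather than Arrow's classes:

#include <memory>
#include <utility>

struct DataType {};
struct Int64Type : DataType {};
struct MemoryPool {};

// Stand-in builder mirroring the new two-argument constructor.
class Int64Builder {
 public:
  Int64Builder(MemoryPool* pool, std::shared_ptr<DataType> type)
      : pool_(pool), type_(std::move(type)) {}
 private:
  MemoryPool* pool_;
  std::shared_ptr<DataType> type_;
};

int main() {
  MemoryPool pool;
  // Before the rebase the type was implied: Int64Builder ints(&pool);
  Int64Builder ints(&pool, std::make_shared<Int64Type>());  // now explicit
}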
@@ -76,6 +79,9 @@ Status SequenceBuilder::AppendDouble(double data) {
 
 #define DEF_TENSOR_APPEND(NAME, TYPE, TAG) \
   Status SequenceBuilder::AppendTensor(const std::vector<int64_t>& dims, TYPE* data) { \
+    if (TAG == -1) { \
+      NAME.Start(); \
+    } \
     UPDATE(NAME.length(), TAG); \
     return NAME.Append(dims, data); \
   }
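The three added lines make each tensor builder start lazily: the first AppendTensor for a given element type calls Start() before appending (the new TensorBuilder::Start definition appears in tensor.cc further down). A self-contained sketch of the guard the macro now generates; the -1 tag convention is taken from the diff, while the stub names and types here are purely illustrative:

#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for a tensor builder with the lazily-called Start().
struct TensorBuilderStub {
  void Start() { std::cout << "allocating child builders\n"; }
  int32_t length() const { return 0; }
  void Append(const std::vector<int64_t>&, const double*) {}
};

int main() {
  TensorBuilderStub double_tensors;   // plays the role of NAME
  int32_t double_tensor_tag = -1;     // plays the role of TAG; -1 = unused so far
  std::vector<int64_t> dims = {2, 2};
  double data[4] = {1, 2, 3, 4};
  if (double_tensor_tag == -1) {      // first tensor of this element type,
    double_tensors.Start();           // so initialize the builder only now
    double_tensor_tag = 0;            // in the real code UPDATE(...) assigns the tag
  }
  double_tensors.Append(dims, data);
}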
@@ -109,11 +115,11 @@ Status SequenceBuilder::AppendDict(int32_t size) {
   return Status::OK();
 }
 
-#define ADD_ELEMENT(VARNAME, TAG) \
-  if (TAG != -1) { \
-    types[TAG] = VARNAME.type(); \
-    children[TAG] = VARNAME.Finish(); \
-    ARROW_CHECK_OK(nones_.AppendToBitmap(true)); \
+#define ADD_ELEMENT(VARNAME, TAG) \
+  if (TAG != -1) { \
+    types[TAG] = std::make_shared<Field>("", VARNAME.type()); \
+    RETURN_NOT_OK(VARNAME.Finish(&children[TAG])); \
+    RETURN_NOT_OK(nones_.AppendToBitmap(true)); \
   }
 
 #define ADD_SUBSEQUENCE(DATA, OFFSETS, BUILDER, TAG, NAME) \
@@ -132,12 +138,13 @@ Status SequenceBuilder::AppendDict(int32_t size) {
   DCHECK(OFFSETS.size() == 1); \
 }
 
-std::shared_ptr<DenseUnionArray> SequenceBuilder::Finish(
+Status SequenceBuilder::Finish(
     std::shared_ptr<Array> list_data,
     std::shared_ptr<Array> tuple_data,
-    std::shared_ptr<Array> dict_data) {
+    std::shared_ptr<Array> dict_data,
+    std::shared_ptr<Array>* out) {
 
-  std::vector<TypePtr> types(num_tags);
+  std::vector<std::shared_ptr<Field>> types(num_tags);
   std::vector<ArrayPtr> children(num_tags);
 
   ADD_ELEMENT(bools_, bool_tag);
@@ -165,11 +172,12 @@ std::shared_ptr<DenseUnionArray> SequenceBuilder::Finish(
   ADD_SUBSEQUENCE(tuple_data, tuple_offsets_, tuple_builder, tuple_tag, "tuple");
   ADD_SUBSEQUENCE(dict_data, dict_offsets_, dict_builder, dict_tag, "dict");
 
-  TypePtr type = TypePtr(new DenseUnionType(types));
-
-  return std::make_shared<DenseUnionArray>(type, types_.length(),
+  std::vector<uint8_t> type_ids = {};
+  TypePtr type = TypePtr(new UnionType(types, type_ids, UnionMode::DENSE));
+  out->reset(new UnionArray(type, types_.length(),
     children, types_.data(), offsets_.data(),
-    nones_.null_count(), nones_.null_bitmap());
+    nones_.null_count(), nones_.null_bitmap()));
+  return Status::OK();
 }
 
 }
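The dedicated DenseUnionArray/DenseUnionType classes are gone in the rebased Arrow; the sequence is now built as a generic UnionArray in UnionMode::DENSE, which stores one int8 type tag and one int32 offset per element plus one child array per element type. A self-contained illustration of that encoding for the heterogeneous sequence [True, 1, 2.5], with plain vectors standing in for the Arrow child arrays:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  enum Tag : int8_t { BOOL = 0, INT = 1, DOUBLE = 2 };
  std::vector<int8_t> type_ids = {BOOL, INT, DOUBLE};  // one tag per element
  std::vector<int32_t> offsets = {0, 0, 0};            // index into that type's child
  std::vector<bool> bools = {true};                    // one child array per type
  std::vector<int64_t> ints = {1};
  std::vector<double> doubles = {2.5};
  // Reading element i back means: look up its tag, then index the child by offset.
  for (std::size_t i = 0; i < type_ids.size(); ++i) {
    switch (type_ids[i]) {
      case BOOL:   std::cout << bools[offsets[i]] << "\n"; break;
      case INT:    std::cout << ints[offsets[i]] << "\n"; break;
      case DOUBLE: std::cout << doubles[offsets[i]] << "\n"; break;
    }
  }
}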
sequence.h
@@ -79,10 +79,11 @@ class SequenceBuilder {
   arrow::Status AppendDict(int32_t size);
 
   //! Finish building the sequence and return the result
-  std::shared_ptr<arrow::DenseUnionArray> Finish(
+  arrow::Status Finish(
     std::shared_ptr<arrow::Array> list_data,
     std::shared_ptr<arrow::Array> tuple_data,
-    std::shared_ptr<arrow::Array> dict_data);
+    std::shared_ptr<arrow::Array> dict_data,
+    std::shared_ptr<arrow::Array>* out);
 
 private:
   arrow::MemoryPool* pool_;
tensor.cc
@@ -6,19 +6,24 @@ namespace numbuf {
 
 template<typename T>
 TensorBuilder<T>::TensorBuilder(const TypePtr& dtype, MemoryPool* pool)
-  : dtype_(dtype) {
-  dim_data_ = std::make_shared<Int64Builder>(pool);
-  dims_ = std::make_shared<ListBuilder>(pool, dim_data_);
-  value_data_ = std::make_shared<PrimitiveBuilder<T>>(pool, dtype);
-  values_ = std::make_shared<ListBuilder>(pool, value_data_);
+  : dtype_(dtype), pool_(pool) {}
+
+template<typename T>
+Status TensorBuilder<T>::Start() {
+  dim_data_ = std::make_shared<Int64Builder>(pool_, std::make_shared<Int64Type>());
+  dims_ = std::make_shared<ListBuilder>(pool_, dim_data_);
+  value_data_ = std::make_shared<PrimitiveBuilder<T>>(pool_, dtype_);
+  values_ = std::make_shared<ListBuilder>(pool_, value_data_);
   auto dims_field = std::make_shared<Field>("dims", dims_->type());
   auto values_field = std::make_shared<Field>("data", values_->type());
   auto type = std::make_shared<StructType>(std::vector<FieldPtr>({dims_field, values_field}));
-  tensors_ = std::make_shared<StructBuilder>(pool, type, std::vector<std::shared_ptr<ArrayBuilder>>({dims_, values_}));
-};
+  tensors_ = std::make_shared<StructBuilder>(pool_, type, std::vector<std::shared_ptr<ArrayBuilder>>({dims_, values_}));
+  return Status::OK();
+}
 
 template<typename T>
 Status TensorBuilder<T>::Append(const std::vector<int64_t>& dims, const elem_type* data) {
+  DCHECK(tensors_);
   RETURN_NOT_OK(tensors_->Append());
   RETURN_NOT_OK(dims_->Append());
   RETURN_NOT_OK(values_->Append());
@@ -32,8 +37,8 @@ Status TensorBuilder<T>::Append(const std::vector<int64_t>& dims, const elem_typ
 }
 
 template<typename T>
-std::shared_ptr<Array> TensorBuilder<T>::Finish() {
-  return tensors_->Finish();
+Status TensorBuilder<T>::Finish(std::shared_ptr<Array>* out) {
+  return tensors_->Finish(out);
 }
 
 template class TensorBuilder<UInt8Type>;
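Splitting the constructor into a trivial constructor plus Start() is two-phase initialization: the builder only records the dtype and memory pool up front, and the child builders are allocated on first use (the call site is the DEF_TENSOR_APPEND guard above). A compilable stand-in sketch of the shape of that change:

#include <memory>

struct Pool {};
struct ChildBuilder { explicit ChildBuilder(Pool*) {} };

class LazyTensorBuilder {
 public:
  explicit LazyTensorBuilder(Pool* pool) : pool_(pool) {}  // cheap, no allocation
  void Start() {  // called lazily before the first Append
    dims_ = std::make_unique<ChildBuilder>(pool_);
    values_ = std::make_unique<ChildBuilder>(pool_);
  }
 private:
  Pool* pool_;
  std::unique_ptr<ChildBuilder> dims_, values_;
};

int main() {
  Pool pool;
  LazyTensorBuilder builder(&pool);  // constructing every tensor builder stays cheap
  builder.Start();                   // only the ones actually used pay for allocation
}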
tensor.h
@@ -18,6 +18,8 @@ public:
   typedef typename T::c_type elem_type;
 
   TensorBuilder(const arrow::TypePtr& dtype, arrow::MemoryPool* pool = nullptr);
 
+  arrow::Status Start();
+
   /*! Append a new tensor.
 
@@ -31,7 +33,7 @@ public:
   arrow::Status Append(const std::vector<int64_t>& dims, const elem_type* data);
 
   //! Convert the tensors to an Arrow StructArray
-  std::shared_ptr<arrow::Array> Finish();
+  arrow::Status Finish(std::shared_ptr<arrow::Array>* out);
 
   //! Number of tensors in the column
   int32_t length() {
@@ -44,6 +46,7 @@ public:
 
 private:
   arrow::TypePtr dtype_;
+  arrow::MemoryPool* pool_;
   std::shared_ptr<arrow::Int64Builder> dim_data_;
   std::shared_ptr<arrow::ListBuilder> dims_;
   std::shared_ptr<arrow::PrimitiveBuilder<T>> value_data_;
python.cc
@@ -166,12 +166,11 @@ Status SerializeSequences(std::vector<PyObject*> sequences, int32_t recursion_de
   if (subdicts.size() > 0) {
     RETURN_NOT_OK(SerializeDict(subdicts, recursion_depth + 1, &dict));
   }
-  *out = builder.Finish(list, tuple, dict);
-  return Status::OK();
+  return builder.Finish(list, tuple, dict, out);
 }
 
 #define DESERIALIZE_SEQUENCE(CREATE, SET_ITEM) \
-  auto data = std::dynamic_pointer_cast<DenseUnionArray>(array); \
+  auto data = std::dynamic_pointer_cast<UnionArray>(array); \
   int32_t size = array->length(); \
   PyObject* result = CREATE(stop_idx - start_idx); \
   auto types = std::make_shared<Int8Array>(size, data->types()); \
@@ -231,7 +230,7 @@ Status SerializeDict(std::vector<PyObject*> dicts, int32_t recursion_depth, std:
   if (val_dicts.size() > 0) {
     RETURN_NOT_OK(SerializeDict(val_dicts, recursion_depth + 1, &val_dict_arr));
   }
-  *out = result.Finish(key_tuples_arr, val_list_arr, val_tuples_arr, val_dict_arr);
+  result.Finish(key_tuples_arr, val_list_arr, val_tuples_arr, val_dict_arr, out);
 
   // This block is used to decrement the reference counts of the results
   // returned by the serialization callback, which is called in SerializeArray
memory.h
@@ -1,22 +1,32 @@
 #ifndef PYNUMBUF_MEMORY_H
 #define PYNUMBUF_MEMORY_H
 
-#include <arrow/ipc/memory.h>
+#include <arrow/io/interfaces.h>
 
 namespace numbuf {
 
-class BufferSource : public arrow::ipc::MemorySource {
+class FixedBufferStream : public arrow::io::OutputStream, public arrow::io::ReadableFileInterface {
 public:
-  virtual ~BufferSource() {}
+  virtual ~FixedBufferStream() {}
 
-  explicit BufferSource(uint8_t* data, int64_t nbytes)
-    : data_(data), size_(nbytes) {}
+  explicit FixedBufferStream(uint8_t* data, int64_t nbytes)
+    : data_(data), position_(0), size_(nbytes) {}
 
-  arrow::Status ReadAt(int64_t position, int64_t nbytes,
-      std::shared_ptr<arrow::Buffer>* out) override {
+  arrow::Status Read(int64_t nbytes, std::shared_ptr<arrow::Buffer>* out) override {
     DCHECK(out);
-    DCHECK(position + nbytes <= size_) << "position: " << position << " nbytes: " << nbytes << "size: " << size_;
-    *out = std::make_shared<arrow::Buffer>(data_ + position, nbytes);
+    DCHECK(position_ + nbytes <= size_) << "position: " << position_ << " nbytes: " << nbytes << "size: " << size_;
+    *out = std::make_shared<arrow::Buffer>(data_ + position_, nbytes);
+    position_ += nbytes;
     return arrow::Status::OK();
   }
 
+  arrow::Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) {
+    assert(0);
+    return arrow::Status::OK();
+  }
+
+  arrow::Status Seek(int64_t position) override {
+    position_ = position;
+    return arrow::Status::OK();
+  }
+
@@ -24,24 +34,35 @@ class BufferSource : public arrow::ipc::MemorySource {
     return arrow::Status::OK();
   }
 
-  arrow::Status Write(int64_t position, const uint8_t* data,
-      int64_t nbytes) override {
-    DCHECK(position >= 0 && position < size_);
-    DCHECK(position + nbytes <= size_) << "position: " << position << " nbytes: " << nbytes << "size: " << size_;
-    uint8_t* dst = data_ + position;
-    memcpy(dst, data, nbytes);
+  arrow::Status Tell(int64_t* position) override {
+    *position = position_;
     return arrow::Status::OK();
   }
 
-  int64_t Size() const override {
-    return size_;
+  arrow::Status Write(const uint8_t* data, int64_t nbytes) override {
+    DCHECK(position_ >= 0 && position_ < size_);
+    DCHECK(position_ + nbytes <= size_) << "position: " << position_ << " nbytes: " << nbytes << "size: " << size_;
+    uint8_t* dst = data_ + position_;
+    memcpy(dst, data, nbytes);
+    position_ += nbytes;
+    return arrow::Status::OK();
+  }
+
+  arrow::Status GetSize(int64_t *size) override {
+    *size = size_;
+    return arrow::Status::OK();
+  }
+
+  bool supports_zero_copy() const override {
+    return true;
   }
 
 private:
   uint8_t* data_;
+  int64_t position_;
   int64_t size_;
 };
 
-} // namespace
+} // namespace numbuf
 
 #endif // PYNUMBUF_MEMORY_H
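FixedBufferStream replaces the positional ReadAt/Write(position, ...) interface of the old MemorySource with a stream cursor: Seek and Tell move and report position_, and Read and Write advance it. A self-contained miniature of the same idea, with assert standing in for DCHECK and no Arrow types:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>

class MiniFixedStream {
 public:
  MiniFixedStream(uint8_t* data, int64_t size) : data_(data), size_(size) {}
  void Write(const void* src, int64_t n) {   // copy in at the cursor, advance
    assert(pos_ + n <= size_);
    std::memcpy(data_ + pos_, src, n);
    pos_ += n;
  }
  void Seek(int64_t pos) { pos_ = pos; }     // move the cursor
  int64_t Tell() const { return pos_; }      // report the cursor
  void Read(void* dst, int64_t n) {          // copy out at the cursor, advance
    assert(pos_ + n <= size_);
    std::memcpy(dst, data_ + pos_, n);
    pos_ += n;
  }
 private:
  uint8_t* data_;   // caller-owned, fixed-size region, as in the diff
  int64_t pos_ = 0;
  int64_t size_;
};

int main() {
  uint8_t buffer[16] = {};
  MiniFixedStream stream(buffer, sizeof(buffer));
  int32_t value = 42;
  stream.Write(&value, sizeof(value));
  stream.Seek(0);
  int32_t back = 0;
  stream.Read(&back, sizeof(back));
  std::cout << back << "\n";  // prints 42
}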
numbuf.cc
@@ -1,6 +1,5 @@
 #include <Python.h>
 #include <arrow/api.h>
-#include <arrow/ipc/memory.h>
 #include <arrow/ipc/adapter.h>
 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 #define PY_ARRAY_UNIQUE_SYMBOL NUMBUF_ARRAY_API
@@ -16,10 +15,10 @@
 using namespace arrow;
 using namespace numbuf;
 
-std::shared_ptr<RowBatch> make_row_batch(std::shared_ptr<Array> data) {
+std::shared_ptr<RecordBatch> make_row_batch(std::shared_ptr<Array> data) {
   auto field = std::make_shared<Field>("list", data->type());
   std::shared_ptr<Schema> schema(new Schema({field}));
-  return std::shared_ptr<RowBatch>(new RowBatch(schema, data->length(), {data}));
+  return std::shared_ptr<RecordBatch>(new RecordBatch(schema, data->length(), {data}));
 }
 
 extern "C" {
@@ -29,9 +28,9 @@ static PyObject *NumbufError;
 PyObject *numbuf_serialize_callback = NULL;
 PyObject *numbuf_deserialize_callback = NULL;
 
-int PyObjectToArrow(PyObject* object, std::shared_ptr<RowBatch> **result) {
+int PyObjectToArrow(PyObject* object, std::shared_ptr<RecordBatch> **result) {
   if (PyCapsule_IsValid(object, "arrow")) {
-    *result = reinterpret_cast<std::shared_ptr<RowBatch>*>(PyCapsule_GetPointer(object, "arrow"));
+    *result = reinterpret_cast<std::shared_ptr<RecordBatch>*>(PyCapsule_GetPointer(object, "arrow"));
     return 1;
   } else {
     PyErr_SetString(PyExc_TypeError, "must be an 'arrow' capsule");
@@ -40,7 +39,7 @@ int PyObjectToArrow(PyObject* object, std::shared_ptr<RowBatch> **result) {
 }
 
 static void ArrowCapsule_Destructor(PyObject* capsule) {
-  delete reinterpret_cast<std::shared_ptr<RowBatch>*>(PyCapsule_GetPointer(capsule, "arrow"));
+  delete reinterpret_cast<std::shared_ptr<RecordBatch>*>(PyCapsule_GetPointer(capsule, "arrow"));
 }
 
 /* Documented in doc/numbuf.rst in ray-core */
@@ -62,11 +61,11 @@ static PyObject* serialize_list(PyObject* self, PyObject* args) {
     return NULL;
   }
 
-  auto batch = new std::shared_ptr<RowBatch>();
+  auto batch = new std::shared_ptr<RecordBatch>();
   *batch = make_row_batch(array);
 
   int64_t size = 0;
-  ARROW_CHECK_OK(arrow::ipc::GetRowBatchSize(batch->get(), &size));
+  ARROW_CHECK_OK(arrow::ipc::GetRecordBatchSize(batch->get(), &size));
 
   std::shared_ptr<Buffer> buffer;
   ARROW_CHECK_OK(ipc::WriteSchema((*batch)->schema().get(), &buffer));
@@ -84,7 +83,7 @@ static PyObject* serialize_list(PyObject* self, PyObject* args) {
 
 /* Documented in doc/numbuf.rst in ray-core */
 static PyObject* write_to_buffer(PyObject* self, PyObject* args) {
-  std::shared_ptr<RowBatch>* batch;
+  std::shared_ptr<RecordBatch>* batch;
   PyObject* memoryview;
   if (!PyArg_ParseTuple(args, "O&O", &PyObjectToArrow, &batch, &memoryview)) {
     return NULL;
@@ -93,10 +92,11 @@ static PyObject* write_to_buffer(PyObject* self, PyObject* args) {
     return NULL;
   }
   Py_buffer* buffer = PyMemoryView_GET_BUFFER(memoryview);
-  auto target = std::make_shared<BufferSource>(reinterpret_cast<uint8_t*>(buffer->buf), buffer->len);
-  int64_t metadata_offset;
-  ARROW_CHECK_OK(ipc::WriteRowBatch(target.get(), batch->get(), 0, &metadata_offset));
-  return PyInt_FromLong(metadata_offset);
+  auto target = std::make_shared<FixedBufferStream>(reinterpret_cast<uint8_t*>(buffer->buf), buffer->len);
+  int64_t body_end_offset;
+  int64_t header_end_offset;
+  ARROW_CHECK_OK(ipc::WriteRecordBatch((*batch)->columns(), (*batch)->num_rows(), target.get(), &body_end_offset, &header_end_offset));
+  return PyInt_FromLong(header_end_offset);
 }
 
 /* Documented in doc/numbuf.rst in ray-core */
@@ -118,11 +118,11 @@ static PyObject* read_from_buffer(PyObject* self, PyObject* args) {
   ARROW_CHECK_OK(schema_msg->GetSchema(&schema));
 
   Py_buffer* buffer = PyMemoryView_GET_BUFFER(memoryview);
-  auto source = std::make_shared<BufferSource>(reinterpret_cast<uint8_t*>(buffer->buf), buffer->len);
-  std::shared_ptr<arrow::ipc::RowBatchReader> reader;
-  ARROW_CHECK_OK(arrow::ipc::RowBatchReader::Open(source.get(), metadata_offset, &reader));
-  auto batch = new std::shared_ptr<arrow::RowBatch>();
-  ARROW_CHECK_OK(reader->GetRowBatch(schema, batch));
+  auto source = std::make_shared<FixedBufferStream>(reinterpret_cast<uint8_t*>(buffer->buf), buffer->len);
+  std::shared_ptr<arrow::ipc::RecordBatchReader> reader;
+  ARROW_CHECK_OK(arrow::ipc::RecordBatchReader::Open(source.get(), metadata_offset, &reader));
+  auto batch = new std::shared_ptr<arrow::RecordBatch>();
+  ARROW_CHECK_OK(reader->GetRecordBatch(schema, batch));
 
   return PyCapsule_New(reinterpret_cast<void*>(batch),
     "arrow", &ArrowCapsule_Destructor);
@@ -130,7 +130,7 @@ static PyObject* read_from_buffer(PyObject* self, PyObject* args) {
 
 /* Documented in doc/numbuf.rst in ray-core */
 static PyObject* deserialize_list(PyObject* self, PyObject* args) {
-  std::shared_ptr<RowBatch>* data;
+  std::shared_ptr<RecordBatch>* data;
   PyObject* base = Py_None;
   if (!PyArg_ParseTuple(args, "O&|O", &PyObjectToArrow, &data, &base)) {
     return NULL;
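Throughout this file the batch crosses the C/Python boundary inside a PyCapsule: serialize_list heap-allocates a shared_ptr<RecordBatch>, wraps the raw pointer in a capsule, and ArrowCapsule_Destructor deletes it when the capsule is collected. A self-contained sketch of that ownership pattern, with a stand-in Capsule type in place of the CPython API:

#include <iostream>
#include <memory>

struct RecordBatchStub { int num_rows = 3; };

using CapsuleDestructor = void (*)(void*);

// Stand-in for PyCapsule: an opaque pointer plus a destructor callback.
class Capsule {
 public:
  Capsule(void* pointer, CapsuleDestructor destructor)
      : pointer_(pointer), destructor_(destructor) {}
  ~Capsule() { destructor_(pointer_); }  // Python's GC would trigger this
  void* pointer() const { return pointer_; }
 private:
  void* pointer_;
  CapsuleDestructor destructor_;
};

void ArrowCapsule_Destructor(void* p) {
  delete static_cast<std::shared_ptr<RecordBatchStub>*>(p);
}

int main() {
  auto* batch = new std::shared_ptr<RecordBatchStub>(new RecordBatchStub());
  Capsule capsule(batch, &ArrowCapsule_Destructor);
  auto* inside = static_cast<std::shared_ptr<RecordBatchStub>*>(capsule.pointer());
  std::cout << (*inside)->num_rows << "\n";
}  // capsule destructs here and frees the shared_ptr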
setup.py (10 lines changed)
@@ -1,6 +1,13 @@
 import subprocess
 from setuptools import setup, find_packages, Extension
 import setuptools.command.install as _install
+from sys import platform
+
+extension = ""
+if platform == "linux" or platform == "linux2":
+    extension = ".so"
+elif platform == "darwin":
+    extension = ".dylib"
 
 # Because of relative paths, this must be run from inside numbuf/.
 
@@ -8,7 +15,6 @@ class install(_install.install):
     def run(self):
         subprocess.check_call(["./setup.sh"])
         subprocess.check_call(["./build.sh"])
-        subprocess.check_call(["cp", "libnumbuf.so", "numbuf/"])
         # Calling _install.install.run(self) does not fetch required packages and
         # instead performs an old-style install. See command/install.py in
         # setuptools. So, calling do_egg_install() manually here.
@@ -17,7 +23,7 @@ class install(_install.install):
 setup(name="numbuf",
       version="0.0.1",
       packages=find_packages(),
-      package_data={"numbuf": ["libnumbuf.so"]},
+      package_data={"numbuf": ["libnumbuf.so", "libarrow" + extension, "libarrow_io" + extension, "libarrow_ipc" + extension]},
       cmdclass={"install": install},
       setup_requires=["numpy"],
       include_package_data=True,
thirdparty/arrow (vendored submodule, 2 lines changed)
@@ -1 +1 @@
-Subproject commit 1407126ec0956b6dc6f9873bdeeef4d042ca032f
+Subproject commit 58bd7bedc63d66d5898297bab25b54dfb67665db
thirdparty/build_thirdparty.sh (vendored, 3 lines changed)
@ -20,8 +20,7 @@ fi
|
|||
|
||||
echo "building arrow"
|
||||
cd $TP_DIR/arrow/cpp
|
||||
source setup_build_env.sh
|
||||
mkdir -p $TP_DIR/arrow/cpp/build
|
||||
cd $TP_DIR/arrow/cpp/build
|
||||
cmake -DLIBARROW_LINKAGE=STATIC -DCMAKE_BUILD_TYPE=Release ..
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-g" -DCMAKE_CXX_FLAGS="-g" ..
|
||||
make VERBOSE=1 -j$PARALLEL
|
||||
|
|