Merge pull request #48 from ray-project/numbuf-integration

Numbuf integration
Robert Nishihara 2016-11-19 18:10:53 -08:00 committed by GitHub
commit c9757a6fd0
29 changed files with 1892 additions and 5 deletions

View file

@@ -52,6 +52,10 @@ matrix:
 install:
 - ./install-dependencies.sh
 - ./build.sh
+- cd numbuf
+- sudo python setup.py install
+- cd ..
 - cd src/common/lib/python
 - sudo python setup.py install
@@ -62,6 +66,8 @@ install:
 - cd ../..
 script:
+- python numbuf/python/test/runtest.py
 - python src/common/test/test.py
 - python src/plasma/test/test.py
 - python src/photon/test/test.py

View file

@@ -1,6 +1,6 @@
 # Installation on Mac OS X
-Ray should work with Python 2 and Python 3. We have tested Ray on OS X 10.11.
+Ray should work with Python 2. We have tested Ray on OS X 10.11.
 ## Dependencies
@@ -14,7 +14,7 @@ sudo easy_install pip # If you're using Anaconda, then this is unnecessary.
 pip install numpy funcsigs colorama psutil redis --ignore-installed six
 pip install --upgrade git+git://github.com/cloudpipe/cloudpickle.git@0d225a4695f1f65ae1cbb2e0bbc145e10167cce4 # We use the latest version of cloudpickle because it can serialize named tuples.
-pip install --upgrade --verbose git+git://github.com/ray-project/numbuf.git@488f881d708bc54e86ed375ee97aa94540808fa1
+pip install --upgrade --verbose "git+git://github.com/ray-project/ray.git#egg=ray&subdirectory=numbuf"
 ```
 # Install Ray

View file

@@ -1,6 +1,6 @@
 # Installation on Ubuntu
-Ray should work with Python 2 and Python 3. We have tested Ray on Ubuntu 14.04
+Ray should work with Python 2. We have tested Ray on Ubuntu 14.04
 and Ubuntu 16.04
 ## Dependencies
@@ -14,7 +14,7 @@ sudo apt-get install -y cmake build-essential autoconf curl libtool python-dev p
 pip install numpy funcsigs colorama psutil redis
 pip install --upgrade git+git://github.com/cloudpipe/cloudpickle.git@0d225a4695f1f65ae1cbb2e0bbc145e10167cce4 # We use the latest version of cloudpickle because it can serialize named tuples.
-pip install --upgrade --verbose git+git://github.com/ray-project/numbuf.git@488f881d708bc54e86ed375ee97aa94540808fa1
+pip install --upgrade --verbose "git+git://github.com/ray-project/ray.git#egg=ray&subdirectory=numbuf"
 ```
 # Install Ray

View file

@@ -37,4 +37,3 @@ elif [[ $platform == "macosx" ]]; then
 fi
 sudo pip install --upgrade git+git://github.com/cloudpipe/cloudpickle.git@0d225a4695f1f65ae1cbb2e0bbc145e10167cce4 # We use the latest version of cloudpickle because it can serialize named tuples.
-sudo pip install --upgrade --verbose git+git://github.com/ray-project/numbuf.git@488f881d708bc54e86ed375ee97aa94540808fa1

65 numbuf/.clang-format Normal file
View file

@@ -0,0 +1,65 @@
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: false
AlignConsecutiveAssignments: false
AlignEscapedNewlinesLeft: true
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
ColumnLimit: 90
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1000
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never

113 numbuf/CMakeLists.txt Normal file
View file

@@ -0,0 +1,113 @@
cmake_minimum_required(VERSION 2.8)
project(numbuf)
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
# Make libnumbuf.so look for shared libraries in the folder libnumbuf.so is in
set(CMAKE_INSTALL_RPATH "$ORIGIN/")
set(CMAKE_MACOSX_RPATH 1)
if(NOT APPLE)
find_package(PythonInterp REQUIRED)
find_package(PythonLibs REQUIRED)
set(CUSTOM_PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE})
else()
find_program(CUSTOM_PYTHON_EXECUTABLE python)
message("-- Found Python program: ${CUSTOM_PYTHON_EXECUTABLE}")
execute_process(COMMAND ${CUSTOM_PYTHON_EXECUTABLE} -c
"import sys; print 'python' + sys.version[0:3]"
OUTPUT_VARIABLE PYTHON_LIBRARY_NAME OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND ${CUSTOM_PYTHON_EXECUTABLE} -c
"import sys; print sys.exec_prefix"
OUTPUT_VARIABLE PYTHON_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE)
FIND_LIBRARY(PYTHON_LIBRARIES
NAMES ${PYTHON_LIBRARY_NAME}
HINTS "${PYTHON_PREFIX}"
PATH_SUFFIXES "lib" "libs"
NO_DEFAULT_PATH)
execute_process(COMMAND ${CUSTOM_PYTHON_EXECUTABLE} -c
"from distutils.sysconfig import *; print get_python_inc()"
OUTPUT_VARIABLE PYTHON_INCLUDE_DIRS OUTPUT_STRIP_TRAILING_WHITESPACE)
if(PYTHON_LIBRARIES AND PYTHON_INCLUDE_DIRS)
SET(PYTHONLIBS_FOUND TRUE)
message("-- Found PythonLibs: " ${PYTHON_LIBRARIES})
message("-- -- Used custom search path")
else()
find_package(PythonLibs REQUIRED)
message("-- -- Used find_package(PythonLibs)")
endif()
endif()
find_package(NumPy REQUIRED)
if(APPLE)
SET(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
endif(APPLE)
include_directories("${PYTHON_INCLUDE_DIRS}")
include_directories("${NUMPY_INCLUDE_DIR}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if (UNIX AND NOT APPLE)
link_libraries(rt)
endif()
set(ARROW_DIR "${CMAKE_SOURCE_DIR}/thirdparty/arrow/" CACHE STRING
"Path of the arrow source directory")
if (APPLE)
set(ARROW_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow.dylib" CACHE STRING
"Path to libarrow.dylib (needs to be changed if arrow is built in debug mode)")
set(ARROW_IO_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow_io.dylib" CACHE STRING
"Path to libarrow_io.dylib (needs to be changed if arrow is built in debug mode)")
set(ARROW_IPC_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow_ipc.dylib" CACHE STRING
"Path to libarrow_ipc.dylib (needs to be changed if arrow is built in debug mode)")
else()
set(ARROW_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow.so" CACHE STRING
"Path to libarrow.so (needs to be changed if arrow is built in debug mode)")
set(ARROW_IO_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow_io.so" CACHE STRING
"Path to libarrow_io.so (needs to be changed if arrow is built in debug mode)")
set(ARROW_IPC_LIB "${CMAKE_SOURCE_DIR}/thirdparty/arrow/cpp/build/release/libarrow_ipc.so" CACHE STRING
"Path to libarrow_ipc.so (needs to be changed if arrow is built in debug mode)")
endif()
include_directories("${ARROW_DIR}/cpp/src/")
include_directories("cpp/src/")
include_directories("python/src/")
add_definitions(-fPIC)
add_library(numbuf SHARED
cpp/src/numbuf/tensor.cc
cpp/src/numbuf/dict.cc
cpp/src/numbuf/sequence.cc
python/src/pynumbuf/numbuf.cc
python/src/pynumbuf/adapters/numpy.cc
python/src/pynumbuf/adapters/python.cc)
get_filename_component(PYTHON_SHARED_LIBRARY ${PYTHON_LIBRARIES} NAME)
if(APPLE)
add_custom_command(TARGET numbuf
POST_BUILD COMMAND
${CMAKE_INSTALL_NAME_TOOL} -change ${PYTHON_SHARED_LIBRARY} ${PYTHON_LIBRARIES} libnumbuf.so)
add_custom_command(TARGET numbuf
POST_BUILD COMMAND
${CMAKE_INSTALL_NAME_TOOL} -change "@rpath/libarrow.dylib" "@loader_path/libarrow.dylib" libnumbuf.so)
add_custom_command(TARGET numbuf
POST_BUILD COMMAND
${CMAKE_INSTALL_NAME_TOOL} -change "@rpath/libarrow_io.dylib" "@loader_path/libarrow_io.dylib" libnumbuf.so)
add_custom_command(TARGET numbuf
POST_BUILD COMMAND
${CMAKE_INSTALL_NAME_TOOL} -change "@rpath/libarrow_ipc.dylib" "@loader_path/libarrow_ipc.dylib" libnumbuf.so)
endif(APPLE)
target_link_libraries(numbuf ${ARROW_LIB} ${ARROW_IO_LIB} ${ARROW_IPC_LIB} ${PYTHON_LIBRARIES})
install(TARGETS numbuf DESTINATION ${CMAKE_SOURCE_DIR}/numbuf/)
install(FILES ${ARROW_LIB} ${ARROW_IO_LIB} ${ARROW_IPC_LIB} DESTINATION ${CMAKE_SOURCE_DIR}/numbuf/)

20 numbuf/build.sh Executable file
View file

@@ -0,0 +1,20 @@
#!/usr/bin/env bash
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
# Determine how many parallel jobs to use for make based on the number of cores
unamestr="$(uname)"
if [[ "$unamestr" == "Linux" ]]; then
PARALLEL=$(nproc)
elif [[ "$unamestr" == "Darwin" ]]; then
PARALLEL=$(sysctl -n hw.ncpu)
else
echo "Unrecognized platform."
exit 1
fi
mkdir -p "$ROOT_DIR/build"
pushd "$ROOT_DIR/build"
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-g" -DCMAKE_CXX_FLAGS="-g" ..
make install -j$PARALLEL
popd

View file

@@ -0,0 +1,54 @@
# - Find the NumPy libraries
# This module finds if NumPy is installed, and sets the following variables
# indicating where it is.
#
#
# NUMPY_FOUND - was NumPy found
# NUMPY_VERSION - the version of NumPy found as a string
# NUMPY_VERSION_MAJOR - the major version number of NumPy
# NUMPY_VERSION_MINOR - the minor version number of NumPy
# NUMPY_VERSION_PATCH - the patch version number of NumPy
# NUMPY_VERSION_DECIMAL - e.g. version 1.6.1 is 10601
# NUMPY_INCLUDE_DIR - path to the NumPy include files
unset(NUMPY_VERSION)
unset(NUMPY_INCLUDE_DIR)
if(NOT "${CUSTOM_PYTHON_EXECUTABLE}" STREQUAL "CUSTOM_PYTHON_EXECUTABLE-NOTFOUND")
execute_process(COMMAND "${CUSTOM_PYTHON_EXECUTABLE}" "-c"
"import numpy as n; print(n.__version__); print(n.get_include());"
RESULT_VARIABLE __result
OUTPUT_VARIABLE __output
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(__result MATCHES 0)
string(REGEX REPLACE ";" "\\\\;" __values ${__output})
string(REGEX REPLACE "\r?\n" ";" __values ${__values})
list(GET __values 0 NUMPY_VERSION)
list(GET __values 1 NUMPY_INCLUDE_DIR)
string(REGEX MATCH "^([0-9])+\\.([0-9])+\\.([0-9])+" __ver_check "${NUMPY_VERSION}")
if(NOT "${__ver_check}" STREQUAL "")
set(NUMPY_VERSION_MAJOR ${CMAKE_MATCH_1})
set(NUMPY_VERSION_MINOR ${CMAKE_MATCH_2})
set(NUMPY_VERSION_PATCH ${CMAKE_MATCH_3})
math(EXPR NUMPY_VERSION_DECIMAL
"(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}")
string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIR ${NUMPY_INCLUDE_DIR})
else()
unset(NUMPY_VERSION)
unset(NUMPY_INCLUDE_DIR)
message(STATUS "Requested NumPy version and include path, but got instead:\n${__output}\n")
endif()
endif()
else()
message(STATUS "To find NumPy Python executable is required to be found.")
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NumPy REQUIRED_VARS NUMPY_INCLUDE_DIR NUMPY_VERSION
VERSION_VAR NUMPY_VERSION)
if(NUMPY_FOUND)
message(STATUS "NumPy ver. ${NUMPY_VERSION} found (include: ${NUMPY_INCLUDE_DIR})")
endif()

View file

@@ -0,0 +1,24 @@
#include "dict.h"
using namespace arrow;
namespace numbuf {
Status DictBuilder::Finish(std::shared_ptr<Array> key_tuple_data,
std::shared_ptr<Array> val_list_data, std::shared_ptr<Array> val_tuple_data,
std::shared_ptr<Array> val_dict_data, std::shared_ptr<arrow::Array>* out) {
// Lists and dicts can't be keys of dicts in Python, which is why
// we do not need to collect sublists for the keys.
std::shared_ptr<Array> keys, vals;
RETURN_NOT_OK(keys_.Finish(nullptr, key_tuple_data, nullptr, &keys));
RETURN_NOT_OK(vals_.Finish(val_list_data, val_tuple_data, val_dict_data, &vals));
auto keys_field = std::make_shared<Field>("keys", keys->type());
auto vals_field = std::make_shared<Field>("vals", vals->type());
auto type =
std::make_shared<StructType>(std::vector<FieldPtr>({keys_field, vals_field}));
std::vector<ArrayPtr> field_arrays({keys, vals});
DCHECK(keys->length() == vals->length());
out->reset(new StructArray(type, keys->length(), field_arrays));
return Status::OK();
}
}

View file

@@ -0,0 +1,46 @@
#ifndef NUMBUF_DICT_H
#define NUMBUF_DICT_H
#include <arrow/api.h>
#include "sequence.h"
namespace numbuf {
/*! Constructing dictionaries of key/value pairs. Sequences of
keys and values are built separately using a pair of
SequenceBuilders. The resulting Arrow representation
can be obtained via the Finish method.
*/
class DictBuilder {
public:
DictBuilder(arrow::MemoryPool* pool = nullptr) : keys_(pool), vals_(pool) {}
//! Builder for the keys of the dictionary
SequenceBuilder& keys() { return keys_; }
//! Builder for the values of the dictionary
SequenceBuilder& vals() { return vals_; }
/*! Construct an Arrow StructArray representing the dictionary.
    Contains a field "keys" for the keys and "vals" for the values.
    \param key_tuple_data
       List containing the data from nested tuples in the key
       list of the dictionary
    \param val_list_data
       List containing the data from nested lists in the value
       list of the dictionary
    \param val_tuple_data
       List containing the data from nested tuples in the value
       list of the dictionary
    \param val_dict_data
       List containing the data from nested dictionaries in the
       value list of the dictionary
*/
arrow::Status Finish(std::shared_ptr<arrow::Array> key_tuple_data,
std::shared_ptr<arrow::Array> val_list_data,
std::shared_ptr<arrow::Array> val_tuple_data,
std::shared_ptr<arrow::Array> val_dict_data, std::shared_ptr<arrow::Array>* out);
private:
SequenceBuilder keys_;
SequenceBuilder vals_;
};
}
#endif
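A minimal usage sketch for DictBuilder (illustrative only, not part of this commit; it assumes Arrow's RETURN_NOT_OK macro and the headers above are available on the include path):

```
#include <arrow/api.h>
#include "dict.h"

// Sketch: encode the Python dict {1: 2.5} as an Arrow StructArray.
arrow::Status EncodeDictExample(std::shared_ptr<arrow::Array>* out) {
  numbuf::DictBuilder builder;
  // Keys and values are appended through the two underlying SequenceBuilders.
  RETURN_NOT_OK(builder.keys().AppendInt64(1));
  RETURN_NOT_OK(builder.vals().AppendDouble(2.5));
  // No tuple keys and no nested lists/tuples/dicts among the values,
  // so all four auxiliary data arrays passed to Finish are null.
  return builder.Finish(nullptr, nullptr, nullptr, nullptr, out);
}
```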

View file

@@ -0,0 +1,178 @@
#include "sequence.h"
using namespace arrow;
namespace numbuf {
SequenceBuilder::SequenceBuilder(MemoryPool* pool)
: pool_(pool),
types_(pool, std::make_shared<Int8Type>()),
offsets_(pool, std::make_shared<Int32Type>()),
nones_(pool, std::make_shared<NullType>()),
bools_(pool, std::make_shared<BooleanType>()),
ints_(pool, std::make_shared<Int64Type>()),
bytes_(pool, std::make_shared<BinaryType>()),
strings_(pool, std::make_shared<StringType>()),
floats_(pool, std::make_shared<FloatType>()),
doubles_(pool, std::make_shared<DoubleType>()),
uint8_tensors_(std::make_shared<UInt8Type>(), pool),
int8_tensors_(std::make_shared<Int8Type>(), pool),
uint16_tensors_(std::make_shared<UInt16Type>(), pool),
int16_tensors_(std::make_shared<Int16Type>(), pool),
uint32_tensors_(std::make_shared<UInt32Type>(), pool),
int32_tensors_(std::make_shared<Int32Type>(), pool),
uint64_tensors_(std::make_shared<UInt64Type>(), pool),
int64_tensors_(std::make_shared<Int64Type>(), pool),
float_tensors_(std::make_shared<FloatType>(), pool),
double_tensors_(std::make_shared<DoubleType>(), pool),
list_offsets_({0}),
tuple_offsets_({0}),
dict_offsets_({0}) {}
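// UPDATE lazily assigns a union tag the first time a given type is
// appended; it then records this element's offset into that type's
// child builder, the tag itself, and a "valid" bit in the null bitmap.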
#define UPDATE(OFFSET, TAG) \
if (TAG == -1) { \
TAG = num_tags; \
num_tags += 1; \
} \
RETURN_NOT_OK(offsets_.Append(OFFSET)); \
RETURN_NOT_OK(types_.Append(TAG)); \
RETURN_NOT_OK(nones_.AppendToBitmap(true));
Status SequenceBuilder::AppendNone() {
RETURN_NOT_OK(offsets_.Append(0));
RETURN_NOT_OK(types_.Append(0));
return nones_.AppendToBitmap(false);
}
Status SequenceBuilder::AppendBool(bool data) {
UPDATE(bools_.length(), bool_tag);
return bools_.Append(data);
}
Status SequenceBuilder::AppendInt64(int64_t data) {
UPDATE(ints_.length(), int_tag);
return ints_.Append(data);
}
Status SequenceBuilder::AppendUInt64(uint64_t data) {
UPDATE(ints_.length(), int_tag);
return ints_.Append(data);
}
Status SequenceBuilder::AppendBytes(const uint8_t* data, int32_t length) {
UPDATE(bytes_.length(), bytes_tag);
return bytes_.Append(data, length);
}
Status SequenceBuilder::AppendString(const char* data, int32_t length) {
UPDATE(strings_.length(), string_tag);
return strings_.Append(data, length);
}
Status SequenceBuilder::AppendFloat(float data) {
UPDATE(floats_.length(), float_tag);
return floats_.Append(data);
}
Status SequenceBuilder::AppendDouble(double data) {
UPDATE(doubles_.length(), double_tag);
return doubles_.Append(data);
}
#define DEF_TENSOR_APPEND(NAME, TYPE, TAG) \
Status SequenceBuilder::AppendTensor(const std::vector<int64_t>& dims, TYPE* data) { \
if (TAG == -1) { NAME.Start(); } \
UPDATE(NAME.length(), TAG); \
return NAME.Append(dims, data); \
}
DEF_TENSOR_APPEND(uint8_tensors_, uint8_t, uint8_tensor_tag);
DEF_TENSOR_APPEND(int8_tensors_, int8_t, int8_tensor_tag);
DEF_TENSOR_APPEND(uint16_tensors_, uint16_t, uint16_tensor_tag);
DEF_TENSOR_APPEND(int16_tensors_, int16_t, int16_tensor_tag);
DEF_TENSOR_APPEND(uint32_tensors_, uint32_t, uint32_tensor_tag);
DEF_TENSOR_APPEND(int32_tensors_, int32_t, int32_tensor_tag);
DEF_TENSOR_APPEND(uint64_tensors_, uint64_t, uint64_tensor_tag);
DEF_TENSOR_APPEND(int64_tensors_, int64_t, int64_tensor_tag);
DEF_TENSOR_APPEND(float_tensors_, float, float_tensor_tag);
DEF_TENSOR_APPEND(double_tensors_, double, double_tensor_tag);
Status SequenceBuilder::AppendList(int32_t size) {
UPDATE(list_offsets_.size() - 1, list_tag);
list_offsets_.push_back(list_offsets_.back() + size);
return Status::OK();
}
Status SequenceBuilder::AppendTuple(int32_t size) {
UPDATE(tuple_offsets_.size() - 1, tuple_tag);
tuple_offsets_.push_back(tuple_offsets_.back() + size);
return Status::OK();
}
Status SequenceBuilder::AppendDict(int32_t size) {
UPDATE(dict_offsets_.size() - 1, dict_tag);
dict_offsets_.push_back(dict_offsets_.back() + size);
return Status::OK();
}
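// ADD_ELEMENT finishes one child builder and registers its array and
// field under that child's tag. ADD_SUBSEQUENCE wraps the flattened
// data of nested lists/tuples/dicts in a ListArray (rebuilt from the
// recorded offsets) inside a single-field StructArray.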
#define ADD_ELEMENT(VARNAME, TAG) \
if (TAG != -1) { \
types[TAG] = std::make_shared<Field>("", VARNAME.type()); \
RETURN_NOT_OK(VARNAME.Finish(&children[TAG])); \
RETURN_NOT_OK(nones_.AppendToBitmap(true)); \
}
#define ADD_SUBSEQUENCE(DATA, OFFSETS, BUILDER, TAG, NAME) \
if (DATA) { \
DCHECK(DATA->length() == OFFSETS.back()); \
auto list_builder = std::make_shared<ListBuilder>(pool_, DATA); \
auto field = std::make_shared<Field>(NAME, list_builder->type()); \
auto type = std::make_shared<StructType>(std::vector<FieldPtr>({field})); \
auto lists = std::vector<std::shared_ptr<ArrayBuilder>>({list_builder}); \
StructBuilder builder(pool_, type, lists); \
OFFSETS.pop_back(); \
ARROW_CHECK_OK(list_builder->Append(OFFSETS.data(), OFFSETS.size())); \
builder.Append(); \
ADD_ELEMENT(builder, TAG); \
} else { \
DCHECK(OFFSETS.size() == 1); \
}
Status SequenceBuilder::Finish(std::shared_ptr<Array> list_data,
std::shared_ptr<Array> tuple_data, std::shared_ptr<Array> dict_data,
std::shared_ptr<Array>* out) {
std::vector<std::shared_ptr<Field>> types(num_tags);
std::vector<ArrayPtr> children(num_tags);
ADD_ELEMENT(bools_, bool_tag);
ADD_ELEMENT(ints_, int_tag);
ADD_ELEMENT(strings_, string_tag);
ADD_ELEMENT(bytes_, bytes_tag);
ADD_ELEMENT(floats_, float_tag);
ADD_ELEMENT(doubles_, double_tag);
ADD_ELEMENT(uint8_tensors_, uint8_tensor_tag);
ADD_ELEMENT(int8_tensors_, int8_tensor_tag);
ADD_ELEMENT(uint16_tensors_, uint16_tensor_tag);
ADD_ELEMENT(int16_tensors_, int16_tensor_tag);
ADD_ELEMENT(uint32_tensors_, uint32_tensor_tag);
ADD_ELEMENT(int32_tensors_, int32_tensor_tag);
ADD_ELEMENT(uint64_tensors_, uint64_tensor_tag);
ADD_ELEMENT(int64_tensors_, int64_tensor_tag);
ADD_ELEMENT(float_tensors_, float_tensor_tag);
ADD_ELEMENT(double_tensors_, double_tensor_tag);
ADD_SUBSEQUENCE(list_data, list_offsets_, list_builder, list_tag, "list");
ADD_SUBSEQUENCE(tuple_data, tuple_offsets_, tuple_builder, tuple_tag, "tuple");
ADD_SUBSEQUENCE(dict_data, dict_offsets_, dict_builder, dict_tag, "dict");
std::vector<uint8_t> type_ids = {};
TypePtr type = TypePtr(new UnionType(types, type_ids, UnionMode::DENSE));
out->reset(new UnionArray(type, types_.length(), children, types_.data(),
offsets_.data(), nones_.null_count(), nones_.null_bitmap()));
return Status::OK();
}
}

View file

@@ -0,0 +1,142 @@
#ifndef NUMBUF_LIST_H
#define NUMBUF_LIST_H
#include "tensor.h"
#include <arrow/api.h>
#include <arrow/types/union.h>
namespace numbuf {
/*! A Sequence is a heterogeneous collection of elements. It can contain
scalar Python types, lists, tuples, dictionaries and tensors.
*/
class SequenceBuilder {
public:
SequenceBuilder(arrow::MemoryPool* pool = nullptr);
//! Appending None to the sequence
arrow::Status AppendNone();
//! Appending a boolean to the sequence
arrow::Status AppendBool(bool data);
//! Appending an int64_t to the sequence
arrow::Status AppendInt64(int64_t data);
//! Appending a uint64_t to the sequence
arrow::Status AppendUInt64(uint64_t data);
//! Append a list of bytes to the sequence
arrow::Status AppendBytes(const uint8_t* data, int32_t length);
//! Appending a string to the sequence
arrow::Status AppendString(const char* data, int32_t length);
//! Appending a float to the sequence
arrow::Status AppendFloat(float data);
//! Appending a double to the sequence
arrow::Status AppendDouble(double data);
/*! Appending a tensor to the sequence
\param dims
A vector of dimensions
\param data
A pointer to the start of the data block. The length of the data block
will be the product of the dimensions
*/
arrow::Status AppendTensor(const std::vector<int64_t>& dims, uint8_t* data);
arrow::Status AppendTensor(const std::vector<int64_t>& dims, int8_t* data);
arrow::Status AppendTensor(const std::vector<int64_t>& dims, uint16_t* data);
arrow::Status AppendTensor(const std::vector<int64_t>& dims, int16_t* data);
arrow::Status AppendTensor(const std::vector<int64_t>& dims, uint32_t* data);
arrow::Status AppendTensor(const std::vector<int64_t>& dims, int32_t* data);
arrow::Status AppendTensor(const std::vector<int64_t>& dims, uint64_t* data);
arrow::Status AppendTensor(const std::vector<int64_t>& dims, int64_t* data);
arrow::Status AppendTensor(const std::vector<int64_t>& dims, float* data);
arrow::Status AppendTensor(const std::vector<int64_t>& dims, double* data);
/*! Add a sublist to the sequence. The data contained in the sublist will be
specified in the "Finish" method.
To construct l = [[11, 22], 33, [44, 55]] you would for example run
list = ListBuilder();
list.AppendList(2);
list.Append(33);
list.AppendList(2);
list.Finish([11, 22, 44, 55]);
list.Finish();
\param size
The size of the sublist
*/
arrow::Status AppendList(int32_t size);
arrow::Status AppendTuple(int32_t size);
arrow::Status AppendDict(int32_t size);
//! Finish building the sequence and return the result
arrow::Status Finish(std::shared_ptr<arrow::Array> list_data,
std::shared_ptr<arrow::Array> tuple_data, std::shared_ptr<arrow::Array> dict_data,
std::shared_ptr<arrow::Array>* out);
private:
arrow::MemoryPool* pool_;
arrow::Int8Builder types_;
arrow::Int32Builder offsets_;
arrow::NullArrayBuilder nones_;
arrow::BooleanBuilder bools_;
arrow::Int64Builder ints_;
arrow::BinaryBuilder bytes_;
arrow::StringBuilder strings_;
arrow::FloatBuilder floats_;
arrow::DoubleBuilder doubles_;
UInt8TensorBuilder uint8_tensors_;
Int8TensorBuilder int8_tensors_;
UInt16TensorBuilder uint16_tensors_;
Int16TensorBuilder int16_tensors_;
UInt32TensorBuilder uint32_tensors_;
Int32TensorBuilder int32_tensors_;
UInt64TensorBuilder uint64_tensors_;
Int64TensorBuilder int64_tensors_;
FloatTensorBuilder float_tensors_;
DoubleTensorBuilder double_tensors_;
std::vector<int32_t> list_offsets_;
std::vector<int32_t> tuple_offsets_;
std::vector<int32_t> dict_offsets_;
int8_t bool_tag = -1;
int8_t int_tag = -1;
int8_t string_tag = -1;
int8_t bytes_tag = -1;
int8_t float_tag = -1;
int8_t double_tag = -1;
int8_t uint8_tensor_tag = -1;
int8_t int8_tensor_tag = -1;
int8_t uint16_tensor_tag = -1;
int8_t int16_tensor_tag = -1;
int8_t uint32_tensor_tag = -1;
int8_t int32_tensor_tag = -1;
int8_t uint64_tensor_tag = -1;
int8_t int64_tensor_tag = -1;
int8_t float_tensor_tag = -1;
int8_t double_tensor_tag = -1;
int8_t list_tag = -1;
int8_t tuple_tag = -1;
int8_t dict_tag = -1;
int8_t num_tags = 0;
};
} // namespace numbuf
#endif // NUMBUF_LIST_H
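To make the tag machinery concrete, here is a minimal sketch (illustrative only, not part of this commit; it assumes Arrow's RETURN_NOT_OK macro is in scope) that encodes the heterogeneous sequence [None, True, 42, 3.14]:

```
#include <arrow/api.h>
#include "sequence.h"

// Sketch: each type receives a union tag the first time it is appended.
arrow::Status EncodeSequenceExample(std::shared_ptr<arrow::Array>* out) {
  numbuf::SequenceBuilder builder(nullptr);
  RETURN_NOT_OK(builder.AppendNone());
  RETURN_NOT_OK(builder.AppendBool(true));
  RETURN_NOT_OK(builder.AppendInt64(42));
  RETURN_NOT_OK(builder.AppendDouble(3.14));
  // No sublists, tuples, or dicts were appended, so the flattened
  // data arrays passed to Finish are null.
  return builder.Finish(nullptr, nullptr, nullptr, out);
}
```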

View file

@@ -0,0 +1,56 @@
#include "tensor.h"
using namespace arrow;
namespace numbuf {
template <typename T>
TensorBuilder<T>::TensorBuilder(const TypePtr& dtype, MemoryPool* pool)
: dtype_(dtype), pool_(pool) {}
template <typename T>
Status TensorBuilder<T>::Start() {
dim_data_ = std::make_shared<Int64Builder>(pool_, std::make_shared<Int64Type>());
dims_ = std::make_shared<ListBuilder>(pool_, dim_data_);
value_data_ = std::make_shared<PrimitiveBuilder<T>>(pool_, dtype_);
values_ = std::make_shared<ListBuilder>(pool_, value_data_);
auto dims_field = std::make_shared<Field>("dims", dims_->type());
auto values_field = std::make_shared<Field>("data", values_->type());
auto type =
std::make_shared<StructType>(std::vector<FieldPtr>({dims_field, values_field}));
tensors_ = std::make_shared<StructBuilder>(
pool_, type, std::vector<std::shared_ptr<ArrayBuilder>>({dims_, values_}));
return Status::OK();
}
template <typename T>
Status TensorBuilder<T>::Append(const std::vector<int64_t>& dims, const elem_type* data) {
DCHECK(tensors_);
RETURN_NOT_OK(tensors_->Append());
RETURN_NOT_OK(dims_->Append());
RETURN_NOT_OK(values_->Append());
int32_t size = 1;
for (auto dim : dims) {
size *= dim;
RETURN_NOT_OK(dim_data_->Append(dim));
}
RETURN_NOT_OK(value_data_->Append(data, size));
return Status::OK(); // tensors_->Append();
}
template <typename T>
Status TensorBuilder<T>::Finish(std::shared_ptr<Array>* out) {
return tensors_->Finish(out);
}
template class TensorBuilder<UInt8Type>;
template class TensorBuilder<Int8Type>;
template class TensorBuilder<UInt16Type>;
template class TensorBuilder<Int16Type>;
template class TensorBuilder<UInt32Type>;
template class TensorBuilder<Int32Type>;
template class TensorBuilder<UInt64Type>;
template class TensorBuilder<Int64Type>;
template class TensorBuilder<FloatType>;
template class TensorBuilder<DoubleType>;
}

View file

@@ -0,0 +1,65 @@
#ifndef NUMBUF_TENSOR_H
#define NUMBUF_TENSOR_H
#include <arrow/api.h>
#include <arrow/type.h>
#include <memory>
namespace numbuf {
/*! This is a class for building a dataframe where each row corresponds to
a Tensor (= multidimensional array) of numerical data. There are two
columns, "dims" which contains an array of dimensions for each Tensor
and "data" which contains data buffer of the Tensor as a flattened array.
*/
template <typename T>
class TensorBuilder {
public:
typedef typename T::c_type elem_type;
TensorBuilder(const arrow::TypePtr& dtype, arrow::MemoryPool* pool = nullptr);
arrow::Status Start();
/*! Append a new tensor.
\param dims
The dimensions of the Tensor
\param data
Pointer to the beginning of the data buffer of the Tensor. The
total length of the buffer is sizeof(elem_type) * product of dims[i] over i
*/
arrow::Status Append(const std::vector<int64_t>& dims, const elem_type* data);
//! Convert the tensors to an Arrow StructArray
arrow::Status Finish(std::shared_ptr<arrow::Array>* out);
//! Number of tensors in the column
int32_t length() { return tensors_->length(); }
const arrow::TypePtr& type() { return tensors_->type(); }
private:
arrow::TypePtr dtype_;
arrow::MemoryPool* pool_;
std::shared_ptr<arrow::Int64Builder> dim_data_;
std::shared_ptr<arrow::ListBuilder> dims_;
std::shared_ptr<arrow::PrimitiveBuilder<T>> value_data_;
std::shared_ptr<arrow::ListBuilder> values_;
std::shared_ptr<arrow::StructBuilder> tensors_;
};
typedef TensorBuilder<arrow::UInt8Type> UInt8TensorBuilder;
typedef TensorBuilder<arrow::Int8Type> Int8TensorBuilder;
typedef TensorBuilder<arrow::UInt16Type> UInt16TensorBuilder;
typedef TensorBuilder<arrow::Int16Type> Int16TensorBuilder;
typedef TensorBuilder<arrow::UInt32Type> UInt32TensorBuilder;
typedef TensorBuilder<arrow::Int32Type> Int32TensorBuilder;
typedef TensorBuilder<arrow::UInt64Type> UInt64TensorBuilder;
typedef TensorBuilder<arrow::Int64Type> Int64TensorBuilder;
typedef TensorBuilder<arrow::FloatType> FloatTensorBuilder;
typedef TensorBuilder<arrow::DoubleType> DoubleTensorBuilder;
}
#endif // NUMBUF_TENSOR_H
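A minimal usage sketch for TensorBuilder (illustrative only, not part of this commit; RETURN_NOT_OK is assumed to come from Arrow's status header):

```
#include <memory>
#include <vector>
#include <arrow/api.h>
#include "tensor.h"

// Sketch: build a column containing a single 2x3 tensor of doubles.
arrow::Status EncodeTensorExample(std::shared_ptr<arrow::Array>* out) {
  numbuf::DoubleTensorBuilder builder(std::make_shared<arrow::DoubleType>());
  RETURN_NOT_OK(builder.Start());  // allocates the "dims" and "data" builders
  std::vector<int64_t> dims = {2, 3};
  double data[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};  // 2 * 3 values, row-major
  RETURN_NOT_OK(builder.Append(dims, data));
  return builder.Finish(out);
}
```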

View file

@@ -0,0 +1 @@
from libnumbuf import *

View file

@@ -0,0 +1,127 @@
#include "numpy.h"
#include "python.h"
#include <sstream>
#include <numbuf/tensor.h>
using namespace arrow;
extern "C" {
extern PyObject* numbuf_serialize_callback;
extern PyObject* numbuf_deserialize_callback;
}
namespace numbuf {
#define ARROW_TYPE_TO_NUMPY_CASE(TYPE) \
case Type::TYPE: \
return NPY_##TYPE;
#define DESERIALIZE_ARRAY_CASE(TYPE, ArrayType, type) \
case Type::TYPE: { \
auto values = std::dynamic_pointer_cast<ArrayType>(content->values()); \
DCHECK(values); \
type* data = const_cast<type*>(values->raw_data()) + content->offset(offset); \
*out = PyArray_SimpleNewFromData( \
num_dims, dim.data(), NPY_##TYPE, reinterpret_cast<void*>(data)); \
if (base != Py_None) { PyArray_SetBaseObject((PyArrayObject*)*out, base); } \
Py_XINCREF(base); \
} \
return Status::OK();
Status DeserializeArray(
std::shared_ptr<Array> array, int32_t offset, PyObject* base, PyObject** out) {
DCHECK(array);
auto tensor = std::dynamic_pointer_cast<StructArray>(array);
DCHECK(tensor);
auto dims = std::dynamic_pointer_cast<ListArray>(tensor->field(0));
auto content = std::dynamic_pointer_cast<ListArray>(tensor->field(1));
npy_intp num_dims = dims->value_length(offset);
std::vector<npy_intp> dim(num_dims);
for (int i = dims->offset(offset); i < dims->offset(offset + 1); ++i) {
dim[i - dims->offset(offset)] =
std::dynamic_pointer_cast<Int64Array>(dims->values())->Value(i);
}
switch (content->value_type()->type) {
DESERIALIZE_ARRAY_CASE(INT8, Int8Array, int8_t)
DESERIALIZE_ARRAY_CASE(INT16, Int16Array, int16_t)
DESERIALIZE_ARRAY_CASE(INT32, Int32Array, int32_t)
DESERIALIZE_ARRAY_CASE(INT64, Int64Array, int64_t)
DESERIALIZE_ARRAY_CASE(UINT8, UInt8Array, uint8_t)
DESERIALIZE_ARRAY_CASE(UINT16, UInt16Array, uint16_t)
DESERIALIZE_ARRAY_CASE(UINT32, UInt32Array, uint32_t)
DESERIALIZE_ARRAY_CASE(UINT64, UInt64Array, uint64_t)
DESERIALIZE_ARRAY_CASE(FLOAT, FloatArray, float)
DESERIALIZE_ARRAY_CASE(DOUBLE, DoubleArray, double)
default:
DCHECK(false) << "arrow type not recognized: " << content->value_type()->type;
}
return Status::OK();
}
Status SerializeArray(
PyArrayObject* array, SequenceBuilder& builder, std::vector<PyObject*>& subdicts) {
size_t ndim = PyArray_NDIM(array);
int dtype = PyArray_TYPE(array);
std::vector<int64_t> dims(ndim);
for (int i = 0; i < ndim; ++i) {
dims[i] = PyArray_DIM(array, i);
}
// TODO(pcm): Once we don't use builders any more below and directly share
// the memory buffer, we need to be more careful about this and not
// decrease the reference count of "contiguous" before the serialization
// is finished
auto contiguous = PyArray_GETCONTIGUOUS(array);
auto data = PyArray_DATA(contiguous);
switch (dtype) {
case NPY_UINT8:
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<uint8_t*>(data)));
break;
case NPY_INT8:
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<int8_t*>(data)));
break;
case NPY_UINT16:
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<uint16_t*>(data)));
break;
case NPY_INT16:
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<int16_t*>(data)));
break;
case NPY_UINT32:
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<uint32_t*>(data)));
break;
case NPY_INT32:
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<int32_t*>(data)));
break;
case NPY_UINT64:
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<uint64_t*>(data)));
break;
case NPY_INT64:
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<int64_t*>(data)));
break;
case NPY_FLOAT:
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<float*>(data)));
break;
case NPY_DOUBLE:
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<double*>(data)));
break;
default:
if (!numbuf_serialize_callback) {
std::stringstream stream;
stream << "numpy data type not recognized: " << dtype;
return Status::NotImplemented(stream.str());
} else {
PyObject* arglist = Py_BuildValue("(O)", array);
// The reference count of the result of the call to PyObject_CallObject
// must be decremented. This is done in SerializeDict in python.cc.
PyObject* result = PyObject_CallObject(numbuf_serialize_callback, arglist);
Py_XDECREF(arglist);
if (!result) { return Status::NotImplemented("python error"); }
builder.AppendDict(PyDict_Size(result));
subdicts.push_back(result);
}
}
Py_XDECREF(contiguous);
return Status::OK();
}
}

View file

@@ -0,0 +1,23 @@
#ifndef PYNUMBUF_NUMPY_H
#define PYNUMBUF_NUMPY_H
#include <Python.h>
#include <arrow/api.h>
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#define NO_IMPORT_ARRAY
#define PY_ARRAY_UNIQUE_SYMBOL NUMBUF_ARRAY_API
#include <numpy/arrayobject.h>
#include <numbuf/sequence.h>
#include <numbuf/tensor.h>
namespace numbuf {
arrow::Status SerializeArray(
PyArrayObject* array, SequenceBuilder& builder, std::vector<PyObject*>& subdicts);
arrow::Status DeserializeArray(
std::shared_ptr<arrow::Array> array, int32_t offset, PyObject* base, PyObject** out);
}
#endif

View file

@@ -0,0 +1,290 @@
#include "python.h"
#include <sstream>
#include "scalars.h"
using namespace arrow;
int32_t MAX_RECURSION_DEPTH = 100;
extern "C" {
extern PyObject* numbuf_serialize_callback;
extern PyObject* numbuf_deserialize_callback;
}
namespace numbuf {
Status get_value(
ArrayPtr arr, int32_t index, int32_t type, PyObject* base, PyObject** result) {
switch (arr->type()->type) {
case Type::BOOL:
*result =
PyBool_FromLong(std::static_pointer_cast<BooleanArray>(arr)->Value(index));
return Status::OK();
case Type::INT64:
*result = PyInt_FromLong(std::static_pointer_cast<Int64Array>(arr)->Value(index));
return Status::OK();
case Type::BINARY: {
int32_t nchars;
const uint8_t* str =
std::static_pointer_cast<BinaryArray>(arr)->GetValue(index, &nchars);
*result = PyString_FromStringAndSize(reinterpret_cast<const char*>(str), nchars);
return Status::OK();
}
case Type::STRING: {
int32_t nchars;
const uint8_t* str =
std::static_pointer_cast<StringArray>(arr)->GetValue(index, &nchars);
*result = PyUnicode_FromStringAndSize(reinterpret_cast<const char*>(str), nchars);
return Status::OK();
}
case Type::FLOAT:
*result =
PyFloat_FromDouble(std::static_pointer_cast<FloatArray>(arr)->Value(index));
return Status::OK();
case Type::DOUBLE:
*result =
PyFloat_FromDouble(std::static_pointer_cast<DoubleArray>(arr)->Value(index));
return Status::OK();
case Type::STRUCT: {
auto s = std::static_pointer_cast<StructArray>(arr);
auto l = std::static_pointer_cast<ListArray>(s->field(0));
if (s->type()->child(0)->name == "list") {
return DeserializeList(l->values(), l->value_offset(index),
l->value_offset(index + 1), base, result);
} else if (s->type()->child(0)->name == "tuple") {
return DeserializeTuple(l->values(), l->value_offset(index),
l->value_offset(index + 1), base, result);
} else if (s->type()->child(0)->name == "dict") {
return DeserializeDict(l->values(), l->value_offset(index),
l->value_offset(index + 1), base, result);
} else {
return DeserializeArray(arr, index, base, result);
}
}
default:
DCHECK(false) << "union tag not recognized " << type;
}
return Status::OK();
}
Status append(PyObject* elem, SequenceBuilder& builder, std::vector<PyObject*>& sublists,
std::vector<PyObject*>& subtuples, std::vector<PyObject*>& subdicts) {
// The bool case must precede the int case (PyInt_Check passes for bools)
if (PyBool_Check(elem)) {
RETURN_NOT_OK(builder.AppendBool(elem == Py_True));
} else if (PyFloat_Check(elem)) {
RETURN_NOT_OK(builder.AppendDouble(PyFloat_AS_DOUBLE(elem)));
} else if (PyLong_Check(elem)) {
int overflow = 0;
int64_t data = PyLong_AsLongLongAndOverflow(elem, &overflow);
RETURN_NOT_OK(builder.AppendInt64(data));
if (overflow) { return Status::NotImplemented("long overflow"); }
} else if (PyInt_Check(elem)) {
RETURN_NOT_OK(builder.AppendInt64(static_cast<int64_t>(PyInt_AS_LONG(elem))));
} else if (PyString_Check(elem)) {
auto data = reinterpret_cast<uint8_t*>(PyString_AS_STRING(elem));
auto size = PyString_GET_SIZE(elem);
RETURN_NOT_OK(builder.AppendBytes(data, size));
} else if (PyUnicode_Check(elem)) {
Py_ssize_t size;
PyObject* str = NULL;
#if PY_MAJOR_VERSION >= 3
char* data =
PyUnicode_AsUTF8AndSize(elem, &size); // TODO(pcm): Check if this is correct
#else
str = PyUnicode_AsUTF8String(elem);
char* data = PyString_AS_STRING(str);
size = PyString_GET_SIZE(str);
#endif
Status s = builder.AppendString(data, size);
Py_XDECREF(str); // str is only non-NULL in the Python 2 branch
RETURN_NOT_OK(s);
} else if (PyList_Check(elem)) {
builder.AppendList(PyList_Size(elem));
sublists.push_back(elem);
} else if (PyDict_Check(elem)) {
builder.AppendDict(PyDict_Size(elem));
subdicts.push_back(elem);
} else if (PyTuple_CheckExact(elem)) {
builder.AppendTuple(PyTuple_Size(elem));
subtuples.push_back(elem);
} else if (PyArray_IsScalar(elem, Generic)) {
RETURN_NOT_OK(AppendScalar(elem, builder));
} else if (PyArray_Check(elem)) {
RETURN_NOT_OK(SerializeArray((PyArrayObject*)elem, builder, subdicts));
} else if (elem == Py_None) {
RETURN_NOT_OK(builder.AppendNone());
} else {
if (!numbuf_serialize_callback) {
std::stringstream ss;
ss << "data type of " << PyString_AS_STRING(PyObject_Repr(elem))
<< " not recognized and custom serialization handler not registered";
return Status::NotImplemented(ss.str());
} else {
PyObject* arglist = Py_BuildValue("(O)", elem);
// The reference count of the result of the call to PyObject_CallObject
// must be decremented. This is done in SerializeDict in this file.
PyObject* result = PyObject_CallObject(numbuf_serialize_callback, arglist);
Py_XDECREF(arglist);
if (!result) { return Status::NotImplemented("python error"); }
builder.AppendDict(PyDict_Size(result));
subdicts.push_back(result);
}
}
return Status::OK();
}
Status SerializeSequences(std::vector<PyObject*> sequences, int32_t recursion_depth,
std::shared_ptr<Array>* out) {
DCHECK(out);
if (recursion_depth >= MAX_RECURSION_DEPTH) {
return Status::NotImplemented(
"This object exceeds the maximum recursion depth. It may contain itself "
"recursively.");
}
SequenceBuilder builder(nullptr);
std::vector<PyObject *> sublists, subtuples, subdicts;
for (const auto& sequence : sequences) {
PyObject* item;
PyObject* iterator = PyObject_GetIter(sequence);
while ((item = PyIter_Next(iterator))) {
Status s = append(item, builder, sublists, subtuples, subdicts);
Py_DECREF(item);
// if an error occurs, we need to decrement the reference counts before returning
if (!s.ok()) {
Py_DECREF(iterator);
return s;
}
}
Py_DECREF(iterator);
}
std::shared_ptr<Array> list;
if (sublists.size() > 0) {
RETURN_NOT_OK(SerializeSequences(sublists, recursion_depth + 1, &list));
}
std::shared_ptr<Array> tuple;
if (subtuples.size() > 0) {
RETURN_NOT_OK(SerializeSequences(subtuples, recursion_depth + 1, &tuple));
}
std::shared_ptr<Array> dict;
if (subdicts.size() > 0) {
RETURN_NOT_OK(SerializeDict(subdicts, recursion_depth + 1, &dict));
}
return builder.Finish(list, tuple, dict, out);
}
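// DESERIALIZE_SEQUENCE walks the union's type and offset arrays and
// reconstructs each element through get_value; it is shared by the
// list and tuple deserializers defined below.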
#define DESERIALIZE_SEQUENCE(CREATE, SET_ITEM) \
auto data = std::dynamic_pointer_cast<UnionArray>(array); \
int32_t size = array->length(); \
PyObject* result = CREATE(stop_idx - start_idx); \
auto types = std::make_shared<Int8Array>(size, data->types()); \
auto offsets = std::make_shared<Int32Array>(size, data->offset_buf()); \
for (size_t i = start_idx; i < stop_idx; ++i) { \
if (data->IsNull(i)) { \
Py_INCREF(Py_None); \
SET_ITEM(result, i - start_idx, Py_None); \
} else { \
int32_t offset = offsets->Value(i); \
int8_t type = types->Value(i); \
ArrayPtr arr = data->child(type); \
PyObject* value; \
RETURN_NOT_OK(get_value(arr, offset, type, base, &value)); \
SET_ITEM(result, i - start_idx, value); \
} \
} \
*out = result; \
return Status::OK();
Status DeserializeList(std::shared_ptr<Array> array, int32_t start_idx, int32_t stop_idx,
PyObject* base, PyObject** out) {
DESERIALIZE_SEQUENCE(PyList_New, PyList_SetItem)
}
Status DeserializeTuple(std::shared_ptr<Array> array, int32_t start_idx, int32_t stop_idx,
PyObject* base, PyObject** out) {
DESERIALIZE_SEQUENCE(PyTuple_New, PyTuple_SetItem)
}
Status SerializeDict(
std::vector<PyObject*> dicts, int32_t recursion_depth, std::shared_ptr<Array>* out) {
DictBuilder result;
if (recursion_depth >= MAX_RECURSION_DEPTH) {
return Status::NotImplemented(
"This object exceeds the maximum recursion depth. It may contain itself "
"recursively.");
}
std::vector<PyObject *> key_tuples, val_lists, val_tuples, val_dicts, dummy;
for (const auto& dict : dicts) {
PyObject *key, *value;
Py_ssize_t pos = 0;
while (PyDict_Next(dict, &pos, &key, &value)) {
RETURN_NOT_OK(append(key, result.keys(), dummy, key_tuples, dummy));
DCHECK(dummy.size() == 0);
RETURN_NOT_OK(append(value, result.vals(), val_lists, val_tuples, val_dicts));
}
}
std::shared_ptr<Array> key_tuples_arr;
if (key_tuples.size() > 0) {
RETURN_NOT_OK(SerializeSequences(key_tuples, recursion_depth + 1, &key_tuples_arr));
}
std::shared_ptr<Array> val_list_arr;
if (val_lists.size() > 0) {
RETURN_NOT_OK(SerializeSequences(val_lists, recursion_depth + 1, &val_list_arr));
}
std::shared_ptr<Array> val_tuples_arr;
if (val_tuples.size() > 0) {
RETURN_NOT_OK(SerializeSequences(val_tuples, recursion_depth + 1, &val_tuples_arr));
}
std::shared_ptr<Array> val_dict_arr;
if (val_dicts.size() > 0) {
RETURN_NOT_OK(SerializeDict(val_dicts, recursion_depth + 1, &val_dict_arr));
}
result.Finish(key_tuples_arr, val_list_arr, val_tuples_arr, val_dict_arr, out);
// This block is used to decrement the reference counts of the results
// returned by the serialization callback, which is called in SerializeArray
// in numpy.cc as well as in DeserializeDict and in append in this file.
static PyObject* py_type = PyString_FromString("_pytype_");
for (const auto& dict : dicts) {
if (PyDict_Contains(dict, py_type)) {
// If the dictionary contains the key "_pytype_", then the user has to
// have registered a callback.
ARROW_CHECK(numbuf_serialize_callback);
Py_XDECREF(dict);
}
}
return Status::OK();
}
Status DeserializeDict(std::shared_ptr<Array> array, int32_t start_idx, int32_t stop_idx,
PyObject* base, PyObject** out) {
auto data = std::dynamic_pointer_cast<StructArray>(array);
// TODO(pcm): error handling, get rid of the temporary copy of the list
PyObject *keys, *vals;
PyObject* result = PyDict_New();
ARROW_RETURN_NOT_OK(DeserializeList(data->field(0), start_idx, stop_idx, base, &keys));
ARROW_RETURN_NOT_OK(DeserializeList(data->field(1), start_idx, stop_idx, base, &vals));
for (size_t i = start_idx; i < stop_idx; ++i) {
PyDict_SetItem(
result, PyList_GetItem(keys, i - start_idx), PyList_GetItem(vals, i - start_idx));
}
Py_XDECREF(keys); // PyList_GetItem(keys, ...) incremented the reference count
Py_XDECREF(vals); // PyList_GetItem(vals, ...) incremented the reference count
static PyObject* py_type = PyString_FromString("_pytype_");
if (PyDict_Contains(result, py_type) && numbuf_deserialize_callback) {
PyObject* arglist = Py_BuildValue("(O)", result);
// The result of the call to PyObject_CallObject will be passed to Python
// and its reference count will be decremented by the interpreter.
PyObject* callback_result = PyObject_CallObject(numbuf_deserialize_callback, arglist);
Py_XDECREF(arglist);
Py_XDECREF(result);
result = callback_result;
if (!callback_result) { return Status::NotImplemented("python error"); }
}
*out = result;
return Status::OK();
}
}

View file

@@ -0,0 +1,26 @@
#ifndef PYNUMBUF_PYTHON_H
#define PYNUMBUF_PYTHON_H
#include <Python.h>
#include <arrow/api.h>
#include <numbuf/dict.h>
#include <numbuf/sequence.h>
#include "numpy.h"
namespace numbuf {
arrow::Status SerializeSequences(std::vector<PyObject*> sequences,
int32_t recursion_depth, std::shared_ptr<arrow::Array>* out);
arrow::Status SerializeDict(std::vector<PyObject*> dicts, int32_t recursion_depth,
std::shared_ptr<arrow::Array>* out);
arrow::Status DeserializeList(std::shared_ptr<arrow::Array> array, int32_t start_idx,
int32_t stop_idx, PyObject* base, PyObject** out);
arrow::Status DeserializeTuple(std::shared_ptr<arrow::Array> array, int32_t start_idx,
int32_t stop_idx, PyObject* base, PyObject** out);
arrow::Status DeserializeDict(std::shared_ptr<arrow::Array> array, int32_t start_idx,
int32_t stop_idx, PyObject* base, PyObject** out);
}
#endif

View file

@@ -0,0 +1,54 @@
#ifndef PYNUMBUF_SCALARS_H
#define PYNUMBUF_SCALARS_H
#include <arrow/api.h>
#include <Python.h>
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#define NO_IMPORT_ARRAY
#define PY_ARRAY_UNIQUE_SYMBOL NUMBUF_ARRAY_API
#include <numpy/arrayobject.h>
#include <numpy/arrayscalars.h>
#include <numbuf/sequence.h>
namespace numbuf {
arrow::Status AppendScalar(PyObject* obj, SequenceBuilder& builder) {
if (PyArray_IsScalar(obj, Bool)) {
return builder.AppendBool(((PyBoolScalarObject*)obj)->obval != 0);
} else if (PyArray_IsScalar(obj, Float)) {
return builder.AppendFloat(((PyFloatScalarObject*)obj)->obval);
} else if (PyArray_IsScalar(obj, Double)) {
return builder.AppendDouble(((PyDoubleScalarObject*)obj)->obval);
}
int64_t value = 0;
if (PyArray_IsScalar(obj, Byte)) {
value = ((PyByteScalarObject*)obj)->obval;
} else if (PyArray_IsScalar(obj, UByte)) {
value = ((PyUByteScalarObject*)obj)->obval;
} else if (PyArray_IsScalar(obj, Short)) {
value = ((PyShortScalarObject*)obj)->obval;
} else if (PyArray_IsScalar(obj, UShort)) {
value = ((PyUShortScalarObject*)obj)->obval;
} else if (PyArray_IsScalar(obj, Int)) {
value = ((PyIntScalarObject*)obj)->obval;
} else if (PyArray_IsScalar(obj, UInt)) {
value = ((PyUIntScalarObject*)obj)->obval;
} else if (PyArray_IsScalar(obj, Long)) {
value = ((PyLongScalarObject*)obj)->obval;
} else if (PyArray_IsScalar(obj, ULong)) {
value = ((PyULongScalarObject*)obj)->obval;
} else if (PyArray_IsScalar(obj, LongLong)) {
value = ((PyLongLongScalarObject*)obj)->obval;
} else if (PyArray_IsScalar(obj, ULongLong)) {
value = ((PyULongLongScalarObject*)obj)->obval;
} else {
DCHECK(false) << "scalar type not recognized";
}
return builder.AppendInt64(value);
}
} // namespace
#endif // PYNUMBUF_SCALARS_H

View file

@@ -0,0 +1,67 @@
#ifndef PYNUMBUF_MEMORY_H
#define PYNUMBUF_MEMORY_H
#include <arrow/io/interfaces.h>
namespace numbuf {
class FixedBufferStream : public arrow::io::OutputStream,
public arrow::io::ReadableFileInterface {
public:
virtual ~FixedBufferStream() {}
explicit FixedBufferStream(uint8_t* data, int64_t nbytes)
: data_(data), position_(0), size_(nbytes) {}
arrow::Status Read(int64_t nbytes, std::shared_ptr<arrow::Buffer>* out) override {
DCHECK(out);
DCHECK(position_ + nbytes <= size_) << "position: " << position_
<< " nbytes: " << nbytes << " size: " << size_;
*out = std::make_shared<arrow::Buffer>(data_ + position_, nbytes);
position_ += nbytes;
return arrow::Status::OK();
}
arrow::Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) {
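// Deliberately unimplemented: calling this overload aborts. Only the
// zero-copy Read overload above is used.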
assert(0);
return arrow::Status::OK();
}
arrow::Status Seek(int64_t position) override {
position_ = position;
return arrow::Status::OK();
}
arrow::Status Close() override { return arrow::Status::OK(); }
arrow::Status Tell(int64_t* position) override {
*position = position_;
return arrow::Status::OK();
}
arrow::Status Write(const uint8_t* data, int64_t nbytes) override {
DCHECK(position_ >= 0 && position_ < size_);
DCHECK(position_ + nbytes <= size_) << "position: " << position_
<< " nbytes: " << nbytes << " size: " << size_;
uint8_t* dst = data_ + position_;
memcpy(dst, data, nbytes);
position_ += nbytes;
return arrow::Status::OK();
}
arrow::Status GetSize(int64_t* size) override {
*size = size_;
return arrow::Status::OK();
}
bool supports_zero_copy() const override { return true; }
private:
uint8_t* data_;
int64_t position_;
int64_t size_;
};
} // namespace numbuf
#endif // PYNUMBUF_MEMORY_H
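A round-trip sketch for FixedBufferStream (illustrative only, not part of this commit; RETURN_NOT_OK is assumed to come from Arrow's status header):

```
#include <memory>
#include <vector>
#include "memory.h"

// Sketch: write four bytes into caller-owned memory, then read them
// back; the stream never takes ownership of the underlying buffer.
arrow::Status RoundTripExample() {
  std::vector<uint8_t> storage(64);
  numbuf::FixedBufferStream stream(storage.data(), storage.size());
  const uint8_t payload[] = {1, 2, 3, 4};
  RETURN_NOT_OK(stream.Write(payload, sizeof(payload)));
  RETURN_NOT_OK(stream.Seek(0));  // rewind before reading back
  std::shared_ptr<arrow::Buffer> read_back;
  // Zero-copy read: the returned Buffer points directly into storage.
  RETURN_NOT_OK(stream.Read(sizeof(payload), &read_back));
  return arrow::Status::OK();
}
```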

View file

@@ -0,0 +1,192 @@
#include <Python.h>
#include <arrow/api.h>
#include <arrow/ipc/adapter.h>
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#define PY_ARRAY_UNIQUE_SYMBOL NUMBUF_ARRAY_API
#include <numpy/arrayobject.h>
#include <iostream>
#include <arrow/ipc/metadata.h>
#include "adapters/python.h"
#include "memory.h"
using namespace arrow;
using namespace numbuf;
std::shared_ptr<RecordBatch> make_row_batch(std::shared_ptr<Array> data) {
auto field = std::make_shared<Field>("list", data->type());
std::shared_ptr<Schema> schema(new Schema({field}));
return std::shared_ptr<RecordBatch>(new RecordBatch(schema, data->length(), {data}));
}
extern "C" {
static PyObject* NumbufError;
PyObject* numbuf_serialize_callback = NULL;
PyObject* numbuf_deserialize_callback = NULL;
int PyObjectToArrow(PyObject* object, std::shared_ptr<RecordBatch>** result) {
if (PyCapsule_IsValid(object, "arrow")) {
*result = reinterpret_cast<std::shared_ptr<RecordBatch>*>(
PyCapsule_GetPointer(object, "arrow"));
return 1;
} else {
PyErr_SetString(PyExc_TypeError, "must be an 'arrow' capsule");
return 0;
}
}
static void ArrowCapsule_Destructor(PyObject* capsule) {
delete reinterpret_cast<std::shared_ptr<RecordBatch>*>(
PyCapsule_GetPointer(capsule, "arrow"));
}
/* Documented in doc/numbuf.rst in ray-core */
static PyObject* serialize_list(PyObject* self, PyObject* args) {
PyObject* value;
if (!PyArg_ParseTuple(args, "O", &value)) { return NULL; }
std::shared_ptr<Array> array;
if (PyList_Check(value)) {
int32_t recursion_depth = 0;
Status s =
SerializeSequences(std::vector<PyObject*>({value}), recursion_depth, &array);
if (!s.ok()) {
// If this condition is true, there was an error in the callback that
// needs to be passed through
if (!PyErr_Occurred()) { PyErr_SetString(NumbufError, s.ToString().c_str()); }
return NULL;
}
auto batch = new std::shared_ptr<RecordBatch>();
*batch = make_row_batch(array);
int64_t size = 0;
ARROW_CHECK_OK(arrow::ipc::GetRecordBatchSize(batch->get(), &size));
std::shared_ptr<Buffer> buffer;
ARROW_CHECK_OK(ipc::WriteSchema((*batch)->schema().get(), &buffer));
auto ptr = reinterpret_cast<const char*>(buffer->data());
PyObject* r = PyTuple_New(3);
PyTuple_SetItem(r, 0, PyByteArray_FromStringAndSize(ptr, buffer->size()));
PyTuple_SetItem(r, 1, PyInt_FromLong(size));
PyTuple_SetItem(r, 2,
PyCapsule_New(reinterpret_cast<void*>(batch), "arrow", &ArrowCapsule_Destructor));
return r;
}
return NULL;
}
/* Documented in doc/numbuf.rst in ray-core */
static PyObject* write_to_buffer(PyObject* self, PyObject* args) {
std::shared_ptr<RecordBatch>* batch;
PyObject* memoryview;
if (!PyArg_ParseTuple(args, "O&O", &PyObjectToArrow, &batch, &memoryview)) {
return NULL;
}
if (!PyMemoryView_Check(memoryview)) { return NULL; }
Py_buffer* buffer = PyMemoryView_GET_BUFFER(memoryview);
auto target = std::make_shared<FixedBufferStream>(
reinterpret_cast<uint8_t*>(buffer->buf), buffer->len);
int64_t body_end_offset;
int64_t header_end_offset;
ARROW_CHECK_OK(ipc::WriteRecordBatch((*batch)->columns(), (*batch)->num_rows(),
target.get(), &body_end_offset, &header_end_offset));
return PyInt_FromLong(header_end_offset);
}
/* Documented in doc/numbuf.rst in ray-core */
static PyObject* read_from_buffer(PyObject* self, PyObject* args) {
PyObject* memoryview;
PyObject* metadata;
int64_t metadata_offset;
if (!PyArg_ParseTuple(args, "OOL", &memoryview, &metadata, &metadata_offset)) {
return NULL;
}
auto ptr = reinterpret_cast<uint8_t*>(PyByteArray_AsString(metadata));
auto schema_buffer = std::make_shared<Buffer>(ptr, PyByteArray_Size(metadata));
std::shared_ptr<ipc::Message> message;
ARROW_CHECK_OK(ipc::Message::Open(schema_buffer, &message));
DCHECK_EQ(ipc::Message::SCHEMA, message->type());
std::shared_ptr<ipc::SchemaMessage> schema_msg = message->GetSchema();
std::shared_ptr<Schema> schema;
ARROW_CHECK_OK(schema_msg->GetSchema(&schema));
Py_buffer* buffer = PyMemoryView_GET_BUFFER(memoryview);
auto source = std::make_shared<FixedBufferStream>(
reinterpret_cast<uint8_t*>(buffer->buf), buffer->len);
std::shared_ptr<arrow::ipc::RecordBatchReader> reader;
ARROW_CHECK_OK(
arrow::ipc::RecordBatchReader::Open(source.get(), metadata_offset, &reader));
auto batch = new std::shared_ptr<arrow::RecordBatch>();
ARROW_CHECK_OK(reader->GetRecordBatch(schema, batch));
return PyCapsule_New(reinterpret_cast<void*>(batch), "arrow", &ArrowCapsule_Destructor);
}
/* Documented in doc/numbuf.rst in ray-core */
static PyObject* deserialize_list(PyObject* self, PyObject* args) {
std::shared_ptr<RecordBatch>* data;
PyObject* base = Py_None;
if (!PyArg_ParseTuple(args, "O&|O", &PyObjectToArrow, &data, &base)) { return NULL; }
PyObject* result;
Status s = DeserializeList((*data)->column(0), 0, (*data)->num_rows(), base, &result);
if (!s.ok()) {
// If this condition is true, there was an error in the callback that
// needs to be passed through
if (!PyErr_Occurred()) { PyErr_SetString(NumbufError, s.ToString().c_str()); }
return NULL;
}
return result;
}
static PyObject* register_callbacks(PyObject* self, PyObject* args) {
PyObject* result = NULL;
PyObject* serialize_callback;
PyObject* deserialize_callback;
if (PyArg_ParseTuple(
args, "OO:register_callbacks", &serialize_callback, &deserialize_callback)) {
if (!PyCallable_Check(serialize_callback)) {
PyErr_SetString(PyExc_TypeError, "serialize_callback must be callable");
return NULL;
}
if (!PyCallable_Check(deserialize_callback)) {
PyErr_SetString(PyExc_TypeError, "deserialize_callback must be callable");
return NULL;
}
Py_XINCREF(serialize_callback); // Add a reference to new serialization callback
Py_XINCREF(deserialize_callback); // Add a reference to new deserialization callback
Py_XDECREF(numbuf_serialize_callback); // Dispose of old serialization callback
Py_XDECREF(numbuf_deserialize_callback); // Dispose of old deserialization callback
numbuf_serialize_callback = serialize_callback;
numbuf_deserialize_callback = deserialize_callback;
Py_INCREF(Py_None);
result = Py_None;
}
return result;
}
static PyMethodDef NumbufMethods[] = {
{"serialize_list", serialize_list, METH_VARARGS, "serialize a Python list"},
{"deserialize_list", deserialize_list, METH_VARARGS, "deserialize a Python list"},
{"write_to_buffer", write_to_buffer, METH_VARARGS, "write serialized data to buffer"},
{"read_from_buffer", read_from_buffer, METH_VARARGS,
"read serialized data from buffer"},
{"register_callbacks", register_callbacks, METH_VARARGS,
"set serialization and deserialization callbacks"},
{NULL, NULL, 0, NULL}};
PyMODINIT_FUNC initlibnumbuf(void) {
PyObject* m;
m = Py_InitModule3("libnumbuf", NumbufMethods, "Python C Extension for Numbuf");
char numbuf_error[] = "numbuf.error";
NumbufError = PyErr_NewException(numbuf_error, NULL, NULL);
Py_INCREF(NumbufError);
PyModule_AddObject(m, "numbuf_error", NumbufError);
import_array();
}
}

View file

@@ -0,0 +1,114 @@
import unittest
import numbuf
import numpy as np
from numpy.testing import assert_equal
TEST_OBJECTS = [{(1,2) : 1}, {() : 2}, [1, "hello", 3.0], 42, 43L, "hello world",
u"x", u"\u262F", 42.0,
1L << 62, (1.0, "hi"),
None, (None, None), ("hello", None),
True, False, (True, False), "hello",
{True: "hello", False: "world"},
{"hello" : "world", 1: 42, 1.0: 45}, {},
np.int8(3), np.int32(4), np.int64(5),
np.uint8(3), np.uint32(4), np.uint64(5),
np.float32(1.0), np.float64(1.0)]
class SerializationTests(unittest.TestCase):
def roundTripTest(self, data):
schema, size, serialized = numbuf.serialize_list(data)
result = numbuf.deserialize_list(serialized)
assert_equal(data, result)
def testSimple(self):
self.roundTripTest([1, 2, 3])
self.roundTripTest([1.0, 2.0, 3.0])
self.roundTripTest(['hello', 'world'])
self.roundTripTest([1, 'hello', 1.0])
self.roundTripTest([{'hello': 1.0, 'world': 42}])
self.roundTripTest([True, False])
def testNone(self):
self.roundTripTest([1, 2, None, 3])
def testNested(self):
self.roundTripTest([{"hello": {"world": (1, 2, 3)}}])
self.roundTripTest([((1,), (1, 2, 3, (4, 5, 6), "string"))])
self.roundTripTest([{"hello": [1, 2, 3]}])
self.roundTripTest([{"hello": [1, [2, 3]]}])
self.roundTripTest([{"hello": (None, 2, [3, 4])}])
self.roundTripTest([{"hello": (None, 2, [3, 4], np.array([1.0, 2.0, 3.0]))}])
def numpyTest(self, t):
a = np.random.randint(0, 10, size=(100, 100)).astype(t)
self.roundTripTest([a])
def testArrays(self):
for t in ["int8", "uint8", "int16", "uint16", "int32", "uint32", "float32", "float64"]:
self.numpyTest(t)
def testRay(self):
for obj in TEST_OBJECTS:
self.roundTripTest([obj])
def testCallback(self):
class Foo(object):
def __init__(self):
self.x = 1
class Bar(object):
def __init__(self):
self.foo = Foo()
def serialize(obj):
return dict(obj.__dict__, **{"_pytype_": type(obj).__name__})
def deserialize(obj):
if obj["_pytype_"] == "Foo":
result = Foo()
elif obj["_pytype_"] == "Bar":
result = Bar()
obj.pop("_pytype_", None)
result.__dict__ = obj
return result
bar = Bar()
bar.foo.x = 42
numbuf.register_callbacks(serialize, deserialize)
metadata, size, serialized = numbuf.serialize_list([bar])
self.assertEqual(numbuf.deserialize_list(serialized)[0].foo.x, 42)
def testObjectArray(self):
x = np.array([1, 2, "hello"], dtype=object)
y = np.array([[1, 2], [3, 4]], dtype=object)
def myserialize(obj):
return {"_pytype_": "numpy.array", "data": obj.tolist()}
def mydeserialize(obj):
if obj["_pytype_"] == "numpy.array":
return np.array(obj["data"], dtype=object)
numbuf.register_callbacks(myserialize, mydeserialize)
metadata, size, serialized = numbuf.serialize_list([x, y])
assert_equal(numbuf.deserialize_list(serialized), [x, y])
def testBuffer(self):
for (i, obj) in enumerate(TEST_OBJECTS):
schema, size, batch = numbuf.serialize_list([obj])
size = size + 4096 # INITIAL_METADATA_SIZE in arrow
buff = np.zeros(size, dtype="uint8")
metadata_offset = numbuf.write_to_buffer(batch, memoryview(buff))
array = numbuf.read_from_buffer(memoryview(buff), schema, metadata_offset)
result = numbuf.deserialize_list(array)
assert_equal(result[0], obj)
if __name__ == "__main__":
unittest.main(verbosity=2)

numbuf/setup.py Normal file
View file

@@ -0,0 +1,33 @@
import subprocess
from setuptools import setup, find_packages
import setuptools.command.install as _install
from sys import platform
extension = ""
if platform == "linux" or platform == "linux2":
extension = ".so"
elif platform == "darwin":
extension = ".dylib"
# Because of relative paths, this must be run from inside numbuf/.
class install(_install.install):
def run(self):
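    # setup.sh downloads and builds the bundled Arrow dependency (see
    # thirdparty/); build.sh then builds the numbuf library itself.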
subprocess.check_call(["./setup.sh"])
subprocess.check_call(["./build.sh"])
# Calling _install.install.run(self) does not fetch required packages and
# instead performs an old-style install. See command/install.py in
# setuptools. So, calling do_egg_install() manually here.
self.do_egg_install()
setup(name="numbuf",
version="0.0.1",
packages=find_packages(),
package_data={"numbuf": ["libnumbuf.so",
"libarrow" + extension,
"libarrow_io" + extension,
"libarrow_ipc" + extension]},
cmdclass={"install": install},
setup_requires=["numpy"],
include_package_data=True,
zip_safe=False)

numbuf/setup.sh Executable file
View file

@@ -0,0 +1,21 @@
#!/usr/bin/env bash
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
platform="unknown"
unamestr="$(uname)"
if [[ "$unamestr" == "Linux" ]]; then
echo "Platform is linux."
platform="linux"
elif [[ "$unamestr" == "Darwin" ]]; then
echo "Platform is macosx."
platform="macosx"
else
echo "Unrecognized platform."
exit 1
fi
pushd "$ROOT_DIR"
./thirdparty/download_thirdparty.sh
./thirdparty/build_thirdparty.sh
popd

numbuf/thirdparty/build_thirdparty.sh vendored Executable file
View file

@@ -0,0 +1,26 @@
#!/bin/bash
set -x
set -e
TP_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
PREFIX=$TP_DIR/installed
# Determine how many parallel jobs to use for make based on the number of cores
unamestr="$(uname)"
if [[ "$unamestr" == "Linux" ]]; then
PARALLEL=$(nproc)
elif [[ "$unamestr" == "Darwin" ]]; then
PARALLEL=$(sysctl -n hw.ncpu)
echo "Platform is macosx."
else
echo "Unrecognized platform."
exit 1
fi
echo "building arrow"
cd "$TP_DIR/arrow/cpp"
mkdir -p "$TP_DIR/arrow/cpp/build"
cd "$TP_DIR/arrow/cpp/build"
# Release build of Arrow with debug symbols (-g).
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-g" -DCMAKE_CXX_FLAGS="-g" ..
make VERBOSE=1 -j"$PARALLEL"

numbuf/thirdparty/download_thirdparty.sh vendored Executable file
View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -x
set -e
TP_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
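# Pin Arrow to a known-good commit of the pcmoritz fork.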
git clone https://github.com/pcmoritz/arrow.git "$TP_DIR/arrow"
cd "$TP_DIR/arrow"
git checkout 58bd7bedc63d66d5898297bab25b54dfb67665db

View file

@@ -0,0 +1,75 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{609D1438-D42D-4CBA-80A5-A1398C3BCC85}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Label="Configuration">
<PlatformToolset>v140</PlatformToolset>
<ConfigurationType>DynamicLibrary</ConfigurationType>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="$(SolutionDir)vsprojects\base.props" Condition="exists('$(SolutionDir)vsprojects\base.props')" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<TargetName>lib$(MSBuildProjectName)</TargetName>
<TargetExt>.pyd</TargetExt>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(THIRD_PARTY)arrow\cpp\src;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\cpp\src\numbuf\dict.h" />
<ClInclude Include="..\cpp\src\numbuf\sequence.h" />
<ClInclude Include="..\cpp\src\numbuf\tensor.h" />
<ClInclude Include="..\python\src\pynumbuf\adapters\numpy.h" />
<ClInclude Include="..\python\src\pynumbuf\adapters\python.h" />
<ClInclude Include="..\python\src\pynumbuf\adapters\scalars.h" />
<ClInclude Include="..\python\src\pynumbuf\memory.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\cpp\src\numbuf\dict.cc" />
<ClCompile Include="..\cpp\src\numbuf\sequence.cc" />
<ClCompile Include="..\cpp\src\numbuf\tensor.cc" />
<ClCompile Include="..\python\src\pynumbuf\adapters\numpy.cc" />
<ClCompile Include="..\python\src\pynumbuf\adapters\python.cc" />
<ClCompile Include="..\python\src\pynumbuf\numbuf.cc" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="$(THIRD_PARTY)arrow\vsprojects\arrow.vcxproj" Condition="exists('$(THIRD_PARTY)arrow\vsprojects\arrow.vcxproj')">
<Project>{10e7d8e8-0eeb-46ea-a58d-f9236b5960ad}</Project>
</ProjectReference>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
</Project>

View file

@@ -0,0 +1,60 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\cpp\src\numbuf\dict.cc">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\cpp\src\numbuf\sequence.cc">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\cpp\src\numbuf\tensor.cc">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\python\src\pynumbuf\adapters\numpy.cc">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\python\src\pynumbuf\adapters\python.cc">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\python\src\pynumbuf\numbuf.cc">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\cpp\src\numbuf\dict.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\cpp\src\numbuf\sequence.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\cpp\src\numbuf\tensor.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\python\src\pynumbuf\adapters\numpy.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\python\src\pynumbuf\adapters\python.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\python\src\pynumbuf\adapters\scalars.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\python\src\pynumbuf\memory.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>