parallelize numbuf memcpy and plasma object hash construction (#366)
* parallelizing memcopy and object hash construction in numbuf/plasma
* clang format
* whitespace
* refactoring compute object hash: get rid of the prefix chunk
* clang format
* Document performance optimization.
* Remove check for 64-byte alignment, since it may not be guaranteed.
parent ba02fc0eb0
commit a3d58607bf
6 changed files with 193 additions and 21 deletions
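In summary: the diff below adds a ParallelMemcopy helper to numbuf that splits large copies across a small thread pool, appends -lpthread to the affected link lines, and parallelizes the plasma client's XXH64 object hash by hashing 64-byte-aligned chunks on worker threads and then hashing the per-chunk digests. Buffers under 1 MB are copied and hashed on the calling thread, and the store now allocates objects with dlmemalign so the data starts on a 64-byte boundary.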
@@ -68,15 +68,15 @@ if(APPLE)
 endif(APPLE)
 
 if(APPLE)
-  target_link_libraries(numbuf ${ARROW_LIB} ${ARROW_IO_LIB} ${ARROW_IPC_LIB} ${PYTHON_LIBRARIES})
+  target_link_libraries(numbuf ${ARROW_LIB} ${ARROW_IO_LIB} ${ARROW_IPC_LIB} ${PYTHON_LIBRARIES} -lpthread)
 else()
-  target_link_libraries(numbuf -Wl,--whole-archive ${ARROW_LIB} -Wl,--no-whole-archive ${ARROW_IO_LIB} ${ARROW_IPC_LIB} ${PYTHON_LIBRARIES})
+  target_link_libraries(numbuf -Wl,--whole-archive ${ARROW_LIB} -Wl,--no-whole-archive ${ARROW_IO_LIB} ${ARROW_IPC_LIB} ${PYTHON_LIBRARIES} -lpthread)
 endif()
 
 if(HAS_PLASMA)
-  target_link_libraries(numbuf ${ARROW_LIB} ${ARROW_IO_LIB} ${ARROW_IPC_LIB} ${PYTHON_LIBRARIES} plasma_lib common ${FLATBUFFERS_STATIC_LIB})
+  target_link_libraries(numbuf ${ARROW_LIB} ${ARROW_IO_LIB} ${ARROW_IPC_LIB} ${PYTHON_LIBRARIES} plasma_lib common ${FLATBUFFERS_STATIC_LIB} -lpthread)
 else()
-  target_link_libraries(numbuf ${ARROW_LIB} ${ARROW_IO_LIB} ${ARROW_IPC_LIB} ${PYTHON_LIBRARIES})
+  target_link_libraries(numbuf ${ARROW_LIB} ${ARROW_IO_LIB} ${ARROW_IPC_LIB} ${PYTHON_LIBRARIES} -lpthread)
 endif()
 
 install(TARGETS numbuf DESTINATION ${CMAKE_SOURCE_DIR}/numbuf/)
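The -lpthread additions here (and in the plasma CMakeLists.txt below) are what make the new threaded code link: std::thread is implemented on top of pthreads on Linux, so every target that now spawns threads must pull in libpthread.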
@@ -3,15 +3,113 @@
 #include <arrow/io/interfaces.h>
 
+/* C++ includes */
+#include <string>
+#include <thread>
+#include <vector>
+
+#define THREADPOOL_SIZE 8
+#define MEMCOPY_BLOCK_SIZE 64
+#define BYTES_IN_MB (1 << 20)
+
+using namespace std;
+
 namespace numbuf {
 
+class ParallelMemcopy {
+ public:
+  explicit ParallelMemcopy(uint64_t block_size, bool timeit, int threadpool_size)
+      : timeit_(timeit),
+        threadpool_(threadpool_size),
+        block_size_(block_size),
+        threadpool_size_(threadpool_size) {}
+
+  void memcopy(uint8_t* dst, const uint8_t* src, uint64_t nbytes) {
+    struct timeval tv1, tv2;
+    if (timeit_) {
+      // Start the timer.
+      gettimeofday(&tv1, NULL);
+    }
+    if (nbytes >= BYTES_IN_MB) {
+      memcopy_aligned(dst, src, nbytes, block_size_);
+    } else {
+      memcpy(dst, src, nbytes);
+    }
+    if (timeit_) {
+      // Stop the timer and log the measured time.
+      gettimeofday(&tv2, NULL);
+      double elapsed =
+          ((tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec)) / 1000000.0;
+      // TODO: replace this with ARROW_LOG(ARROW_INFO) or better equivalent.
+      printf("Copied %llu bytes in time = %8.4f MBps = %8.4f\n", nbytes, elapsed,
+          nbytes / (BYTES_IN_MB * elapsed));
+    }
+  }
+
+  ~ParallelMemcopy() {
+    // Join threadpool threads just in case they are still running.
+    for (auto& t : threadpool_) {
+      if (t.joinable()) { t.join(); }
+    }
+  }
+
+ private:
+  /** Controls whether the memcopy operations are timed. */
+  bool timeit_;
+  /** Specifies the desired alignment in bytes, as a power of 2. */
+  uint64_t block_size_;
+  /** Number of threads to be used for parallel memcopy operations. */
+  int threadpool_size_;
+  /** Internal threadpool to be used in the fork/join pattern. */
+  std::vector<std::thread> threadpool_;
+
+  void memcopy_aligned(
+      uint8_t* dst, const uint8_t* src, uint64_t nbytes, uint64_t block_size) {
+    uint64_t num_threads = threadpool_size_;
+    uint64_t dst_address = reinterpret_cast<uint64_t>(dst);
+    uint64_t src_address = reinterpret_cast<uint64_t>(src);
+    uint64_t left_address = (src_address + block_size - 1) & ~(block_size - 1);
+    uint64_t right_address = (src_address + nbytes) & ~(block_size - 1);
+    uint64_t num_blocks = (right_address - left_address) / block_size;
+    // Update right address.
+    right_address = right_address - (num_blocks % num_threads) * block_size;
+    // Now we divide these blocks between available threads. The remainder is
+    // handled on the main thread.
+    uint64_t chunk_size = (right_address - left_address) / num_threads;
+    uint64_t prefix = left_address - src_address;
+    uint64_t suffix = src_address + nbytes - right_address;
+    // Now the data layout is | prefix | k * num_threads * block_size | suffix |.
+    // We have chunk_size = k * block_size, therefore the data layout is
+    // | prefix | num_threads * chunk_size | suffix |.
+    // Each thread gets a "chunk" of k blocks.
+
+    // Start all threads first and handle leftovers while threads run.
+    for (int i = 0; i < num_threads; i++) {
+      threadpool_[i] = std::thread(memcpy, dst + prefix + i * chunk_size,
+          reinterpret_cast<uint8_t*>(left_address) + i * chunk_size, chunk_size);
+    }
+
+    memcpy(dst, src, prefix);
+    memcpy(dst + prefix + num_threads * chunk_size,
+        reinterpret_cast<uint8_t*>(right_address), suffix);
+
+    for (auto& t : threadpool_) {
+      if (t.joinable()) { t.join(); }
+    }
+  }
+};
+
 class FixedBufferStream : public arrow::io::OutputStream,
                           public arrow::io::ReadableFileInterface {
  public:
   virtual ~FixedBufferStream() {}
 
   explicit FixedBufferStream(uint8_t* data, int64_t nbytes)
-      : data_(data), position_(0), size_(nbytes) {}
+      : data_(data),
+        position_(0),
+        size_(nbytes),
+        memcopy_helper(MEMCOPY_BLOCK_SIZE, false, THREADPOOL_SIZE) {}
 
   arrow::Status Read(int64_t nbytes, std::shared_ptr<arrow::Buffer>* out) override {
     DCHECK(out);

@@ -44,7 +142,7 @@ class FixedBufferStream : public arrow::io::OutputStream,
     DCHECK(position_ + nbytes <= size_) << "position: " << position_
                                         << " nbytes: " << nbytes << "size: " << size_;
     uint8_t* dst = data_ + position_;
-    memcpy(dst, data, nbytes);
+    memcopy_helper.memcopy(dst, data, nbytes);
     position_ += nbytes;
     return arrow::Status::OK();
   }

@@ -60,6 +158,7 @@ class FixedBufferStream : public arrow::io::OutputStream,
   uint8_t* data_;
   int64_t position_;
   int64_t size_;
+  ParallelMemcopy memcopy_helper;
 };
 
 class MockBufferStream : public arrow::io::OutputStream {
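The bit arithmetic in memcopy_aligned is easiest to verify with concrete numbers. The following standalone sketch (illustrative values, not the shipped code) reproduces the partition math: round the source start up and the end down to a block_size boundary, trim the aligned region so the block count is divisible by the thread count, and check that prefix + num_threads * chunk_size + suffix tiles the buffer exactly.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Minimal sketch of ParallelMemcopy's partition arithmetic.
    int main() {
      const uint64_t block_size = 64;     // MEMCOPY_BLOCK_SIZE in the diff
      const uint64_t num_threads = 8;     // THREADPOOL_SIZE in the diff
      const uint64_t src_address = 1000;  // hypothetical unaligned source address
      const uint64_t nbytes = 5000000;    // 5 MB, above the 1 MB threshold

      // Round the start up and the end down to a block_size boundary.
      uint64_t left_address = (src_address + block_size - 1) & ~(block_size - 1);
      uint64_t right_address = (src_address + nbytes) & ~(block_size - 1);
      uint64_t num_blocks = (right_address - left_address) / block_size;

      // Shrink the aligned region so the blocks divide evenly among threads.
      right_address -= (num_blocks % num_threads) * block_size;

      uint64_t chunk_size = (right_address - left_address) / num_threads;
      uint64_t prefix = left_address - src_address;            // main thread
      uint64_t suffix = src_address + nbytes - right_address;  // main thread

      // The three pieces exactly tile the buffer.
      assert(prefix + num_threads * chunk_size + suffix == nbytes);
      printf("prefix=%llu chunk=%llu suffix=%llu\n",
          (unsigned long long) prefix, (unsigned long long) chunk_size,
          (unsigned long long) suffix);
      return 0;
    }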
@@ -57,9 +57,9 @@ if(APPLE)
 endif(APPLE)
 
 if(APPLE)
-  target_link_libraries(plasma -Wl,-force_load,${FLATBUFFERS_STATIC_LIB} common ${PYTHON_LIBRARIES} ${FLATBUFFERS_STATIC_LIB})
+  target_link_libraries(plasma -Wl,-force_load,${FLATBUFFERS_STATIC_LIB} common ${PYTHON_LIBRARIES} ${FLATBUFFERS_STATIC_LIB} -lpthread)
 else(APPLE)
-  target_link_libraries(plasma -Wl,--whole-archive ${FLATBUFFERS_STATIC_LIB} -Wl,--no-whole-archive common ${PYTHON_LIBRARIES} ${FLATBUFFERS_STATIC_LIB})
+  target_link_libraries(plasma -Wl,--whole-archive ${FLATBUFFERS_STATIC_LIB} -Wl,--no-whole-archive common ${PYTHON_LIBRARIES} ${FLATBUFFERS_STATIC_LIB} -lpthread)
 endif(APPLE)
 
 include_directories("${FLATBUFFERS_INCLUDE_DIR}")

@@ -87,7 +87,7 @@ add_library(plasma_lib STATIC
   fling.c
   thirdparty/xxhash.c)
 
-target_link_libraries(plasma_lib common ${FLATBUFFERS_STATIC_LIB})
+target_link_libraries(plasma_lib common ${FLATBUFFERS_STATIC_LIB} -lpthread)
 add_dependencies(plasma_lib gen_plasma_fbs)
 
 add_dependencies(plasma protocol_fbs)
@@ -16,6 +16,9 @@
 #include "utarray.h"
 #include "uthash.h"
 
+/** Allocation granularity used in plasma for object allocation. */
+#define BLOCK_SIZE 64
+
 /**
  * Object request data structure. Used in the plasma_wait_for_objects()
  * argument.
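BLOCK_SIZE does double duty in this commit: it is the granularity at which compute_object_hash_parallel chunks an object's data, and the alignment passed to dlmemalign when the store allocates the object.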
@@ -28,6 +28,10 @@
 #include "uthash.h"
 #include "utlist.h"
 
+/* C++ includes */
+#include <vector>
+#include <thread>
+
 extern "C" {
 #include "sha256.h"
 #include "fling.h"

@@ -38,6 +42,10 @@ extern "C" {
 #define XXH64_DEFAULT_SEED 0
 }
 
+#define THREADPOOL_SIZE 8
+#define BYTES_IN_MB (1 << 20)
+static std::vector<std::thread> threadpool_(THREADPOOL_SIZE);
+
 typedef struct {
   /** Key that uniquely identifies the memory mapped file. In practice, we
    * take the numerical value of the file descriptor in the object store. */
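Note that the file-scope threadpool_ vector holds default-constructed, not-yet-started threads; compute_object_hash_parallel below assigns a real thread into each slot on every call and joins them all before returning.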
@@ -465,6 +473,66 @@ void plasma_contains(PlasmaConnection *conn, ObjectID obj_id, int *has_object) {
   }
 }
 
+static void compute_block_hash(const unsigned char *data,
+                               int64_t nbytes,
+                               uint64_t *hash) {
+  XXH64_state_t hash_state;
+  XXH64_reset(&hash_state, XXH64_DEFAULT_SEED);
+  XXH64_update(&hash_state, data, nbytes);
+  *hash = XXH64_digest(&hash_state);
+}
+
+static inline bool compute_object_hash_parallel(XXH64_state_t *hash_state,
+                                                const unsigned char *data,
+                                                int64_t nbytes) {
+  /* Note that this function will likely be faster if the address of data is
+   * aligned on a 64-byte boundary. */
+  const uint64_t num_threads = THREADPOOL_SIZE;
+  uint64_t threadhash[num_threads + 1];
+  const uint64_t data_address = reinterpret_cast<uint64_t>(data);
+  const uint64_t num_blocks = nbytes / BLOCK_SIZE;
+  const uint64_t chunk_size = (num_blocks / num_threads) * BLOCK_SIZE;
+  const uint64_t right_address = data_address + chunk_size * num_threads;
+  const uint64_t suffix = (data_address + nbytes) - right_address;
+  /* Now the data layout is | k * num_threads * block_size | suffix | ==
+   * | num_threads * chunk_size | suffix |, where chunk_size = k * block_size.
+   * Each thread gets a "chunk" of k blocks, except the suffix thread. */
+
+  for (int i = 0; i < num_threads; i++) {
+    threadpool_[i] =
+        std::thread(compute_block_hash,
+                    reinterpret_cast<uint8_t *>(data_address) + i * chunk_size,
+                    chunk_size, &threadhash[i]);
+  }
+  compute_block_hash(reinterpret_cast<uint8_t *>(right_address), suffix,
+                     &threadhash[num_threads]);
+
+  /* Join the threads. */
+  for (auto &t : threadpool_) {
+    if (t.joinable()) {
+      t.join();
+    }
+  }
+
+  XXH64_update(hash_state, (unsigned char *) threadhash, sizeof(threadhash));
+  return true;
+}
+
+static uint64_t compute_object_hash(const ObjectBuffer &obj_buffer) {
+  XXH64_state_t hash_state;
+  XXH64_reset(&hash_state, XXH64_DEFAULT_SEED);
+  if (obj_buffer.data_size >= BYTES_IN_MB) {
+    compute_object_hash_parallel(&hash_state, (unsigned char *) obj_buffer.data,
+                                 obj_buffer.data_size);
+  } else {
+    XXH64_update(&hash_state, (unsigned char *) obj_buffer.data,
+                 obj_buffer.data_size);
+  }
+  XXH64_update(&hash_state, (unsigned char *) obj_buffer.metadata,
+               obj_buffer.metadata_size);
+  return XXH64_digest(&hash_state);
+}
+
 bool plasma_compute_object_hash(PlasmaConnection *conn,
                                 ObjectID obj_id,
                                 unsigned char *digest) {
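One detail of this scheme is worth spelling out: the digest of a large object is a hash of per-chunk hashes, not a hash of the raw bytes, so it will not match a serial XXH64 over the same data. It is only self-consistent, meaning every client must use the same THREADPOOL_SIZE, BLOCK_SIZE, and 1 MB cutoff to agree on a digest. The sketch below demonstrates the two-level construction, assuming the xxHash single-header library (thirdparty/xxhash.h here) is on the include path; the buffer size and seed are illustrative.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include "xxhash.h"  /* assumed available, as in plasma's thirdparty/ */

    int main() {
      /* 8 worker chunks of one 64-byte block each (illustrative sizes). */
      unsigned char data[8 * 64];
      memset(data, 0x5a, sizeof(data));

      /* Level 1: hash each chunk independently; the diff does this on
       * worker threads via compute_block_hash. */
      uint64_t threadhash[8];
      for (int i = 0; i < 8; i++) {
        threadhash[i] = XXH64(data + i * 64, 64, 0 /* XXH64_DEFAULT_SEED */);
      }

      /* Level 2: hash the array of per-chunk digests. */
      uint64_t combined = XXH64(threadhash, sizeof(threadhash), 0);

      /* A serial hash of the same bytes gives a different value. */
      uint64_t serial = XXH64(data, sizeof(data), 0);
      printf("combined=%016llx serial=%016llx\n",
          (unsigned long long) combined, (unsigned long long) serial);
      return 0;
    }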
@@ -472,21 +540,15 @@ bool plasma_compute_object_hash(PlasmaConnection *conn,
    * the operation should timeout immediately. */
   ObjectBuffer obj_buffer;
   ObjectID obj_id_array[1] = {obj_id};
+  uint64_t hash;
+
   plasma_get(conn, obj_id_array, 1, 0, &obj_buffer);
   /* If the object was not retrieved, return false. */
   if (obj_buffer.data_size == -1) {
     return false;
   }
   /* Compute the hash. */
-  XXH64_state_t hash_state;
-  XXH64_reset(&hash_state, XXH64_DEFAULT_SEED);
-  XXH64_update(&hash_state, (unsigned char *) obj_buffer.data,
-               obj_buffer.data_size);
-  XXH64_update(&hash_state, (unsigned char *) obj_buffer.metadata,
-               obj_buffer.metadata_size);
-  uint64_t hash = XXH64_digest(&hash_state);
-  DCHECK(DIGEST_SIZE >= sizeof(hash));
-  memset(digest, 0, DIGEST_SIZE);
+  hash = compute_object_hash(obj_buffer);
   memcpy(digest, &hash, sizeof(hash));
   /* Release the plasma object. */
   plasma_release(conn, obj_id);
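The refactor above moves all hashing into compute_object_hash, so plasma_compute_object_hash is now only responsible for fetching the object, copying the 64-bit hash into the digest buffer, and releasing the object. Note that the memset of the digest buffer was dropped, so bytes beyond sizeof(hash) are no longer zeroed.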
@@ -505,7 +567,7 @@ void plasma_seal(PlasmaConnection *conn, ObjectID object_id) {
          "Plasma client called seal an already sealed object");
   object_entry->is_sealed = true;
   /* Send the seal request to Plasma. */
-  unsigned char digest[DIGEST_SIZE];
+  static unsigned char digest[DIGEST_SIZE];
   CHECK(plasma_compute_object_hash(conn, object_id, &digest[0]));
   CHECK(plasma_send_SealRequest(conn->store_conn, conn->builder, object_id,
                                 &digest[0]) >= 0);
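A side effect of the static qualifier: the digest buffer is now shared across all calls, so this version of plasma_seal is not reentrant and must not be called from multiple threads at once.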
@@ -40,6 +40,7 @@ extern "C" {
 #include "fling.h"
 #include "malloc.h"
 void *dlmalloc(size_t);
+void *dlmemalign(size_t alignment, size_t bytes);
 void dlfree(void *);
 }
 
@@ -230,8 +231,15 @@ int create_object(Client *client_context,
   if (!success) {
     return PlasmaError_OutOfMemory;
   }
-  /* Allocate space for the new object */
-  uint8_t *pointer = (uint8_t *) dlmalloc(data_size + metadata_size);
+  /* Allocate space for the new object. We use dlmemalign instead of dlmalloc in
+   * order to align the allocated region to a 64-byte boundary. This is not
+   * strictly necessary, but it is an optimization that could speed up the
+   * computation of a hash of the data (see compute_object_hash_parallel in
+   * plasma_client.cc). Note that even though this pointer is 64-byte aligned,
+   * it is not guaranteed that the corresponding pointer in the client will be
+   * 64-byte aligned, but in practice it often will be. */
+  uint8_t *pointer =
+      (uint8_t *) dlmemalign(BLOCK_SIZE, data_size + metadata_size);
   int fd;
   int64_t map_size;
   ptrdiff_t offset;
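dlmemalign is dlmalloc's aligned-allocation entry point; requesting BLOCK_SIZE (64-byte) alignment lines the object's data up with cache lines and with the hash chunking above. Per the commit message, the client-side check for 64-byte alignment was removed because the mapping in the client process is not guaranteed to preserve it. A quick standalone way to test a pointer's alignment, using posix_memalign as a stand-in for dlmemalign (illustrative, not part of the diff):

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    int main() {
      void *p = nullptr;
      /* 64 = BLOCK_SIZE in the diff; posix_memalign has the same contract
       * as dlmemalign (power-of-two alignment). */
      if (posix_memalign(&p, 64, 1 << 20) != 0) { return 1; }
      bool aligned = (reinterpret_cast<uintptr_t>(p) & 63) == 0;
      printf("%p is 64-byte aligned: %s\n", p, aligned ? "yes" : "no");
      free(p);
      return 0;
    }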