mirror of
https://github.com/vale981/ray
synced 2025-03-11 21:56:39 -04:00

* Worker reports error in previous task, actor task counter is incremented after task is successful * Refactor actor task execution - Return new task counter in GetTaskRequest - Update worker state for actor tasks inside of the actor method executor * Manually invoked checkpoint method * Scheduling for actor checkpoint methods * Fix python bugs in checkpointing * Return task success from worker to local scheduler instead of actor counter * Kill local schedulers halfway through actor execution instead of waiting for all tasks to execute once * Remove redundant actor tasks during dispatch, reconstruct missing dependencies for actor tasks * Make executor for temporary actor methods * doc * Set default argument for whether the previous task was a success * Refactor actor method call * Simplify checkpoint task submission * lint * fix philipp's comments * Add missing line * Make actor reconstruction tests run faster * Unimportant whitespace. * Unimportant whitespace. * Update checkpoint method signature * Documentation and handle exceptions during checkpoint save/resume * Rename get_task message field to actor_checkpoint_failed * Fix bug. * Remove debugging check, redirect test output
152 lines
5 KiB
C++
152 lines
5 KiB
C++
#ifndef LOCAL_SCHEDULER_CLIENT_H
|
|
#define LOCAL_SCHEDULER_CLIENT_H
|
|
|
|
#include "common/task.h"
|
|
#include "local_scheduler_shared.h"
|
|
|
|
struct LocalSchedulerConnection {
|
|
/** File descriptor of the Unix domain socket that connects to local
|
|
* scheduler. */
|
|
int conn;
|
|
/** The actor ID of this client. If this client is not an actor, then this
|
|
* should be NIL_ACTOR_ID. */
|
|
ActorID actor_id;
|
|
/** The IDs of the GPUs that this client can use. */
|
|
std::vector<int> gpu_ids;
|
|
};
|
|
|
|
/**
|
|
* Connect to the local scheduler.
|
|
*
|
|
* @param local_scheduler_socket The name of the socket to use to connect to the
|
|
* local scheduler.
|
|
* @param actor_id The ID of the actor running on this worker. If no actor is
|
|
* running on this actor, this should be NIL_ACTOR_ID.
|
|
* @param is_worker Whether this client is a worker. If it is a worker, an
|
|
* additional message will be sent to register as one.
|
|
* @param num_gpus The number of GPUs required by this worker. This is only
|
|
* used if the worker is an actor.
|
|
* @return The connection information.
|
|
*/
|
|
LocalSchedulerConnection *LocalSchedulerConnection_init(
|
|
const char *local_scheduler_socket,
|
|
UniqueID worker_id,
|
|
ActorID actor_id,
|
|
bool is_worker,
|
|
int64_t num_gpus);
|
|
|
|
/**
|
|
* Disconnect from the local scheduler.
|
|
*
|
|
* @param conn Local scheduler connection information returned by
|
|
* LocalSchedulerConnection_init.
|
|
* @return Void.
|
|
*/
|
|
void LocalSchedulerConnection_free(LocalSchedulerConnection *conn);
|
|
|
|
/**
|
|
* Submit a task to the local scheduler.
|
|
*
|
|
* @param conn The connection information.
|
|
* @param task The address of the task to submit.
|
|
* @return Void.
|
|
*/
|
|
void local_scheduler_submit(LocalSchedulerConnection *conn,
|
|
TaskSpec *task,
|
|
int64_t task_size);
|
|
|
|
/**
|
|
* Notify the local scheduler that this client is disconnecting gracefully. This
|
|
* is used by actors to exit gracefully so that the local scheduler doesn't
|
|
* propagate an error message to the driver.
|
|
*
|
|
* @param conn The connection information.
|
|
* @return Void.
|
|
*/
|
|
void local_scheduler_disconnect_client(LocalSchedulerConnection *conn);
|
|
|
|
/**
|
|
* Log an event to the event log. This will call RPUSH key value. We use RPUSH
|
|
* instead of SET so that it is possible to flush the log multiple times with
|
|
* the same key (for example the key might be shared across logging calls in the
|
|
* same task on a worker).
|
|
*
|
|
* @param conn The connection information.
|
|
* @param key The key to store the event in.
|
|
* @param key_length The length of the key.
|
|
* @param value The value to store.
|
|
* @param value_length The length of the value.
|
|
* @param timestamp The time that the event is logged.
|
|
* @return Void.
|
|
*/
|
|
void local_scheduler_log_event(LocalSchedulerConnection *conn,
|
|
uint8_t *key,
|
|
int64_t key_length,
|
|
uint8_t *value,
|
|
int64_t value_length,
|
|
double timestamp);
|
|
|
|
/**
|
|
* Get next task for this client. This will block until the scheduler assigns
|
|
* a task to this worker. This allocates and returns a task, and so the task
|
|
* must be freed by the caller.
|
|
*
|
|
* @todo When does this actually get freed?
|
|
*
|
|
* @param conn The connection information.
|
|
* @param task_size A pointer to fill out with the task size.
|
|
* @param actor_checkpoint_failed If the last task assigned was a checkpoint
|
|
* task that failed.
|
|
* @return The address of the assigned task.
|
|
*/
|
|
TaskSpec *local_scheduler_get_task(LocalSchedulerConnection *conn,
|
|
int64_t *task_size,
|
|
bool actor_checkpoint_failed);
|
|
|
|
/**
|
|
* Tell the local scheduler that the client has finished executing a task.
|
|
*
|
|
* @param conn The connection information.
|
|
* @return Void.
|
|
*/
|
|
void local_scheduler_task_done(LocalSchedulerConnection *conn);
|
|
|
|
/**
|
|
* Tell the local scheduler to reconstruct an object.
|
|
*
|
|
* @param conn The connection information.
|
|
* @param object_id The ID of the object to reconstruct.
|
|
* @return Void.
|
|
*/
|
|
void local_scheduler_reconstruct_object(LocalSchedulerConnection *conn,
|
|
ObjectID object_id);
|
|
|
|
/**
|
|
* Send a log message to the local scheduler.
|
|
*
|
|
* @param conn The connection information.
|
|
* @return Void.
|
|
*/
|
|
void local_scheduler_log_message(LocalSchedulerConnection *conn);
|
|
|
|
/**
|
|
* Notify the local scheduler that this client (worker) is no longer blocked.
|
|
*
|
|
* @param conn The connection information.
|
|
* @return Void.
|
|
*/
|
|
void local_scheduler_notify_unblocked(LocalSchedulerConnection *conn);
|
|
|
|
/**
|
|
* Record the mapping from object ID to task ID for put events.
|
|
*
|
|
* @param conn The connection information.
|
|
* @param task_id The ID of the task that called put.
|
|
* @param object_id The ID of the object being stored.
|
|
* @return Void.
|
|
*/
|
|
void local_scheduler_put_object(LocalSchedulerConnection *conn,
|
|
TaskID task_id,
|
|
ObjectID object_id);
|
|
|
|
#endif
|