From 49decce6f632a875aac11478505c7a818d0c3470 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Wed, 6 Jul 2016 00:25:02 -0700 Subject: [PATCH] print python backtrace in error message (#209) --- lib/python/ray/worker.py | 26 ++++++++++++++++++++++---- src/scheduler.cc | 2 +- test/runtest.py | 6 +++--- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/lib/python/ray/worker.py b/lib/python/ray/worker.py index 7a5865afb..c67113a8e 100644 --- a/lib/python/ray/worker.py +++ b/lib/python/ray/worker.py @@ -8,6 +8,7 @@ import funcsigs import numpy as np import colorama import copy +import traceback import ray import ray.config as config @@ -120,7 +121,7 @@ def print_failed_task(task_status): Error: Task failed Function Name: {} Task ID: {} - Error Message: {} + Error Message: \n{} """.format(task_status["function_name"], task_status["operationid"], task_status["error_message"]) # This is a helper method. It should not be called by users. @@ -266,6 +267,22 @@ def restart_workers_local(num_workers, worker_path, worker=global_worker): return False services.start_workers(worker.scheduler_address, worker.objstore_address, num_workers, worker_path) +def format_error_message(exception_message): + """ + This method takes an backtrace from an exception and makes it nicer by + removing a few uninformative lines and adding some space to indent the + remaining lines nicely. + + :param exception_message: a string generated by traceback.format_exc() + :rtype: a string + """ + lines = exception_message.split("\n") + # Remove lines 1, 2, 3, and 4, which are always the same, they just contain + # information about the main loop. + lines = lines[0:1] + lines[5:] + lines = [10 * " " + line for line in lines] + return "\n".join(lines) + def main_loop(worker=global_worker): if not ray.lib.connected(worker.handle): raise Exception("Worker is attempting to enter main_loop but has not been connected yet.") @@ -278,12 +295,13 @@ def main_loop(worker=global_worker): if len(return_objrefs) == 1: outputs = (outputs,) except Exception as e: + exception_message = format_error_message(traceback.format_exc()) # Here we are storing RayFailedObjects in the object store to indicate # failure (this is only interpreted by the worker). - failure_objects = [RayFailedObject(str(e)) for _ in range(len(return_objrefs))] + failure_objects = [RayFailedObject(exception_message) for _ in range(len(return_objrefs))] store_outputs_in_objstore(return_objrefs, failure_objects, worker) - ray.lib.notify_task_completed(worker.handle, False, str(e)) # notify the scheduler that the task threw an exception - logging.info("Worker through exception with message: {}, while running function {}.".format(str(e), func_name)) + ray.lib.notify_task_completed(worker.handle, False, exception_message) # notify the scheduler that the task threw an exception + logging.info("Worker threw exception with message: \n\n{}\n, while running function {}.".format(exception_message, func_name)) else: store_outputs_in_objstore(return_objrefs, outputs, worker) # store output in local object store ray.lib.notify_task_completed(worker.handle, True, "") # notify the scheduler that the task completed successfully diff --git a/src/scheduler.cc b/src/scheduler.cc index a7275dc3e..51fe4fc33 100644 --- a/src/scheduler.cc +++ b/src/scheduler.cc @@ -164,7 +164,7 @@ Status SchedulerService::ReadyForNewTask(ServerContext* context, const ReadyForN (*workers)[workerid].current_task = NO_OPERATION; // clear operation ID } if (!request->previous_task_info().task_succeeded()) { - RAY_LOG(RAY_INFO, "Error: Task " << info.operationid() << " executing function " << info.function_name() << " on worker " << workerid << " failed with error message: " << info.error_message()); + RAY_LOG(RAY_INFO, "Error: Task " << info.operationid() << " executing function " << info.function_name() << " on worker " << workerid << " failed with error message:\n" << info.error_message()); failed_tasks_.get()->push_back(info); } else { successful_tasks_.get()->push_back(info.operationid()); diff --git a/test/runtest.py b/test/runtest.py index 2421acabc..0bbf39587 100644 --- a/test/runtest.py +++ b/test/runtest.py @@ -289,7 +289,7 @@ class TaskStatusTest(unittest.TestCase): for task in result["failed_tasks"]: self.assertTrue(task.has_key("worker_address")) self.assertTrue(task.has_key("operationid")) - self.assertEqual(task.get("error_message"), "Test function 1 intentionally failed.") + self.assertTrue("Test function 1 intentionally failed." in task.get("error_message")) self.assertTrue(task["operationid"] not in task_ids) task_ids.add(task["operationid"]) @@ -297,7 +297,7 @@ class TaskStatusTest(unittest.TestCase): try: ray.get(x) except Exception as e: - self.assertEqual(str(e), "The task that created this object reference failed with error message: Test function 2 intentionally failed.") + self.assertTrue("Test function 2 intentionally failed."in str(e)) else: self.assertTrue(False) # ray.get should throw an exception @@ -306,7 +306,7 @@ class TaskStatusTest(unittest.TestCase): try: ray.get(ref) except Exception as e: - self.assertEqual(str(e), "The task that created this object reference failed with error message: Test function 3 intentionally failed.") + self.assertTrue("Test function 3 intentionally failed."in str(e)) else: self.assertTrue(False) # ray.get should throw an exception