-
Notifications
You must be signed in to change notification settings - Fork 148
Open
Description
With GPU enabled, TensorFlow freezes unless I force the "Discounted Monte-Carlo returns." computation onto the CPU. Adding a with tf.device("/cpu"): block inside discounted_return(reward, length, discount) seems to address the issue:
def discounted_return(reward, length, discount):
    """Compute discounted Monte-Carlo returns over length-masked rewards.

    Entries of ``reward`` whose index along axis 1 is at or beyond the
    matching entry of ``length`` are zeroed. The masked rewards are then
    accumulated backwards along axis 1 with ``cur + discount * agg``.

    Args:
      reward: Tensor indexed as ``[row, step]``; its static size along
        axis 1 must be known, since it is read via ``shape[1].value``.
      length: Tensor indexed as ``[row]``, compared against the step index
        to build the mask.
      discount: Multiplier applied to the running aggregate at each step.

    Returns:
      The accumulated returns, wrapped in ``tf.stop_gradient`` and
      ``tf.check_numerics``.
    """
    num_steps = reward.shape[1].value
    step_index = tf.range(num_steps)
    valid = tf.cast(step_index[None, :] < length[:, None], tf.float32)

    # The scan pipeline is pinned to the CPU; running it with GPU enabled
    # was reported to freeze TensorFlow.
    # NOTE(review): assumes only the scan pipeline belongs inside this device
    # scope and the final return sits outside it - confirm.
    with tf.device("/cpu"):
        masked_reward = valid * reward
        # Flip the step axis and move it to the front so tf.scan walks the
        # steps from last to first.
        reversed_steps_first = tf.transpose(
            tf.reverse(masked_reward, [1]), [1, 0])
        initial = tf.zeros_like(reward[:, -1])
        accumulated = tf.scan(
            lambda agg, cur: cur + discount * agg,
            reversed_steps_first,
            initial,
            parallel_iterations=1,
            back_prop=False)
        # Undo the transpose and the flip to restore the input layout.
        return_ = tf.reverse(tf.transpose(accumulated, [1, 0]), [1])

    return tf.check_numerics(tf.stop_gradient(return_), 'return')
I've seen this with TF 1.7, 1.11, 1.12, CUDA 8, 9, 10 and CUDA compute capability from 5.2 to 7.5.
I am not sure how to debug TensorFlow when it quietly freezes (or crashes). I tried the TensorFlow Debugger, but it doesn't really show where the freeze happens, and it also has gRPC issues. GDB shows that the process is at the following location, but with so many threads it is hard to tell whether this has any relevance:
#0 syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38
#1 0x00007f4902dde6db in nsync::nsync_mu_semaphore_p_with_deadline(nsync::nsync_semaphore_s_*, timespec) () from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#2 0x00007f4902dddcf9 in nsync::nsync_sem_wait_with_cancel_(nsync::waiter*, timespec, nsync::nsync_note_s_*) () from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#3 0x00007f4902ddb2bb in nsync::nsync_cv_wait_with_deadline_generic(nsync::nsync_cv_s_*, void*, void (*)(void*), void (*)(void*), timespec, nsync::nsync_note_s_*) () from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#4 0x00007f4902ddb793 in nsync::nsync_cv_wait_with_deadline(nsync::nsync_cv_s_*, nsync::nsync_mu_s_*, timespec, nsync::nsync_note_s_*) () from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#5 0x00007f490280594c in tensorflow::DirectSession::WaitForNotification(tensorflow::Notification*, long long) () from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#6 0x00007f490280599b in tensorflow::DirectSession::WaitForNotification(tensorflow::DirectSession::RunState*, tensorflow::CancellationManager*, long long) () from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#7 0x00007f490280c5fb in tensorflow::DirectSession::RunInternal(long long, tensorflow::RunOptions const&, tensorflow::CallFrameInterface*, tensorflow::DirectSession::ExecutorsAndKeys*, tensorflow::RunMetadata*) ()
from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#8 0x00007f4902815598 in tensorflow::DirectSession::Run(tensorflow::RunOptions const&, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor> > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >*, tensorflow::RunMetadata*) ()
from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#9 0x00007f48ff8afc8c in tensorflow::SessionRef::Run(tensorflow::RunOptions const&, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor> > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >*, tensorflow::RunMetadata*) ()
from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#10 0x00007f48ffaa49b4 in TF_Run_Helper(tensorflow::Session*, char const*, TF_Buffer const*, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor> > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, TF_Tensor**, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, TF_Buffer*, TF_Status*) ()
from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#11 0x00007f48ffaa57e6 in TF_SessionRun () from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#12 0x00007f48ff8acffd in tensorflow::TF_SessionRun_wrapper_helper(TF_Session*, char const*, TF_Buffer const*, std::vector<TF_Output, std::allocator<TF_Output> > const&, std::vector<_object*, std::allocator<_object*> > const&, std::vector<TF_Output, std::allocator<TF_Output> > const&, std::vector<TF_Operation*, std::allocator<TF_Operation*> > const&, TF_Buffer*, TF_Status*, std::vector<_object*, std::allocator<_object*> >*) () from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#13 0x00007f48ff8ad032 in tensorflow::TF_SessionRun_wrapper(TF_Session*, TF_Buffer const*, std::vector<TF_Output, std::allocator<TF_Output> > const&, std::vector<_object*, std::allocator<_object*> > const&, std::vector<TF_Output, std::allocator<TF_Output> > const&, std::vector<TF_Operation*, std::allocator<TF_Operation*> > const&, TF_Buffer*, TF_Status*, std::vector<_object*, std::allocator<_object*> >*) () from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#14 0x00007f48ff867d84 in _wrap_TF_SessionRun_wrapper () from /home/dmitry/.local/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
Metadata
Metadata
Assignees
Labels
No labels