Skip to content

Commit

Permalink
TensorFlow: upstream changes to git.
Browse files Browse the repository at this point in the history
Change 109321497
	Move all images to images directory to make docs versioning easier
	- adjust all paths in the docs to point to the new locations
	- remove some now redundant section-order tags added for the old website
Change 109317807
	Added a kernel op to compute the eigendecomposition of a self-adjoint matrix.

	Added a new kernel op called self_adjoint_eig (and a batch_self_adjoint_eig) that
	computes the eigendecomposition of a self-adjoint matrix. The return value is
	the concatenation of the eigenvalues as a row vector, and the eigenvectors.
Change 109310773
	Change `_read32()` in the MNIST input example to return an int.

	Currently we return a 1-D numpy array with 1 element. Numpy has
	recently deprecated the ability to treat this as a scalar, and as a
	result this tutorial fails. The fix returns the 0th element of the
	array instead.
Change 109301269
	Re-arrange TensorBoard demo files.
Change 109273589
	add ci_build for ci.tensorflow.org
Change 109260293
	Speed up the NodeDef -> OpKernel process by not spending time generating
	an error message for a missing "_kernel" attr, since that message is thrown away.
Change 109257179
	TensorFlow: make event_file_loader_test hermetic by using tempfile
	instead of fixed filenames.  Without this change, running
	event_file_loader_test twice in the same client (locally)
	causes it to fail, because it writes into the same file and appends
	another event, instead of starting from scratch.
Change 109256464
	Minor cleanup in TensorBoard server code
Change 109255382
	Change to reduce critical section times in gpu_event_mgr.h:
	(1) Call stream->ThenRecordEvent outside the EventMgr critical section
	(2) Do memory deallocation outside the critical section

	Speeds up one configuration of ptb_word_lm from 2924 words per
	second (wps) to 3278 wps on my desktop machine with a Titan X.
Change 109254843
	Fix use of uninitialized memory in test.
Change 109250995
	python_config.sh needs a license header

	Otherwise the license test fails.
Change 109249914
	add ci_build for ci.tensorflow.org
Change 109249397
	Fixes segfaults in reduce_sum (complex) on GPU.

	Fixes tensorflow#357

Change 109245652
	add ci_build for ci.tensorflow.org

Base CL: 109321563
  • Loading branch information
Vijay Vasudevan committed Dec 3, 2015
1 parent bb7a7a8 commit a4806a3
Show file tree
Hide file tree
Showing 64 changed files with 1,016 additions and 298 deletions.
4 changes: 3 additions & 1 deletion tensorflow/cc/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
# TensorFlow is a computational framework, primarily for use in machine
# learning applications.

package(default_visibility = ["//tensorflow:internal"])
package(
default_visibility = ["//tensorflow:internal"],
)

licenses(["notice"]) # Apache 2.0

Expand Down
4 changes: 3 additions & 1 deletion tensorflow/core/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
# TensorFlow is a computational framework, primarily for use in machine
# learning applications.

package(default_visibility = ["//tensorflow:internal"])
package(
default_visibility = ["//tensorflow:internal"],
)

package_group(name = "friends")

Expand Down
34 changes: 17 additions & 17 deletions tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,13 @@ EventMgr::~EventMgr() {
delete e;
}
while (!used_events_.empty()) {
InUse* ue = &used_events_[0];
delete ue->event;
delete ue->mem;
if (ue->bufrec.buf) {
ue->bufrec.alloc->DeallocateRaw(ue->bufrec.buf);
delete used_events_[0].event;
delete used_events_[0].mem;
if (used_events_[0].bufrec.buf) {
used_events_[0].bufrec.alloc->DeallocateRaw(used_events_[0].bufrec.buf);
}
if (ue->func != nullptr) threadpool_.Schedule(ue->func);
if (used_events_[0].func != nullptr)
threadpool_.Schedule(used_events_[0].func);
used_events_.pop_front();
}
}
Expand All @@ -60,17 +60,15 @@ EventMgr::~EventMgr() {
void EventMgr::PollLoop() {
while (!stop_polling_.HasBeenNotified()) {
Env::Default()->SleepForMicroseconds(1 * 1000);
ToFreeVector to_free;
{
mutex_lock l(mu_);
PollEvents(true, &to_free);
PollEvents(true);
}
FreeMemory(to_free);
}
polling_stopped_.Notify();
}

void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) {
void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) {
VLOG(2) << "QueueInUse free_events_ " << free_events_.size()
<< " used_events_ " << used_events_.size();
// Events are created on demand, and repeatedly reused. There is no
Expand All @@ -79,9 +77,10 @@ void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) {
free_events_.push_back(new gpu::Event(exec_));
free_events_.back()->Init();
}
*e = free_events_.back();
gpu::Event* e = free_events_.back();
free_events_.pop_back();
iu.event = *e;
stream->ThenRecordEvent(e);
iu.event = e;
used_events_.push_back(iu);
}

Expand All @@ -104,8 +103,7 @@ void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) {
// GPU memory use to spike needlessly. An alternative strategy would
// be to throttle new Op execution until the pending event queue
// clears.
void EventMgr::PollEvents(bool is_dedicated_poller,
gtl::InlinedVector<InUse, 4>* to_free) {
void EventMgr::PollEvents(bool is_dedicated_poller) {
VLOG(2) << "PollEvents free_events_ " << free_events_.size()
<< " used_events_ " << used_events_.size();
// Sweep the remaining events in order. If this is the dedicated
Expand All @@ -125,9 +123,11 @@ void EventMgr::PollEvents(bool is_dedicated_poller,
if (!is_dedicated_poller) return; // quit processing queue
break;
case gpu::Event::Status::kComplete:
// Make a copy of the InUse record so we can free it after releasing
// the lock
to_free->push_back(iu);
delete iu.mem;
if (iu.bufrec.buf) iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf);
// The function must be called in another thread, outside of
// the mutex held here.
if (iu.func != nullptr) threadpool_.Schedule(iu.func);
free_events_.push_back(iu.event);
// Mark this InUse record as completed.
iu.event = nullptr;
Expand Down
74 changes: 18 additions & 56 deletions tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,8 @@ limitations under the License.

#include <deque>
#include <vector>
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/port.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/public/tensor.h"
Expand Down Expand Up @@ -49,15 +47,9 @@ class EventMgr {
// currently enqueued on *stream have completed.
inline void ThenDeleteTensors(perftools::gputools::Stream* stream,
std::vector<Tensor>* tensors) {
ToFreeVector to_free;
::perftools::gputools::Event* e;
{
mutex_lock l(mu_);
QueueTensors(stream, tensors, &e);
PollEvents(false, &to_free);
}
stream->ThenRecordEvent(e);
FreeMemory(to_free);
mutex_lock l(mu_);
QueueTensors(stream, tensors);
PollEvents(false);
}

struct BufRec {
Expand All @@ -69,28 +61,16 @@ class EventMgr {
// on it as soon as all events currently enqueued on *stream have completed.
inline void ThenDeleteBuffer(perftools::gputools::Stream* stream,
BufRec bufrec) {
ToFreeVector to_free;
::perftools::gputools::Event* e;
{
mutex_lock l(mu_);
QueueBuffer(stream, bufrec, &e);
PollEvents(false, &to_free);
}
stream->ThenRecordEvent(e);
FreeMemory(to_free);
mutex_lock l(mu_);
QueueBuffer(stream, bufrec);
PollEvents(false);
}

inline void ThenExecute(perftools::gputools::Stream* stream,
std::function<void()> func) {
ToFreeVector to_free;
::perftools::gputools::Event* e;
{
mutex_lock l(mu_);
QueueFunc(stream, func, &e);
PollEvents(false, &to_free);
}
stream->ThenRecordEvent(e);
FreeMemory(to_free);
mutex_lock l(mu_);
QueueFunc(stream, func);
PollEvents(false);
}

private:
Expand All @@ -105,50 +85,32 @@ class EventMgr {
std::function<void()> func;
};

typedef gtl::InlinedVector<InUse, 4> ToFreeVector;

void FreeMemory(const ToFreeVector& to_free) {
for (const auto& iu : to_free) {
delete iu.mem;
if (iu.bufrec.buf) iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf);
// The function must be called in another thread.
if (iu.func != nullptr) threadpool_.Schedule(iu.func);
}
}

// Stream-enqueue an unused Event and save with it a collection of
// Tensors and/or a BufRec to be deleted only after the Event
// records.
void QueueInUse(perftools::gputools::Stream* stream, InUse in_use,
::perftools::gputools::Event** e)
void QueueInUse(perftools::gputools::Stream* stream, InUse in_use)
EXCLUSIVE_LOCKS_REQUIRED(mu_);

void QueueTensors(perftools::gputools::Stream* stream,
std::vector<Tensor>* tensors,
::perftools::gputools::Event** e)
std::vector<Tensor>* tensors)
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr}, e);
QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr});
}

void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec,
::perftools::gputools::Event** e)
void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec)
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr}, e);
QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr});
}

void QueueFunc(perftools::gputools::Stream* stream,
std::function<void()> func, ::perftools::gputools::Event** e)
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
QueueInUse(stream, {nullptr, nullptr, BufRec(), func}, e);
std::function<void()> func) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
QueueInUse(stream, {nullptr, nullptr, BufRec(), func});
}

// This function should be called at roughly the same tempo as
// QueueTensors() to check whether pending events have recorded,
// and then retire them. It appends InUse elements that need cleanup
// to "*to_free". The caller should call FreeMemory(to_free)
// when this returns.
void PollEvents(bool is_dedicated_poller, ToFreeVector* to_free)
EXCLUSIVE_LOCKS_REQUIRED(mu_);
// and then retire them.
void PollEvents(bool is_dedicated_poller) EXCLUSIVE_LOCKS_REQUIRED(mu_);

// An internal polling loop that runs at a low frequency to clear
// straggler Events.
Expand Down
16 changes: 4 additions & 12 deletions tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,13 @@ class TEST_EventMgrHelper {

void QueueTensors(perftools::gputools::Stream* stream,
std::vector<Tensor>* tensors) {
::perftools::gputools::Event* e;
{
mutex_lock l(em_->mu_);
em_->QueueTensors(stream, tensors, &e);
}
stream->ThenRecordEvent(e);
mutex_lock l(em_->mu_);
em_->QueueTensors(stream, tensors);
}

void PollEvents(bool is_dedicated_poller) {
EventMgr::ToFreeVector to_free;
{
mutex_lock l(em_->mu_);
em_->PollEvents(is_dedicated_poller, &to_free);
}
em_->FreeMemory(to_free);
mutex_lock l(em_->mu_);
em_->PollEvents(is_dedicated_poller);
}

private:
Expand Down
5 changes: 4 additions & 1 deletion tensorflow/core/framework/node_def_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,10 @@ Status AttrSlice::Find(const string& attr_name,
return Status::OK();
}
Status s = errors::NotFound("No attr named '", attr_name, "' in NodeDef:");
if (ndef_) {
// Skip AttachDef for internal attrs since it is a little bit
// expensive and it is common for them to correctly not be included
// in a NodeDef.
if (!StringPiece(attr_name).starts_with("_") && ndef_) {
s = AttachDef(s, *ndef_);
}
return s;
Expand Down
7 changes: 4 additions & 3 deletions tensorflow/core/kernels/cholesky_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class CholeskyOp
const int64 rows = input_matrix_shape.dim_size(0);
if (rows > (1LL << 20)) {
// A big number to cap the cost in case overflow.
return kint32max;
return kint64max;
} else {
return rows * rows * rows;
}
Expand All @@ -69,8 +69,9 @@ class CholeskyOp
// Perform the actual LL^T Cholesky decomposition. This will only use
// the lower triangular part of data_in by default. The upper triangular
// part of the matrix will not be read.
Eigen::LLT<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic,
Eigen::RowMajor>> llt_decomposition(input);
Eigen::LLT<
Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
llt_decomposition(input);

// Output the lower triangular in a dense form.
*output = llt_decomposition.matrixL();
Expand Down
2 changes: 1 addition & 1 deletion tensorflow/core/kernels/determinant_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class DeterminantOp
const int64 rows = input_matrix_shape.dim_size(0);
if (rows > (1LL << 20)) {
// A big number to cap the cost in case overflow.
return kint32max;
return kint64max;
} else {
return rows * rows * rows;
}
Expand Down
2 changes: 1 addition & 1 deletion tensorflow/core/kernels/matrix_inverse_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class MatrixInverseOp
const int64 rows = input_matrix_shape.dim_size(0);
if (rows > (1LL << 20)) {
// A big number to cap the cost in case overflow.
return kint32max;
return kint64max;
} else {
return rows * rows * rows;
}
Expand Down
5 changes: 4 additions & 1 deletion tensorflow/core/kernels/reduction_ops_sum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,10 @@ REGISTER_GPU_KERNELS(float);
#undef REGISTER_GPU_KERNELS

REGISTER_KERNEL_BUILDER(
Name("Sum").Device(DEVICE_GPU).TypeConstraint<complex64>("T"),
Name("Sum")
.Device(DEVICE_GPU)
.TypeConstraint<complex64>("T")
.HostMemory("reduction_indices"),
ReductionOp<GPUDevice, complex64, Eigen::internal::SumReducer<complex64>>);

#endif
Expand Down
Loading

0 comments on commit a4806a3

Please sign in to comment.