Skip to content

Commit

Permalink
fix memory-related issues to enable ASAN tests (apache#14223)
Browse files Browse the repository at this point in the history
* fix heap overflow

* fix memory leak of optimizer and executer

* uncomment memory pool free

* run cleanup in engine shutdown phase

* make asan tests blocking

* fix abort in mxnet shutdown, use forked submodules temporally for tests

* trigger CI

* change submodule mshadow

* change submodule dmlc-core
  • Loading branch information
arcadiaphy authored and haohuw committed Jun 23, 2019
1 parent 7b12205 commit 01197e6
Show file tree
Hide file tree
Showing 20 changed files with 45 additions and 17 deletions.
2 changes: 1 addition & 1 deletion 3rdparty/mshadow
2 changes: 0 additions & 2 deletions ci/docker/runtime_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1033,8 +1033,6 @@ integrationtest_ubuntu_cpu_asan() {
set -ex
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.5

# We do not want to fail the build on ASAN errors until memory leaks have been addressed.
export ASAN_OPTIONS=exitcode=0
cd /work/mxnet/build/cpp-package/example/
/work/mxnet/cpp-package/example/get_data.sh
./mlp_cpu
Expand Down
1 change: 1 addition & 0 deletions cpp-package/example/alexnet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ int main(int argc, char const *argv[]) {
}
/*don't foget to release the executor*/
delete exec;
delete opt;
MXNotifyShutdown();
return 0;
}
10 changes: 10 additions & 0 deletions cpp-package/example/charRNN.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,9 @@ void train(const std::string file, int batch_size, int max_epoch, int start_epoc
std::string filepath = prefix + "-" + std::to_string(epoch) + ".params";
SaveCheckpoint(filepath, RNN, exe);
}

delete exe;
delete opt;
}

/*The original example, rnn_cell_demo.py, uses default Xavier as initalizer, which relies on
Expand Down Expand Up @@ -577,6 +580,9 @@ void trainWithBuiltInRNNOp(const std::string file, int batch_size, int max_epoch
std::string filepath = prefix + "-" + std::to_string(epoch) + ".params";
SaveCheckpoint(filepath, RNN, exe);
}

delete exe;
delete opt;
}

void predict(std::wstring* ptext, int sequence_length, const std::string param_file,
Expand Down Expand Up @@ -639,6 +645,8 @@ void predict(std::wstring* ptext, int sequence_length, const std::string param_f
next = charIndices[n];
ptext->push_back(next);
}

delete exe;
}

void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std::string param_file,
Expand Down Expand Up @@ -693,6 +701,8 @@ void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std
next = charIndices[n];
ptext->push_back(next);
}

delete exe;
}

int main(int argc, char** argv) {
Expand Down
1 change: 1 addition & 0 deletions cpp-package/example/googlenet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ int main(int argc, char const *argv[]) {
}

delete exec;
delete opt;
MXNotifyShutdown();
return 0;
}
1 change: 1 addition & 0 deletions cpp-package/example/inception_bn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ int main(int argc, char const *argv[]) {
LG << "Validation Accuracy: " << val_acc.Get();
}
delete exec;
delete opt;
MXNotifyShutdown();
return 0;
}
1 change: 1 addition & 0 deletions cpp-package/example/lenet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ class Lenet {
<< ", accuracy: " << ValAccuracy(batch_size * 10, lenet);
}
delete exe;
delete opt;
}

private:
Expand Down
1 change: 1 addition & 0 deletions cpp-package/example/lenet_with_mxdataiter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ int main(int argc, char const *argv[]) {
}

delete exec;
delete opt;
MXNotifyShutdown();
return 0;
}
1 change: 1 addition & 0 deletions cpp-package/example/mlp_cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ int main(int argc, char** argv) {
}

delete exec;
delete opt;
MXNotifyShutdown();
return 0;
}
1 change: 1 addition & 0 deletions cpp-package/example/mlp_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ int main(int argc, char** argv) {
}

delete exec;
delete opt;
MXNotifyShutdown();
return 0;
}
1 change: 1 addition & 0 deletions cpp-package/example/mlp_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ int main(int argc, char** argv) {
}

delete exec;
delete opt;
MXNotifyShutdown();
return 0;
}
1 change: 1 addition & 0 deletions cpp-package/example/resnet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ int main(int argc, char const *argv[]) {
LG << "Validation Accuracy: " << val_acc.Get();
}
delete exec;
delete opt;
MXNotifyShutdown();
return 0;
}
1 change: 1 addition & 0 deletions cpp-package/example/test_optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ int main(int argc, char** argv) {
opt = OptimizerRegistry::Find("adam");
int ret = (opt == 0) ? 1 : 0;

delete opt;
MXNotifyShutdown();
return ret;
}
1 change: 1 addition & 0 deletions cpp-package/example/test_score.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ int main(int argc, char** argv) {
}

delete exec;
delete opt;
MXNotifyShutdown();
return score >= MIN_SCORE ? 0 : 1;
}
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ class BufferFile {
ifs.seekg(0, std::ios::beg);
std::cout << file_path.c_str() << " ... " << length_ << " bytes\n";

buffer_.reset(new char[length_]);
// Buffer as null terminated to be converted to string
buffer_.reset(new char[length_ + 1]);
buffer_[length_] = 0;
ifs.read(buffer_.get(), length_);
ifs.close();
}
Expand Down
19 changes: 13 additions & 6 deletions include/mxnet/ndarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -848,13 +848,17 @@ class NDArray {
// The shape of aux data. The default value for the shape depends on the type of storage.
// If aux_shapes[i].Size() is zero, aux data i is empty.
mxnet::ShapeVector aux_shapes;
/*! \brief Reference to the storage to ensure proper destruct order */
std::shared_ptr<Storage> storage_ref_;

/*! \brief default cosntructor */
Chunk() : static_data(true), delay_alloc(false) {}
Chunk() : static_data(true), delay_alloc(false),
storage_ref_(Storage::_GetSharedRef()) {}

/*! \brief construct a new chunk */
Chunk(mxnet::TShape shape, Context ctx_, bool delay_alloc_, int dtype)
: static_data(false), delay_alloc(true), ctx(ctx_) {
: static_data(false), delay_alloc(true), ctx(ctx_),
storage_ref_(Storage::_GetSharedRef()) {
auto size = shape.Size();
storage_shape = shape;
var = Engine::Get()->NewVariable();
Expand All @@ -864,7 +868,8 @@ class NDArray {
}

Chunk(const TBlob &data, int dev_id)
: static_data(true), delay_alloc(false) {
: static_data(true), delay_alloc(false),
storage_ref_(Storage::_GetSharedRef()) {
CHECK(storage_type == kDefaultStorage);
var = Engine::Get()->NewVariable();
if (data.dev_mask() == cpu::kDevMask) {
Expand All @@ -881,7 +886,8 @@ class NDArray {
}

Chunk(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype)
: static_data(false), delay_alloc(false) {
: static_data(false), delay_alloc(false),
storage_ref_(Storage::_GetSharedRef()) {
var = Engine::Get()->NewVariable();
ctx = Context::CPUShared(0);
shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
Expand All @@ -897,7 +903,7 @@ class NDArray {
const mxnet::ShapeVector &aux_shapes_)
: static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_),
aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_),
aux_shapes(aux_shapes_) {
aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()) {
shandle.ctx = ctx;
var = Engine::Get()->NewVariable();
// aux_handles always reflect the correct number of aux data
Expand All @@ -914,7 +920,8 @@ class NDArray {

Chunk(const NDArrayStorageType storage_type_, const TBlob &data,
const std::vector<TBlob> &aux_data, int dev_id)
: static_data(true), delay_alloc(false), storage_type(storage_type_) {
: static_data(true), delay_alloc(false), storage_type(storage_type_),
storage_ref_(Storage::_GetSharedRef()) {
using namespace mshadow;
CHECK_NE(storage_type, kDefaultStorage);
// init var
Expand Down
7 changes: 3 additions & 4 deletions src/common/object_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,9 @@ struct ObjectPoolAllocatable {

template <typename T>
ObjectPool<T>::~ObjectPool() {
// TODO(hotpxl): mind destruction order
// for (auto i : allocated_) {
// free(i);
// }
for (auto i : allocated_) {
free(i);
}
}

template <typename T>
Expand Down
2 changes: 1 addition & 1 deletion src/engine/threaded_engine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ void ThreadedEngine::DeleteOperator(OprHandle op) {
this->PushAsync([threaded_opr](RunContext, CallbackOnComplete on_complete) {
ThreadedOpr::Delete(threaded_opr);
on_complete();
}, Context::CPU(), {}, deps, FnProperty::kAsync, 0,
}, Context::CPU(), {}, deps, FnProperty::kDeleteVar, 0,
"DeleteOperator");
}

Expand Down
3 changes: 2 additions & 1 deletion src/engine/threaded_engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,8 @@ class ThreadedEngine : public Engine {
LOG(INFO) << "ExecuteOprBlock " << opr_block
<< "shutdown_phase=" << shutdown_phase_;
}
if (!shutdown_phase_) {
// still run cleanup in shutdown_phase
if (!shutdown_phase_ || threaded_opr->prop == FnProperty::kDeleteVar) {
try {
OnStart(threaded_opr);
if (debug_info) {
Expand Down

0 comments on commit 01197e6

Please sign in to comment.