diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 00000000000..9e2728c0935 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,7 @@ +# .readthedocs.yml + +build: + image: latest + +python: + version: 3.7 diff --git a/CMakeLists.txt b/CMakeLists.txt index eee3e65bf73..4dfb77a0e19 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.8) +cmake_minimum_required(VERSION 3.12) project(LBANN CXX) @@ -48,8 +48,8 @@ endif () # set(LBANN_VERSION_MAJOR 0) -set(LBANN_VERSION_MINOR 98) -set(LBANN_VERSION_PATCH 1) +set(LBANN_VERSION_MINOR 99) +set(LBANN_VERSION_PATCH 0) set(LBANN_VERSION "${LBANN_VERSION_MAJOR}.${LBANN_VERSION_MINOR}.${LBANN_VERSION_PATCH}") @@ -100,7 +100,7 @@ option(LBANN_WITH_ALUMINUM "Enable Aluminum all-reduce library" OFF) option(LBANN_WITH_CNPY "Include cnpy" ON) -option(LBANN_WITH_CONDUIT "Enable Conduit library" OFF) +option(LBANN_WITH_CONDUIT "Enable Conduit library" ON) option(LBANN_WITH_CUDNN "Include Nvidia cuDNN" ON) @@ -110,12 +110,17 @@ option(LBANN_WITH_HWLOC option(LBANN_WITH_NVPROF "Enable NVTX-based instrumentation for nvprof" OFF) -option(LBANN_WITH_TBINF "Include Tensorboard interface" ON) +option(LBANN_WITH_PYTHON + "Install Python frontend and enable embedded Python" ON) +option(LBANN_WITH_TBINF "Include Tensorboard interface" ON) option(LBANN_WITH_VTUNE "Link the Intel VTune profiling library" OFF) +option(LBANN_WITH_UNIT_TESTING + "Enable the unit testing framework (requires Catch2)" OFF) + # Enable parallel random matrix generation, if possible option(LBANN_DETERMINISTIC "Use deterministic algorithms as much as possible." OFF) @@ -167,12 +172,12 @@ set(LBANN_HAS_CEREAL ${CEREAL_FOUND}) # The imported target is just called "cereal". Super. # Setup the linear algebra library -find_package(Hydrogen 1.1.0 NO_MODULE QUIET +find_package(Hydrogen 1.2.0 NO_MODULE QUIET HINTS ${Hydrogen_DIR} ${HYDROGEN_DIR} $ENV{Hydrogen_DIR} $ENV{HYDROGEN_DIR} PATH_SUFFIXES lib/cmake/hydrogen NO_DEFAULT_PATH) if (NOT Hydrogen_FOUND) - find_package(Hydrogen 1.1.0 NO_MODULE QUIET REQUIRED) + find_package(Hydrogen 1.2.0 NO_MODULE QUIET REQUIRED) endif () message(STATUS "Found Hydrogen: ${Hydrogen_DIR}") set(LBANN_HAS_HYDROGEN ${Hydrogen_FOUND}) @@ -209,13 +214,13 @@ endif () if (LBANN_WITH_ALUMINUM) # Aluminum may have already been found by Hydrogen if (NOT Aluminum_FOUND) - find_package(Aluminum NO_MODULE QUIET + find_package(Aluminum 0.2.0 NO_MODULE QUIET HINTS ${Aluminum_DIR} ${ALUMINUM_DIR} ${AL_DIR} $ENV{Aluminum_DIR} $ENV{ALUMINUM_DIR} $ENV{AL_DIR} PATH_SUFFIXES lib64/cmake/aluminum lib/cmake/aluminum NO_DEFAULT_PATH) if (NOT Aluminum_FOUND) - find_package(Aluminum NO_MODULE QUIET) + find_package(Aluminum 0.2.0 NO_MODULE QUIET) endif () endif () set(LBANN_HAS_ALUMINUM ${Aluminum_FOUND}) @@ -287,6 +292,29 @@ if (LBANN_WITH_TBINF) add_subdirectory(external/TBinf) endif () +# Find Python +# Note: This uses the Python module in cmake/modules, not the module +# that comes included with CMake. See the file for a discussion of the +# differences. 
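For orientation, the configure-time behavior the hunk below adds can be mirrored in plain Python. This is an illustrative sketch, not part of the patch; it only reproduces the Python-3 gate (CMake raises `FATAL_ERROR` on Python 2) and the default `CMAKE_INSTALL_PYTHONDIR` string the CMake code constructs:

```python
import sys

# Mirrors the configure-time guard in the hunk below: LBANN's CMake
# stops with FATAL_ERROR when the interpreter it finds is not Python 3.
if sys.version_info[0] != 3:
    raise RuntimeError("Python 2 is not supported.")

# Mirrors the CMAKE_INSTALL_PYTHONDIR default: a site-packages-style
# relative path built from the interpreter version.
pythondir = "lib/python{}.{}/site-packages".format(*sys.version_info[:2])
print(pythondir)  # e.g. lib/python3.7/site-packages
```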
+if (LBANN_WITH_PYTHON) + find_package(Python REQUIRED) + set(LBANN_HAS_PYTHON "${Python_FOUND}") + if (NOT Python_VERSION_MAJOR EQUAL 3) + set(LBANN_HAS_PYTHON FALSE) + message(FATAL_ERROR "Python 2 is not supported.") + endif () + + # Setup the installation stuff + set(PYTHON_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" + CACHE PATH "The prefix for the python installation") + + set(CMAKE_INSTALL_PYTHONDIR + "lib/python${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}/site-packages" + CACHE PATH + "Relative path from PYTHON_INSTALL_PREFIX to the python package install") + +endif (LBANN_WITH_PYTHON) + if (LBANN_WITH_VTUNE) find_package(VTune MODULE) @@ -305,7 +333,7 @@ if (LBANN_WITH_VTUNE) endif (VTune_FOUND) endif (LBANN_WITH_VTUNE) -if (LBANN_WITH_NVPROF) +if (LBANN_WITH_CUDA AND LBANN_WITH_NVPROF) set(LBANN_NVPROF TRUE) endif () @@ -336,15 +364,15 @@ if (LBANN_WITH_CONDUIT) message(STATUS "Found HDF5: ${HDF5_DIR}") endif () - find_package(CONDUIT CONFIG QUIET - HINTS ${CONDUIT_DIR} $ENV{CONDUIT_DIR} + find_package(Conduit CONFIG QUIET + HINTS ${Conduit_DIR} $ENV{Conduit_DIR} ${CONDUIT_DIR} $ENV{CONDUIT_DIR} PATH_SUFFIXES lib64/cmake lib/cmake NO_DEFAULT_PATH) - if (NOT CONDUIT_FOUND) - find_package(CONDUIT CONFIG QUIET REQUIRED + if (NOT Conduit_FOUND) + find_package(Conduit CONFIG QUIET REQUIRED PATH_SUFFIXES lib64/cmake lib/cmake) endif () - message(STATUS "Found CONDUIT: ${CONDUIT_DIR}") + message(STATUS "Found CONDUIT: ${Conduit_DIR}") # Ugh. I don't like that this requires intimate knowledge of # specific targets that CONDUIT exports. It should support @@ -402,9 +430,28 @@ if (LBANN_WITH_CONDUIT) "${_conduit_interface_link_libs}") set(CONDUIT_LIBRARIES conduit::conduit) - set(LBANN_HAS_CONDUIT ${CONDUIT_FOUND}) + set(LBANN_HAS_CONDUIT ${Conduit_FOUND}) endif (LBANN_WITH_CONDUIT) +if (LBANN_WITH_UNIT_TESTING) + find_package(Catch2 2.0.0 CONFIG QUIET + HINTS ${CATCH2_DIR} $ENV{CATCH2_DIR} ${CATCH_DIR} $ENV{CATCH_DIR} + PATH_SUFFIXES lib64/cmake/Catch2 lib/cmake/Catch2 + NO_DEFAULT_PATH) + if (NOT Catch2_FOUND) + find_package(Catch2 2.0.0 CONFIG QUIET REQUIRED) + endif () + message(STATUS "Found Catch2: ${Catch2_DIR}") + + # Now that Catch2 has been found, start adding the unit tests + include(CTest) + include(Catch) + add_subdirectory(src/utils/unit_test) + + # Add this one last + add_subdirectory(unit_test) +endif (LBANN_WITH_UNIT_TESTING) + # Handle the documentation add_subdirectory(docs) @@ -430,6 +477,10 @@ target_include_directories(lbann PUBLIC $ $) +if (LBANN_HAS_PYTHON) + target_include_directories(lbann PUBLIC ${Python_INCLUDE_DIRS}) +endif () + # Use the IMPORTED targets when possible. 
target_link_libraries(lbann PUBLIC LbannProto) target_link_libraries(lbann PUBLIC cereal) @@ -460,6 +511,10 @@ if (LBANN_HAS_VTUNE) target_link_libraries(lbann PUBLIC ${VTUNE_STATIC_LIB}) endif () +if (LBANN_HAS_PYTHON) + target_link_libraries(lbann PUBLIC ${Python_LIBRARIES}) +endif () + if (TARGET LBANN_CXX_FLAGS_werror) target_link_libraries(lbann PUBLIC LBANN_CXX_FLAGS_werror) endif () @@ -516,8 +571,8 @@ export(EXPORT LBANNTargets NAMESPACE LBANN:: FILE LBANNTargets.cmake) # Write the configure file for the install tree set(INCLUDE_INSTALL_DIRS include) -set(LIB_INSTALL_DIR lib) -set(CMAKE_INSTALL_DIR lib/cmake/lbann) +set(LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}) +set(CMAKE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/lbann) set(EXTRA_CMAKE_MODULE_DIR) configure_package_config_file(cmake/configure_files/LBANNConfig.cmake.in "${CMAKE_BINARY_DIR}/LBANNConfig.cmake.install" @@ -559,6 +614,64 @@ install( FILES "${PROJECT_BINARY_DIR}/lbann_config.hpp" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") +# Install Python frontend +# Note (tym): Python best practices are to put setup.py at the package +# root and setuptools only accepts relative paths. However, we need to +# insert a config file containing install-specific file paths and make +# sure setup.py can pick it up. I see three approaches for the build +# process: +# 1) Inject the config file into a known location in the source +# directory so that setup.py can pick it up. +# 2) Copy the Python source tree into the build directory and insert +# setup.py and the config file. +# 3) Create setup.py and the config file in the build directory and +# pass the source directory as a relative path. +# We go for option 3 since it's simple and lightweight, but it runs +# counter to the intent of setuptools. If we learn about any nicer +# approaches, we should use them. +if (LBANN_HAS_PYTHON) + + # Construct config file + # NOTE (trb): python_config.ini is installed by setup.py + set(_PYTHON_CONFIG_INI ${CMAKE_BINARY_DIR}/python_config.ini) + set(_LBANN_PB2_PY ${PYTHON_INSTALL_PREFIX}/${CMAKE_INSTALL_PYTHONDIR}/lbann_pb2.py) + set(_LBANN_EXE ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/lbann) + configure_file( + "${CMAKE_SOURCE_DIR}/cmake/configure_files/python_config.ini.in" + "${_PYTHON_CONFIG_INI}" + @ONLY) + + # Construct setup.py + set(_SETUP_PY ${CMAKE_BINARY_DIR}/setup.py) + set(_LBANN_PYTHON_DIR "${CMAKE_SOURCE_DIR}/python") + configure_file( + "${CMAKE_SOURCE_DIR}/cmake/configure_files/setup.py.in" + "${_SETUP_PY}" + @ONLY) + + # Install Python package with setuptools + set(_PY_INSTALL_DIR "${PYTHON_INSTALL_PREFIX}/${CMAKE_INSTALL_PYTHONDIR}") + set(_SETUP_PY_ARGS + "${_SETUP_PY_ARGS} --root ${_PY_INSTALL_DIR} --install-lib . --install-data .") + install(CODE + "execute_process(COMMAND ${Python_EXECUTABLE} ${_SETUP_PY} install ${_SETUP_PY_ARGS})") + + set(_PY_INSTALL_MSG + " +\n********************************************************************** + +A Python package has been installed to ${_PY_INSTALL_DIR}. 
To use +this package, be sure to add this directory to your PYTHONPATH, e.g.: + + export PYTHONPATH=${_PY_INSTALL_DIR}:\\$\{PYTHONPATH\} + +**********************************************************************\n +") + install(CODE + "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"${_PY_INSTALL_MSG}\")") + +endif (LBANN_HAS_PYTHON) + # Install contributor list, license, readme install( FILES "${PROJECT_SOURCE_DIR}/CONTRIBUTORS" @@ -583,8 +696,10 @@ macro(append_str_tf STRING_VAR) math(EXPR _num_spaces "${_max_length} - ${_var_length}") lbann_get_space_string(_spaces ${_num_spaces}) if (${var}) + set(${var} "TRUE") string(APPEND ${STRING_VAR} " ${var}:" "${_spaces}" "TRUE\n") else () + set(${var} "FALSE") string(APPEND ${STRING_VAR} " ${var}:" "${_spaces}" "FALSE\n") endif () endforeach() @@ -632,10 +747,33 @@ append_str_tf(_str LBANN_HAS_DOXYGEN LBANN_HAS_LBANN_PROTO LBANN_HAS_ALUMINUM - LBANN_HAS_CONDUIT) + LBANN_HAS_CONDUIT + LBANN_HAS_PYTHON) string(APPEND _str "\n== End LBANN Configuration Summary ==\n") # Output to stdout execute_process(COMMAND ${CMAKE_COMMAND} -E echo "${_str}") set(_str) + +# +# Write a basic modulefile +# +set(LBANN_MODULEFILE_NAME "lbann-${LBANN_VERSION}.lua" + CACHE STRING + "The name of the LBANN modulefile to install. Must end in .lua.") + +if (NOT (LBANN_MODULEFILE_NAME MATCHES ".+\.lua")) + message(WARNING + "LBANN_MODULEFILE_NAME must have extension \".lua\". Appending.") + set(LBANN_MODULEFILE_NAME "${LBANN_MODULEFILE_NAME}.lua" + CACHE STRING "" FORCE) +endif () + +configure_file( + "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_module.lua.in" + "${CMAKE_BINARY_DIR}/lbann_module.lua.install" + @ONLY) +install(FILES "${CMAKE_BINARY_DIR}/lbann_module.lua.install" + RENAME "${LBANN_MODULEFILE_NAME}" + DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/modulefiles") diff --git a/LICENSE b/LICENSE index ebd51e42629..68681ede2ee 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,5 @@ -Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -Produced at the Lawrence Livermore National Laboratory. +Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +Produced at the Lawrence Livermore National Laboratory. Written by the LBANN Research Team (B. Van Essen, et al.) listed in the CONTRIBUTORS file. @@ -8,7 +8,7 @@ All rights reserved. This file is part of LBANN: Livermore Big Artificial Neural Network Toolkit. For details, see http://software.llnl.gov/LBANN or -https://github.com/LLNL/LBANN. +https://github.com/LLNL/LBANN. Licensed under the Apache License, Version 2.0 (the "Licensee"); you may not use this file except in compliance with the License. You may @@ -21,4 +21,3 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the license. - diff --git a/README.md b/README.md index 10d3c8e1b7b..8afb7203cfe 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,17 @@ methods. ## Building LBANN -A few options for building LBANN are documented -[here](docs/BuildingLBANN.md#top). +The preferred method for users to install LBANN is with +[Spack](https://github.com/llnl/spack). After some system +configuration, this should be as straightforward as +```bash +spack install lbann +``` + +More detailed instructions for building and installing LBANN are +available at the [main LBANN +documentation](https://lbann.readthedocs.io/en/latest/index.html).
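Once the installed package directory from the CMake message above is on `PYTHONPATH`, the Python front-end summarized in the v0.99 release notes below can be imported directly. A minimal, hedged sketch follows; the layer-graph style matches the release notes, but the exact class names and keyword arguments are assumptions, not code taken from this diff:

```python
# Hedged sketch of the lbann Python namespace described in the v0.99
# release notes: build a small layer graph for a LeNet-style classifier.
# Class names and keyword arguments are assumptions.
import lbann

images = lbann.Input()                                  # data layer
fc1 = lbann.FullyConnected(images, num_neurons=500, has_bias=True)
act1 = lbann.Relu(fc1)
fc2 = lbann.FullyConnected(act1, num_neurons=10, has_bias=True)
probs = lbann.Softmax(fc2)
# Objective functions, metrics, callbacks, and optimizers are composed
# similarly and handed to a model before submitting a SLURM/LSF job.
```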
## Running LBANN The basic template for running LBANN is @@ -42,8 +50,12 @@ optimized for the case in which one assigns one GPU per MPI the MPI launcher. More details about running LBANN are documented -[here](docs/RunningLBANN.md#top). +[here](https://lbann.readthedocs.io/en/latest/running_lbann.html). + +## Publications +A list of publications, presentations, and posters is available +[here](https://lbann.readthedocs.io/en/latest/publications.html). ## Reporting issues Issues, questions, and bugs can be raised on the [Github issue diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt index fea20150a3f..13418207629 100644 --- a/ReleaseNotes.txt +++ b/ReleaseNotes.txt @@ -1,21 +1,90 @@ -============================== (Pending) Release Notes: v0.99 ============================== +============================== (Pending) Release Notes: v1.00 ============================== Support for new training algorithms: Support for new network structures: Support for new layers: +Python front-end: + +Performance optimizations: + +Model portability & usability: + +Internal features: + +I/O & data readers: + +Build system: + +Bug fixes: + +Retired features: + +============================== Release Notes: v0.99 ============================== Support for new training algorithms: + - Improvements to LTFB infrastructure (including transfer of SGD and Adam hyperparameters) Support for new network structures: + - Support for Wide ResNets Support for new layers: Python front-end: + - Python front-end for generating neural network architectures (lbann namespace), + including layers, objective functions, callbacks, metrics, and optimizers. + - Python interface for launching (SLURM or LSF) jobs on HPC systems + - Support for running LBANN experiments and capturing experimental output + - Network templates for AlexNet, LeNet, arbitrary ResNet models, and Wide ResNet models + - Python scripts for LeNet, AlexNet, and (Wide) ResNets in model zoo. Performance optimizations: + - GPU implementation of RMSprop optimizer. + - cuDNN convolution algorithms are determined by empirically measuring + performance rather than using heuristics. + - Avoid setting up unused bias weights. + - Perform gradient accumulations in-place when possible. Model portability & usability: Internal features: + - Weight gradient allreduces are in-place rather than on a staging buffer. + - Fully connected and convolution layers only create bias weights when + needed. + - Optimizer exposes gradient buffers so they can be updated in-place. + - Added callback support to explicitly save models + - Min-max metric for reporting on multiple LTFB trainers + - Cleanup of Hydrogen interface to match Hydrogen v1.2.0 + - Added type-erased matrix class for internal refactoring + - Make CUB always log performance-critical events I/O & data readers: + - Python data reader that interacts with an embedded Python session. + - Optimized data store to provide preload option + - Extended data store to operate with Cosmoflow-numpy data reader Build system: + - Added documentation for how users can use Spack to install LBANN + either directly or via environments. + - Conduit is a required dependency. + - Provided Spack environment for installing LBANN as a user + - Improved documentation on lbann.readthedocs.io + - CMake installs a module file in the installation directory that + sets up PATH and PYTHONPATH variables appropriately + +Bug fixes: + - Models can now be copied or set up multiple times. + - Fixed incorrect weight initialization with multiple trainers.
+ - Updated I/O random number generators to be C++ thread safe (rather than OpenMP) + - Added an I/O random number generator for preprocessing that is independent + of the data sequence RNG. + - Fixed initialization order of RNGs and multiple models / trainers. + - General fixes for I/O and LTFB interaction. Retired features: + - "Zero" layer (hack for early GAN implementation). + - Removed data reader specific implementations of data store (in favor of Conduit-based + data store) ============================== Release Notes: v0.98.1 ============================== Bug Fixes: diff --git a/bamboo/README.md b/bamboo/README.md index 4ad8b6508be..c317c496379 100644 --- a/bamboo/README.md +++ b/bamboo/README.md @@ -47,13 +47,13 @@ As an alternative to splitting the file, errors can be searched for with `grep - Bamboo agent properties are used to specify requirements for each job. -| Agents (jobs) | `agent_owner` | `architecture` | `cluster` | `gpu_architecture` | `sys_type` | -| --- | --- | --- | --- | --- | --- | -| Catalyst Agents (x86_cpu) | `lbannusr` | `x86_64` | `catalyst` | `none` | `toss_3_x86_64_ib` | -| Pascal Agents | `lbannusr` | `x86_64` | `pascal` | `pascal` | `chaos_6_x86_64_ib` | -| Quartz Agents (x86_cpu) | `lbannusr` | `x86_64` | `quartz` | `none` | `toss_3_x86_64_ib` | -| Ray Agents (ppc64le_gpu) | `lbannusr` | `ppc64_le` | `ray` | `pascal` | `blueos_3_ppc64le_ib` | -| Surface Agents (x86_gpu) | `lbannusr` | `x86_64` | `surface` | `kepler` | `chaos_5_x86_64_ib` | +| Agents (jobs) | `agent_owner` | `architecture` | `cluster` | `gpu_architecture` | `sys_type` | +| --- | --- | --- | --- | --- | --- | +| Catalyst Agents (x86_cpu) | `lbannusr` | `x86_64` | `catalyst` | `none` | `toss_3_x86_64_ib` | +| Pascal Agents (x86_gpu_pascal) | `lbannusr` | `x86_64` | `pascal` | `pascal` | `chaos_6_x86_64_ib` | +| Quartz Agents (x86_cpu) | `lbannusr` | `x86_64` | `quartz` | `none` | `toss_3_x86_64_ib` | +| Ray Agents (ppc64le_gpu) | `lbannusr` | `ppc64_le` | `ray` | `pascal` | `blueos_3_ppc64le_ib` | +| Surface Agents (x86_gpu) | `lbannusr` | `x86_64` | `surface` | `kepler` | `chaos_5_x86_64_ib` | Currently, `agent_owner`, `architecture`, and `gpu_architecture` are used to determine agents to run a job. 
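For orientation before the test diffs that follow: `tools.get_command` assembles a complete scheduler-plus-LBANN command line from keyword arguments. A minimal invocation, using the same placeholder values as `test_command_catalyst` below (run from the repository root; `pytest` must be importable since `tools.py` imports it):

```python
import sys
sys.path.insert(0, 'bamboo/common_python')  # make tools.py importable
import tools

# Compose (but do not execute) a SLURM launch command for catalyst.
command = tools.get_command(
    cluster='catalyst', executable='exe', num_nodes=20,
    partition='pdebug', time_limit=30, num_processes=40, dir_name='dir',
    data_filedir_default='lscratchh/filedir', data_reader_name='mnist',
    data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15,
    model_folder='models/folder', model_name='lenet', num_epochs=7,
    optimizer_name='adagrad', processes_per_model=10,
    check_executable_existence=False)
print(command)  # 'salloc --nodes=20 ... srun --ntasks=40 exe ...'
```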
diff --git a/bamboo/common_python/test_tools.py b/bamboo/common_python/test_tools.py index 0fdbf044160..6cafbb39bd6 100644 --- a/bamboo/common_python/test_tools.py +++ b/bamboo/common_python/test_tools.py @@ -5,27 +5,27 @@ # Run locally with python -m pytest -s def test_command_catalyst(): - actual = tools.get_command(cluster='catalyst', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existance=False) + actual = tools.get_command(cluster='catalyst', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' assert actual == expected def test_command_pascal(): - actual = tools.get_command(cluster='pascal', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existance=False) + actual = tools.get_command(cluster='pascal', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' assert actual == expected def test_command_quartz(): - actual = tools.get_command(cluster='quartz', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', 
processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existance=False) + actual = tools.get_command(cluster='quartz', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --data_filedir=lscratchh/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' assert actual == expected def test_command_surface(): - actual = tools.get_command(cluster='surface', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existance=False) + actual = tools.get_command(cluster='surface', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' assert actual == expected def test_command_ray(): - actual = tools.get_command(cluster='ray', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existance=False) + actual = tools.get_command(cluster='ray', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) expected = 'bsub -x -G guests 
-Is -n 40 -q pdebug -R "span[ptile=2]" -W 30 mpirun -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' assert actual == expected @@ -33,112 +33,112 @@ def test_command_ray(): def test_blacklisted_substrings(): try: - tools.get_command('ray', 'exe', partition=';', optimizer_path='--model=new_model', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', partition=';', optimizer_path='--model=new_model', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid character(s): ; contains ; , --model=new_model contains --' assert actual == expected def test_unsupported_cluster(): try: - tools.get_command('quartz', 'exe', check_executable_existance=False) - except Exception, e: + tools.get_command('quartz', 'exe', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Unsupported Cluster: quartz' assert actual == expected def test_bad_model_1(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', model_name='name', model_path='path', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', model_name='name', model_path='path', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name' assert actual == expected def test_bad_model_2(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', model_path='path', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', model_path='path', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name' assert actual == expected def test_bad_model_3(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_name='name', model_path='path', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', dir_name='dir', model_name='name', model_path='path', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name' assert actual == expected def test_bad_model_4(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: model_folder set but not model_name.' assert actual == expected def test_bad_model_5(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_name='name', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', dir_name='dir', model_name='name', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: model_name set but not model_folder.' 
assert actual == expected def test_bad_data_reader(): try: - tools.get_command('catalyst', 'exe', dir_name='dir', data_reader_name='name', data_reader_path='path', check_executable_existance=False) - except Exception, e: + tools.get_command('catalyst', 'exe', dir_name='dir', data_reader_name='name', data_reader_path='path', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: data_reader_path is set but so is data_reader_name , data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected def test_bad_optimizer(): try: - tools.get_command('ray', 'exe', dir_name='dir', optimizer_name='name', optimizer_path='path', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', dir_name='dir', optimizer_name='name', optimizer_path='path', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: optimizer_path is set but so is optimizer_name' assert actual == expected def test_bad_dir_name_1(): try: - tools.get_command('ray', 'exe', dir_name='dir', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', dir_name='dir', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: dir_name set but none of model_folder, model_name, data_reader_name, optimizer_name are.' assert actual == expected def test_bad_dir_name_2(): try: - tools.get_command('ray', 'exe', model_folder='folder', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', model_folder='folder', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' assert actual == expected def test_bad_dir_name_3(): try: - tools.get_command('ray', 'exe', model_name='name', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', model_name='name', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' assert actual == expected def test_bad_dir_name_4(): try: - tools.get_command('catalyst', 'exe', data_reader_name='name', check_executable_existance=False) - except Exception, e: + tools.get_command('catalyst', 'exe', data_reader_name='name', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is. , data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' 
assert actual == expected def test_bad_dir_name_5(): try: - tools.get_command('ray', 'exe', optimizer_name='name', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', optimizer_name='name', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' assert actual == expected @@ -146,8 +146,8 @@ def test_bad_dir_name_5(): def test_bad_data_filedir_1(): try: tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filedir_train_default='a', - check_executable_existance=False) - except Exception, e: + check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected @@ -155,8 +155,8 @@ def test_bad_data_filedir_1(): def test_bad_data_filedir_2(): try: tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filename_train_default='b', - check_executable_existance=False) - except Exception, e: + check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected @@ -165,8 +165,8 @@ def test_bad_data_filedir_2(): def test_bad_data_filedir_3(): try: tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filedir_test_default='c', - check_executable_existance=False) - except Exception, e: + check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected @@ -174,24 +174,24 @@ def test_bad_data_filedir_3(): def test_bad_data_filedir_4(): try: tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filename_test_default='d', - check_executable_existance=False) - except Exception, e: + check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected def test_bad_data_filedir_5(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filedir_train_default='e', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filedir_train_default='e', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected def test_bad_data_filedir_6(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', 
data_filename_train_default='f', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filename_train_default='f', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected @@ -199,48 +199,48 @@ def test_bad_data_filedir_6(): def test_bad_data_filedir_7(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filedir_test_default='g', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filedir_test_default='g', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected def test_bad_data_filedir_8(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filename_test_default='h', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filename_test_default='h', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected def test_bad_data_filedir_9(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected def test_bad_data_filedir_10(): try: - tools.get_command('ray', 'exe', data_reader_path='path', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', data_reader_path='path', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' 
assert actual == expected def test_bad_data_filedir_11(): try: - tools.get_command('ray', 'exe', data_filedir_default='filedir', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', data_filedir_default='filedir', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: data_filedir_default set but neither data_reader_name or data_reader_path are.' assert actual == expected def test_bad_data_filedir_12(): try: - tools.get_command('ray', 'exe', data_filedir_train_default='a', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', data_filedir_train_default='a', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' assert actual == expected @@ -248,8 +248,8 @@ def test_bad_data_filedir_12(): def test_bad_data_filedir_13(): try: - tools.get_command('ray', 'exe', data_filename_train_default='b', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', data_filename_train_default='b', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' assert actual == expected @@ -257,8 +257,8 @@ def test_bad_data_filedir_13(): def test_bad_data_filedir_14(): try: - tools.get_command('ray', 'exe', data_filedir_test_default='c', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', data_filedir_test_default='c', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' assert actual == expected @@ -266,8 +266,8 @@ def test_bad_data_filedir_14(): def test_bad_data_filedir_15(): try: - tools.get_command('ray', 'exe', data_filename_test_default='e', check_executable_existance=False) - except Exception, e: + tools.get_command('ray', 'exe', data_filename_test_default='e', check_executable_existence=False) + except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' 
assert actual == expected diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 4a9508c8b3a..7110ddc9a67 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -1,14 +1,16 @@ import pytest import math, os, re + def check_list(substrings, strings): errors = [] for string in strings: for substring in substrings: if (string != None) and (substring in string): - errors.append('%s contains %s' % (string, substring)) + errors.append('%s contains %s' % (string, substring)) return errors + def get_command(cluster, executable, num_nodes=None, @@ -37,29 +39,30 @@ def get_command(cluster, output_file_name=None, error_file_name=None, return_tuple=False, - check_executable_existance=True, + check_executable_existence=True, skip_no_exe=True): # Check parameters for black-listed characters like semi-colons that # would terminate the command and allow for an extra command blacklist = [';', '--'] - strings = [partition, dir_name, data_filedir_default, data_filedir_train_default, + strings = [partition, dir_name, data_filedir_default, + data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default, data_reader_name, data_reader_path, model_folder, model_name, model_path, optimizer_name, optimizer_path, output_file_name, error_file_name] invalid_character_errors = check_list(blacklist, strings) if invalid_character_errors != []: - raise Exception('Invalid character(s): %s' % ' , '.join(invalid_character_errors)) + raise Exception('Invalid character(s): %s' % ' , '.join( + invalid_character_errors)) - # Check executable existance - if check_executable_existance: - executable_exists = os.path.exists(executable) - if not executable_exists: - error_string = 'Executable does not exist: %s' % executable - if skip_no_exe: - pytest.skip(error_string) - else: - raise Exception(error_string) + # Never give lbannusr an allocation of more than 6 hours. + strict_time_limit = 60*6 # 6 hours, in minutes. + if (time_limit is not None) and (time_limit > strict_time_limit): + time_limit = strict_time_limit + + # Check executable existence + if check_executable_existence: + process_executable_existence(executable, skip_no_exe) # Determine scheduler if cluster in ['catalyst', 'pascal', 'quartz', 'surface']: @@ -81,20 +84,21 @@ def get_command(cluster, option_num_nodes = '' option_partition = '' option_time_limit = '' - if num_nodes != None: + if num_nodes is not None: # --nodes= => # Request that a minimum of minnodes nodes be allocated to this # job. A maximum node count may also be specified with # maxnodes. option_num_nodes = ' --nodes=%d' % num_nodes - if partition != None: + if partition is not None: # Surface does not have pdebug, so switch to pbatch - if (cluster == 'surface') and (partition == 'pdebug'): + if (cluster in ['surface', 'pascal']) and \ + (partition == 'pdebug'): partition = 'pbatch' # --partition => Request a specific partition for the resource # allocation. option_partition = ' --partition=%s' % partition - if time_limit != None: + if time_limit is not None: # --time => Set a limit on the total run time of the job # allocation. # Time limit in minutes @@ -109,7 +113,7 @@ def get_command(cluster, else: command_run = ' srun --mpibind=off' option_num_processes = '' - if num_processes != None: + if num_processes is not None: # --ntasks => Specify the number of tasks to run.
# Number of processes to run => MPI Rank option_num_processes = ' --ntasks=%d' % num_processes @@ -120,7 +124,7 @@ def get_command(cluster, command_allocate = '' # Allocate a node if we don't have one already # Running the tests manually allows for already having a node allocated - if os.getenv('LSB_HOSTS') == None: + if os.getenv('LSB_HOSTS') is None: command_allocate = 'bsub' # x => Puts the host running your job into exclusive execution # mode. @@ -135,19 +139,19 @@ def get_command(cluster, option_partition = '' option_processes_per_node = '' option_time_limit = '' - if num_processes != None: + if num_processes is not None: # n => Submits a parallel job and specifies the number of # tasks in the job. option_num_processes = ' -n %d' % num_processes - if (num_nodes != None) and (num_nodes != 0): + if (num_nodes is not None) and (num_nodes != 0): # R => Runs the job on a host that meets the specified # resource requirements. option_processes_per_node = ' -R "span[ptile=%d]"' % int( math.ceil(float(num_processes)/num_nodes)) - if partition != None: + if partition is not None: # q => Submits the job to one of the specified queues. option_partition = ' -q %s' % partition - if time_limit != None: + if time_limit is not None: if cluster == 'ray': max_ray_time = 480 if time_limit > max_ray_time: @@ -166,10 +170,10 @@ def get_command(cluster, command_run = ' mpirun' option_num_processes = '' option_processes_per_node = '' - if num_processes != None: + if num_processes is not None: # -np => Run this many copies of the program on the given nodes. option_num_processes = ' -np %d' % num_processes - if (num_nodes != None) and (num_nodes != 0): + if (num_nodes is not None) and (num_nodes != 0): option_processes_per_node = ' -N %d' % int( math.ceil(float(num_processes)/num_nodes)) command_run = '%s%s%s' % ( @@ -194,57 +198,68 @@ def get_command(cluster, option_optimizer = '' option_processes_per_model = '' lbann_errors = [] - if model_path != None: + if model_path is not None: # If model_folder and/or model_name are set, an exception will be # raised later. option_model = ' --model=%s' % model_path - if data_reader_path != None: + if data_reader_path is not None: # If data_reader_name is set, an exception will be raised later. option_data_reader = ' --reader=%s' % data_reader_path - if optimizer_path != None: + if optimizer_path is not None: # If optimizer_name is set, an exception will be raised later. 
option_optimizer_name = ' --optimizer=%s' % optimizer_path - if dir_name != None: - if model_path != None: - if (model_folder != None) or (model_name != None): + if dir_name is not None: + if model_path is not None: + if (model_folder is not None) or (model_name is not None): lbann_errors.append( - 'model_path is set but so is at least one of model folder and model_name') + ('model_path is set but so is at least one of model' + ' folder and model_name')) else: - if (model_folder != None) and (model_name != None): - option_model = ' --model=%s/model_zoo/%s/model_%s.prototext' % (dir_name, model_folder, model_name) - elif model_folder != None: + if (model_folder is not None) and (model_name is not None): + option_model = ' --model=%s/model_zoo/%s/model_%s.prototext' % ( + dir_name, model_folder, model_name) + elif model_folder is not None: lbann_errors.append('model_folder set but not model_name.') - elif model_name != None: + elif model_name is not None: lbann_errors.append('model_name set but not model_folder.') - if data_reader_name != None: - if data_reader_path != None: - lbann_errors.append('data_reader_path is set but so is data_reader_name') + if data_reader_name is not None: + if data_reader_path is not None: + lbann_errors.append(('data_reader_path is set but so is' + ' data_reader_name')) else: option_data_reader = ' --reader=%s/model_zoo/data_readers/data_reader_%s.prototext' % (dir_name, data_reader_name) - if optimizer_name != None: - if optimizer_path != None: - lbann_errors.append('optimizer_path is set but so is optimizer_name') + if optimizer_name is not None: + if optimizer_path is not None: + lbann_errors.append(('optimizer_path is set but so is' + ' optimizer_name')) else: option_optimizer = ' --optimizer=%s/model_zoo/optimizers/opt_%s.prototext' % (dir_name, optimizer_name) - if (model_folder == None) and (model_name == None) and (data_reader_name == None) and (optimizer_name == None): - lbann_errors.append('dir_name set but none of model_folder, model_name, data_reader_name, optimizer_name are.') - elif (model_folder != None) or (model_name != None) or (data_reader_name != None) or (optimizer_name != None): + if (model_folder is None) and (model_name is None) and \ + (data_reader_name is None) and (optimizer_name is None): + lbann_errors.append( + ('dir_name set but none of model_folder, model_name,' + ' data_reader_name, optimizer_name are.')) + elif (model_folder is not None) or (model_name is not None) or \ + (data_reader_name is not None) or (optimizer_name is not None): lbann_errors.append( - 'dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.') + ('dir_name is not set but at least one of model_folder,' + ' model_name, data_reader_name, optimizer_name is.')) data_file_parameters = [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] # Determine data file paths # If there is no regex match, then re.sub keeps the original string - if data_filedir_default != None: + if data_filedir_default is not None: if cluster in ['catalyst', 'pascal', 'surface']: # option_data_filedir = data_filedir_default # lscratchh, presumably - pass # No need to pass in a parameter + pass # No need to pass in a parameter elif cluster == 'quartz': - option_data_filedir = ' --data_filedir=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filedir_default) + option_data_filedir = ' --data_filedir=%s' % re.sub( + '[a-z]scratch[a-z]', 'lscratchh', data_filedir_default) elif cluster == 
'ray': - option_data_filedir = ' --data_filedir=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filedir_default) + option_data_filedir = ' --data_filedir=%s' % re.sub( + '[a-z]scratch[a-z]', 'gscratchr', data_filedir_default) elif None not in data_file_parameters: if cluster in ['catalyst', 'pascal', 'surface']: # option_data_filedir_train = data_filedir_train_default @@ -262,35 +277,56 @@ def get_command(cluster, option_data_filename_train = ' --data_filename_train=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filename_train_default) option_data_filedir_test = ' --data_filedir_test=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filedir_test_default) option_data_filename_test = ' --data_filename_test=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filename_test_default) - if (data_reader_name != None) or (data_reader_path != None): - if data_filedir_default != None: - if data_file_parameters != [None, None, None, None]: # If any are not None - lbann_errors.append('data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]') + if (data_reader_name is not None) or (data_reader_path is not None): + if data_filedir_default is not None: + # If any are not None + if data_file_parameters != [None, None, None, None]: + lbann_errors.append( + ('data_fildir_default set but so is at least one of' + ' [data_filedir_train_default, data_filename_train' + '_default, data_filedir_test_default,' + ' data_filename_test_default]')) # else: only data_filedir_default is set else: # if None in data_file_parameters: # If any are None if data_file_parameters == [None, None, None, None]: # If all are None - lbann_errors.append('data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.') + lbann_errors.append( + ('data_reader_name or data_reader_path is set but not' + ' data_filedir_default. If a data reader is provided,' + ' the default filedir must be set. This allows for' + ' determining what the filedir should be on each' + ' cluster. 
Alternatively, some or all of' + ' [data_filedir_train_default, data_filename_train' + '_default, data_filedir_test_default, data_filename' + '_test_default] can be set.')) # else: no data_file parameters are set else: - if data_filedir_default != None: - lbann_errors.append('data_filedir_default set but neither data_reader_name or data_reader_path are.') - elif filter(lambda x: x != None, data_file_parameters) != []: # If the list of non-None data_file parameters is not empty - lbann_errors.append('At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.') + if data_filedir_default is not None: + lbann_errors.append( + ('data_filedir_default set but neither data_reader_name' + ' or data_reader_path are.')) + elif filter(lambda x: x is not None, data_file_parameters) != []: + # If the list of non-None data_file parameters is not empty + lbann_errors.append( + ('At least one of [data_filedir_train_default, data_filename' + '_train_default, data_filedir_test_default, data_filename' + '_test_default] is set, but neither data_reader_name or' + ' data_reader_path are.')) # else: no conflicts - if data_reader_percent != None: + if data_reader_percent is not None: option_data_reader_percent = ' --data_reader_percent=%f' % data_reader_percent if exit_after_setup: option_exit_after_setup = ' --exit_after_setup' - if mini_batch_size != None: + if mini_batch_size is not None: option_mini_batch_size = ' --mini_batch_size=%d' % mini_batch_size - if num_epochs != None: + if num_epochs is not None: option_num_epochs = ' --num_epochs=%d' % num_epochs - if processes_per_model != None: + if processes_per_model is not None: option_processes_per_model = ' --procs_per_model=%d' % processes_per_model - if ckpt_dir != None: + if ckpt_dir is not None: option_ckpt_dir = ' --ckpt_dir=%s' % ckpt_dir if lbann_errors != []: + print('lbann_errors={lbann_errors}.'.format(lbann_errors=lbann_errors)) raise Exception('Invalid Usage: ' + ' , '.join(lbann_errors)) command_lbann = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % ( executable, option_ckpt_dir, option_data_filedir, @@ -304,15 +340,84 @@ def get_command(cluster, # Create redirect command command_output = '' command_error = '' - if output_file_name != None: + if output_file_name is not None: command_output = ' > %s' % output_file_name - if error_file_name != None: + if error_file_name is not None: command_error = ' 2> %s' % error_file_name command_redirect = '%s%s' % (command_output, command_error) t = (command_allocate, command_run, command_lbann, command_redirect) if return_tuple: + print('command_tuple=' + str(t)) return t else: - return '%s%s %s%s' % t + command_string = '%s%s %s%s' % t + print('command_string=' + command_string) + return command_string + + +def process_executable_existence(executable, skip_no_exe=True): + executable_exists = os.path.exists(executable) + if not executable_exists: + error_string = 'Executable does not exist: %s' % executable + if skip_no_exe: + pytest.skip(error_string) + else: + raise Exception(error_string) + + +def get_spack_exes(default_dirname, cluster): + exes = {} + + exes['clang4'] = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) + exes['gcc4'] = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_rel/build/model_zoo/lbann' % (default_dirname, cluster) + exes['gcc7'] = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_rel/build/model_zoo/lbann' % 
(default_dirname, cluster) + exes['intel18'] = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) + + exes['clang4_debug'] = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) + exes['gcc4_debug'] = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_debug/build/model_zoo/lbann' % (default_dirname, cluster) + exes['gcc7_debug'] = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) + exes['intel18_debug'] = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) + + return exes + + +def get_default_exes(default_dirname, cluster): + exes = get_spack_exes(default_dirname, cluster) + # Use build script as a backup if the Spack build doesn't work. + if not os.path.exists(exes['clang4']): + exes['clang4'] = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['gcc7']): + exes['gcc7'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['intel18']): + exes['intel18'] = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + + if not os.path.exists(exes['clang4_debug']): + exes['clang4_debug'] = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['gcc7_debug']): + exes['gcc7_debug'] = '%s/build/gnu.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['intel18_debug']): + exes['intel18_debug'] = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + + default_exes = {} + default_exes['default'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if cluster in ['catalyst', 'quartz', 'pascal']: + # x86_cpu - catalyst, quartz + # x86_gpu_pascal - pascal + default_exes['clang4'] = exes['clang4'] + default_exes['gcc4'] = exes['gcc4'] + default_exes['gcc7'] = exes['gcc7'] + default_exes['intel18'] = exes['intel18'] + + default_exes['clang4_debug'] = exes['clang4_debug'] + default_exes['gcc4_debug'] = exes['gcc4_debug'] + default_exes['gcc7_debug'] = exes['gcc7_debug'] + default_exes['intel18_debug'] = exes['intel18_debug'] + elif cluster in ['surface']: + # x86_gpu - surface + default_exes['gcc4'] = exes['gcc4'] + default_exes['gcc4_debug'] = exes['gcc4_debug'] + + print('default_exes={d}'.format(d=default_exes)) + return default_exes diff --git a/bamboo/compiler_tests/build_script.sh b/bamboo/compiler_tests/build_script.sh index c52f239b5fa..07a19172f26 100755 --- a/bamboo/compiler_tests/build_script.sh +++ b/bamboo/compiler_tests/build_script.sh @@ -1,53 +1,7 @@ -set -e CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') -LBANN_DIR=$(git rev-parse --show-toplevel) -DEBUG='' if [ "${CLUSTER}" != 'surface' ]; then source /usr/share/lmod/lmod/init/bash source /etc/profile.d/00-modulepath.sh fi - -while :; do - case ${1} in - --compiler) - # Choose compiler - if [ -n "${2}" ]; then - COMPILER=${2} - shift - else - echo "\"${1}\" option requires a non-empty option argument" >&2 - exit 1 - fi - ;; - - -d|--debug) - # Debug mode - DEBUG='--debug' - ;; - *) - # Break loop if there are no more options - break - - esac - shift -done - -if [ "${COMPILER}" == 'clang' ]; then - module load clang/4.0.0 - ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler clang ${DEBUG} --reconfigure -fi - -if [ 
"${COMPILER}" == 'intel' ]; then - module load intel/18.0.0 - ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler intel ${DEBUG} --reconfigure -fi - -if [ "${COMPILER}" == 'gcc4' ]; then - module load gcc/4.9.3 - ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler gnu ${DEBUG} --reconfigure -fi - -if [ "${COMPILER}" == 'gcc7' ]; then - module load gcc/7.1.0 - ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler gnu ${DEBUG} --reconfigure -fi +LBANN_DIR=$(git rev-parse --show-toplevel) +${LBANN_DIR}/scripts/build_lbann_lc.sh --with-conduit diff --git a/bamboo/compiler_tests/build_script_specific.sh b/bamboo/compiler_tests/build_script_specific.sh new file mode 100755 index 00000000000..975d58ac4a1 --- /dev/null +++ b/bamboo/compiler_tests/build_script_specific.sh @@ -0,0 +1,53 @@ +set -e +CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') +LBANN_DIR=$(git rev-parse --show-toplevel) +DEBUG='' +if [ "${CLUSTER}" != 'surface' ]; then + source /usr/share/lmod/lmod/init/bash + source /etc/profile.d/00-modulepath.sh +fi + +while :; do + case ${1} in + --compiler) + # Choose compiler + if [ -n "${2}" ]; then + COMPILER=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + exit 1 + fi + ;; + + -d|--debug) + # Debug mode + DEBUG='--debug' + ;; + *) + # Break loop if there are no more options + break + + esac + shift +done + +if [ "${COMPILER}" == 'clang4' ]; then + module load clang/4.0.0 + ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler clang ${DEBUG} --reconfigure --with-conduit +fi + +if [ "${COMPILER}" == 'intel18' ]; then + module load intel/18.0.0 + ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler intel ${DEBUG} --reconfigure --with-conduit +fi + +if [ "${COMPILER}" == 'gcc4' ]; then + module load gcc/4.9.3 + ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler gnu ${DEBUG} --reconfigure --with-conduit +fi + +if [ "${COMPILER}" == 'gcc7' ]; then + module load gcc/7.1.0 + ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler gnu ${DEBUG} --reconfigure --with-conduit +fi diff --git a/bamboo/compiler_tests/conftest.py b/bamboo/compiler_tests/conftest.py index 6e07162c5d3..238b812e638 100644 --- a/bamboo/compiler_tests/conftest.py +++ b/bamboo/compiler_tests/conftest.py @@ -1,18 +1,23 @@ import pytest import re, subprocess + def pytest_addoption(parser): - cluster = re.sub('[0-9]+', '', subprocess.check_output('hostname'.split()).strip()) - default_dirname = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() + cluster = re.sub('[0-9]+', '', subprocess.check_output( + 'hostname'.split()).strip()) + default_dirname = subprocess.check_output( + 'git rev-parse --show-toplevel'.split()).strip() parser.addoption('--cluster', action='store', default=cluster, help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. 
Default the current cluster') parser.addoption('--dirname', action='store', default=default_dirname, help='--dirname specifies the top-level directory') + @pytest.fixture def cluster(request): return request.config.getoption('--cluster') - + + @pytest.fixture def dirname(request): return request.config.getoption('--dirname') diff --git a/bamboo/compiler_tests/test_compiler.py b/bamboo/compiler_tests/test_compiler.py index 383c8701832..5682d11f3af 100644 --- a/bamboo/compiler_tests/test_compiler.py +++ b/bamboo/compiler_tests/test_compiler.py @@ -1,109 +1,167 @@ +# import sys +# sys.path.insert(0, '../common_python') +# import tools import pytest import os, re, subprocess -def build_script(cluster, dirname, compiler, debug): - if debug: - build = 'debug' + +def test_compiler_build_script(cluster, dirname): + if cluster in ['pascal']: + output_file_name = '%s/bamboo/compiler_tests/output/build_script_output.txt' % (dirname) + error_file_name = '%s/bamboo/compiler_tests/error/build_script_error.txt' % (dirname) + command = '%s/bamboo/compiler_tests/build_script.sh > %s 2> %s' % ( + dirname, output_file_name, error_file_name) + return_code = os.system(command) + if return_code != 0: + output_file = open(output_file_name, 'r') + for line in output_file: + print('%s: %s' % (output_file_name, line)) + error_file = open(error_file_name, 'r') + for line in error_file: + print('%s: %s' % (error_file_name, line)) + assert return_code == 0 else: - build = 'release' - output_file_name = '%s/bamboo/compiler_tests/output/%s_%s_%s_output.txt' % (dirname, cluster, compiler, build) - error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_%s_error.txt' % (dirname, cluster, compiler, build) - command = '%s/bamboo/compiler_tests/build_script.sh --compiler %s %s> %s 2> %s' % (dirname, compiler, debug, output_file_name, error_file_name) - return_code = os.system(command) - if return_code != 0: - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) - assert return_code == 0 + e = 'test_compiler_build_script: Unsupported Cluster %s' % cluster + print('Skip - ' + e) + pytest.skip(e) + def test_compiler_clang4_release(cluster, dirname): - #skeleton_clang4(cluster, dirname, False) - if cluster in ['ray', 'catalyst']: - build_script(cluster, dirname, 'clang', '') - else: - pytest.skip('Unsupported Cluster %s' % cluster) + try: + skeleton_clang4(cluster, dirname, False) + except AssertionError as e: + print(e) + build_script(cluster, dirname, 'clang4', False) + path = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) + if not os.path.exists(path): + path = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) + assert os.path.exists(path) + def test_compiler_clang4_debug(cluster, dirname): - #skeleton_clang4(cluster, dirname, True) - if cluster in ['ray', 'catalyst']: - build_script(cluster, dirname, 'clang', '--debug') - else: - pytest.skip('Unsupported Cluster %s' % cluster) + try: + skeleton_clang4(cluster, dirname, True) + except AssertionError as e: + print(e) + build_script(cluster, dirname, 'clang4', True) + path = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) + if not os.path.exists(path): + path = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) + assert os.path.exists(path) + def 
test_compiler_gcc4_release(cluster, dirname): - #skeleton_gcc4(cluster, dirname, False) - build_script(cluster, dirname, 'gcc4', '') + try: + skeleton_gcc4(cluster, dirname, False) + except AssertionError as e: + print(e) + build_script(cluster, dirname, 'gcc4', False) + path = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_rel/build/model_zoo/lbann' % (dirname, cluster) + assert os.path.exists(path) + def test_compiler_gcc4_debug(cluster, dirname): - #skeleton_gcc4(cluster, dirname, True) - build_script(cluster, dirname, 'gcc4', '--debug') + try: + skeleton_gcc4(cluster, dirname, True) + except AssertionError as e: + print(e) + build_script(cluster, dirname, 'gcc4', True) + path = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_debug/build/model_zoo/lbann' % (dirname, cluster) + assert os.path.exists(path) + def test_compiler_gcc7_release(cluster, dirname): - #skeleton_gcc7(cluster, dirname, False) - if cluster == 'catalyst': - build_script(cluster, dirname, 'gcc7', '') - else: - pytest.skip('Unsupported Cluster %s' % cluster) + try: + skeleton_gcc7(cluster, dirname, False) + except AssertionError as e: + print(e) + build_script(cluster, dirname, 'gcc7', False) + path = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_rel/build/model_zoo/lbann' % (dirname, cluster) + if not os.path.exists(path): + path = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) + assert os.path.exists(path) + def test_compiler_gcc7_debug(cluster, dirname): - #skeleton_gcc7(cluster, dirname, True) - if cluster == 'catalyst': - build_script(cluster, dirname, 'gcc7', '--debug') - else: - pytest.skip('Unsupported Cluster %s' % cluster) + try: + skeleton_gcc7(cluster, dirname, True) + except AssertionError as e: + print(e) + build_script(cluster, dirname, 'gcc7', True) + path = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_debug/build/model_zoo/lbann' % (dirname, cluster) + if not os.path.exists(path): + path = '%s/build/gnu.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) + assert os.path.exists(path) + def test_compiler_intel18_release(cluster, dirname): - #skeleton_intel18(cluster, dirname, False) - if cluster == 'catalyst': - build_script(cluster, dirname, 'intel', '') - else: - pytest.skip('Unsupported Cluster %s' % cluster) + try: + skeleton_intel18(cluster, dirname, False) + except AssertionError as e: + print(e) + build_script(cluster, dirname, 'intel18', False) + path = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) + if not os.path.exists(path): + path = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) + assert os.path.exists(path) + def test_compiler_intel18_debug(cluster, dirname): - #skeleton_intel18(cluster, dirname, True) - if cluster == 'catalyst': - build_script(cluster, dirname, 'intel', '--debug') - else: - pytest.skip('Unsupported Cluster %s' % cluster) + try: + skeleton_intel18(cluster, dirname, True) + except AssertionError as e: + print(e) + build_script(cluster, dirname, 'intel18', True) + path = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) + if not os.path.exists(path): + path = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) + assert os.path.exists(path) + def skeleton_clang4(cluster, dir_name, debug, should_log=False): if cluster in ['catalyst', 'quartz']: spack_skeleton(dir_name, 'clang@4.0.0', 'mvapich2@2.2', debug, should_log) build_skeleton(dir_name, 'clang@4.0.0', debug, should_log) else: - 
pytest.skip('Unsupported Cluster %s' % cluster) + e = 'skeleton_clang4: Unsupported Cluster %s' % cluster + print('Skip - ' + e) + pytest.skip(e) + def skeleton_gcc4(cluster, dir_name, debug, should_log=False): - if cluster in ['catalyst', 'quartz', 'ray']: - if cluster in ['catalyst','quartz']: - mpi = 'mvapich2@2.2' - elif cluster in ['pascal', 'surface']: - mpi = 'mvapich2@2.2+cuda' - elif cluster == 'ray': - mpi = 'spectrum-mpi@2018.04.27' - else: - raise Exception('Unsupported Cluster %s' % cluster) - spack_skeleton(dir_name, 'gcc@4.9.3', mpi, debug, should_log) - build_skeleton(dir_name, 'gcc@4.9.3', debug, should_log) + if cluster in ['quartz']: # Taking out 'catalyst' + mpi = 'mvapich2@2.2' + elif cluster in ['surface']: # Taking out 'pascal' + mpi = 'mvapich2@2.2+cuda' + elif cluster == 'ray': + mpi = 'spectrum-mpi@2018.04.27' else: - pytest.skip('Unsupported Cluster %s' % cluster) + e = 'skeleton_gcc4: Unsupported Cluster %s' % cluster + print('Skip - ' + e) + pytest.skip(e) + spack_skeleton(dir_name, 'gcc@4.9.3', mpi, debug, should_log) + build_skeleton(dir_name, 'gcc@4.9.3', debug, should_log) + def skeleton_gcc7(cluster, dir_name, debug, should_log=False): if cluster in ['catalyst', 'quartz']: spack_skeleton(dir_name, 'gcc@7.1.0', 'mvapich2@2.2', debug, should_log) build_skeleton(dir_name, 'gcc@7.1.0', debug, should_log) else: - pytest.skip('Unsupported Cluster %s' % cluster) + e = 'skeleton_gcc7: Unsupported Cluster %s' % cluster + print('Skip - ' + e) + pytest.skip(e) + def skeleton_intel18(cluster, dir_name, debug, should_log=False): - if cluster in ['catalyst', 'quartz']: + if cluster in ['quartz']: # Taking out 'catalyst' spack_skeleton(dir_name, 'intel@18.0.0', 'mvapich2@2.2', debug, should_log) build_skeleton(dir_name, 'intel@18.0.0', debug, should_log) else: - pytest.skip('Unsupported Cluster %s' % cluster) + e = 'skeleton_intel18: Unsupported Cluster %s' % cluster + print('Skip - ' + e) + pytest.skip(e) + def spack_skeleton(dir_name, compiler, mpi_lib, debug, should_log): compiler_underscored = re.sub('[@\.]', '_', compiler) @@ -130,6 +188,7 @@ def spack_skeleton(dir_name, compiler, mpi_lib, debug, should_log): print('%s: %s' % (error_file_name, line)) assert return_code == 0 + def build_skeleton(dir_name, compiler, debug, should_log): compiler_underscored = re.sub('[@\.]', '_', compiler) if debug: @@ -142,7 +201,8 @@ def build_skeleton(dir_name, compiler, debug, should_log): #mpi_lib = mpi_lib.replace('@', '-') cluster = re.sub('[0-9]+', '', subprocess.check_output('hostname'.split()).strip()) # For reference: - # Commenting out for now. These additions to path name will likely return one day, so I am not removing them entirely + # Commenting out for now. These additions to path name will likely return + # one day, so I am not removing them entirely. 
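Note: the per-compiler artifact paths that these compiler tests assert all follow one naming convention. A minimal sketch that restates it (the helper name spack_build_path is hypothetical; the layout comes from the paths in get_spack_exes above):

    def spack_build_path(dirname, cluster, compiler, debug):
        # 'clang@4.0.0' -> 'clang-4.0.0', matching the directory names in get_spack_exes
        compiler_dashed = compiler.replace('@', '-')
        build = 'debug' if debug else 'rel'
        return ('%s/bamboo/compiler_tests/builds/%s_%s_%s/build/model_zoo/lbann'
                % (dirname, cluster, compiler_dashed, build))

    # spack_build_path('/lbann', 'catalyst', 'gcc@7.1.0', False)
    # -> '/lbann/bamboo/compiler_tests/builds/catalyst_gcc-7.1.0_rel/build/model_zoo/lbann'
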
# x86_64 <=> catalyst, pascal, quartz, surface # ppc64le <=> ray #architecture = subprocess.check_output('uname -m'.split()).strip() @@ -164,3 +224,27 @@ def build_skeleton(dir_name, compiler, debug, should_log): for line in error_file: print('%s: %s' % (error_file_name, line)) assert return_code == 0 + + +def build_script(cluster, dirname, compiler, debug): + print(('Running build_script for cluster={cluster},' + ' compiler={compiler}, debug={debug}.').format( + cluster=cluster, compiler=compiler, debug=debug)) + if debug: + build = 'debug' + debug_flag = '--debug' + else: + build = 'release' + debug_flag = '' + output_file_name = '%s/bamboo/compiler_tests/output/%s_%s_%s_build_script_output.txt' % (dirname, cluster, compiler, build) + error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_%s_build_script_error.txt' % (dirname, cluster, compiler, build) + command = '%s/bamboo/compiler_tests/build_script_specific.sh --compiler %s %s> %s 2> %s' % (dirname, compiler, debug_flag, output_file_name, error_file_name) + return_code = os.system(command) + if return_code != 0: + output_file = open(output_file_name, 'r') + for line in output_file: + print('%s: %s' % (output_file_name, line)) + error_file = open(error_file_name, 'r') + for line in error_file: + print('%s: %s' % (error_file_name, line)) + assert return_code == 0 diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py index 7a3cea95c71..0d0a4dda68e 100644 --- a/bamboo/integration_tests/common_code.py +++ b/bamboo/integration_tests/common_code.py @@ -3,6 +3,7 @@ import tools import collections, csv, os, pprint, re, time + # Set up the command ########################################################## def get_command(cluster, dir_name, model_folder, model_name, executable, output_file_name, error_file_name, compiler_name, weekly=False): @@ -23,7 +24,8 @@ def get_command(cluster, dir_name, model_folder, model_name, executable, optimizer_name='adagrad', output_file_name=output_file_name, error_file_name=error_file_name) elif model_name in ['conv_autoencoder_mnist', 'lenet_mnist']: - if (model_name == 'lenet_mnist') and (compiler_name in ['clang4', 'intel18']): + if (model_name == 'lenet_mnist') and \ + (compiler_name in ['clang4', 'intel18']): partition = 'pbatch' time_limit = 600 else: @@ -33,10 +35,10 @@ def get_command(cluster, dir_name, model_folder, model_name, executable, num_processes = 20 else: num_processes = 2 - command = tools.get_command( + command = tools.get_command( cluster=cluster, executable=executable, num_nodes=1, - partition=partition, time_limit=time_limit, num_processes=num_processes, - dir_name=dir_name, + partition=partition, time_limit=time_limit, + num_processes=num_processes, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', data_reader_name='mnist', model_folder=model_folder, model_name=model_name, num_epochs=5, optimizer_name='adagrad', @@ -47,18 +49,23 @@ def get_command(cluster, dir_name, model_folder, model_name, executable, # Run LBANN ################################################################### -def run_lbann(command, model_name, output_file_name, error_file_name, should_log=False): + +def run_lbann(command, model_name, output_file_name, error_file_name, + should_log=False): print('About to run: %s' % command) - print('%s began waiting in the queue at ' % model_name + time.strftime('%H:%M:%S', time.localtime())) + print('%s began waiting in the queue at ' % model_name + + time.strftime('%H:%M:%S', time.localtime())) output_value = 
os.system(command) - print('%s finished at ' % model_name + time.strftime('%H:%M:%S', time.localtime())) + print('%s finished at ' % model_name + + time.strftime('%H:%M:%S', time.localtime())) lbann_exceptions = [] timed_out = False if should_log or (output_value != 0): output_file = open(output_file_name, 'r') for line in output_file: print('%s: %s' % (output_file_name, line)) - is_match = re.search('This lbann_exception is about to be thrown:(.*)', line) + is_match = re.search( + 'This lbann_exception is about to be thrown:(.*)', line) if is_match: lbann_exceptions.append(is_match.group(1)) is_match = re.search('CANCELLED AT (.*) DUE TO TIME LIMIT', line) @@ -67,15 +74,22 @@ def run_lbann(command, model_name, output_file_name, error_file_name, should_log error_file = open(error_file_name, 'r') for line in error_file: print('%s: %s' % (error_file_name, line)) + is_match = re.search('LBANN error on (.*)', line) + if is_match: + lbann_exceptions.append(is_match.group(1)) if output_value != 0: - error_string = 'Model %s crashed with output_value=%d, timed_out=%s, and lbann exceptions=%s. Command was: %s' % ( - model_name, output_value, str(timed_out), str(collections.Counter(lbann_exceptions)), command) + error_string = ('Model %s crashed with output_value=%d, timed_out=%s,' + ' and lbann exceptions=%s. Command was: %s') % ( + model_name, output_value, str(timed_out), + str(collections.Counter(lbann_exceptions)), command) raise Exception(error_string) return output_value # Extract data from output #################################################### -def populate_data_dict_epoch(regex, line, data_field, data_fields, data_dict, model_id): + +def populate_data_dict_epoch(regex, line, data_field, data_fields, data_dict, + model_id): is_match = re.search(regex, line) if is_match and (data_field in data_fields): if model_id not in data_dict[data_field].keys(): @@ -84,7 +98,9 @@ def populate_data_dict_epoch(regex, line, data_field, data_fields, data_dict, mo value = float(is_match.group(2)) data_dict[data_field][model_id][epoch_id] = value -def populate_data_dict_overall(regex, line, data_field, data_fields, data_dict, model_id): + +def populate_data_dict_overall(regex, line, data_field, data_fields, data_dict, + model_id): is_match = re.search(regex, line) if is_match and (data_field in data_fields): if model_id not in data_dict[data_field].keys(): @@ -92,6 +108,7 @@ def populate_data_dict_overall(regex, line, data_field, data_fields, data_dict, value = float(is_match.group(1)) data_dict[data_field][model_id]['overall'] = value + # data_dict[data_field][model_id][epoch_id] = float # data_fields is the list or set of data we're interested in. 
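Note: extract_data below fills a nested dict keyed data_field -> model_id -> epoch_id -> value. A minimal sketch of that population step, using an invented sample log line:

    import re

    data_dict = {'training_objective_function': {}}
    model_id = '0'  # captured earlier from a '^Model ([0-9]+)' match
    line = 'training epoch 3 objective function : 0.192867'  # invented sample
    m = re.search('training epoch ([0-9]+) objective function : ([0-9.]+)', line)
    if m:
        epoch_id, value = m.group(1), float(m.group(2))
        data_dict['training_objective_function'].setdefault(model_id, {})[epoch_id] = value
    # data_dict == {'training_objective_function': {'0': {'3': 0.192867}}}
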
def extract_data(output_file_name, data_fields, should_log): @@ -102,24 +119,31 @@ def extract_data(output_file_name, data_fields, should_log): for line in output_file: if should_log: - print('%s: %s' % (output_file_name, line)) + print('extract_data: %s: %s' % (output_file_name, line)) # Check if line is reporting model results is_model = re.search('^Model ([0-9]+)', line) + if not is_model: + is_model = re.search('^model([0-9]+)', line) if is_model: + print('extract_data: is_model={is_model}'.format(is_model=is_model)) model_id = is_model.group(1) regex = 'training epoch ([0-9]+) objective function : ([0-9.]+)' data_field = 'training_objective_function' - populate_data_dict_epoch(regex, line, data_field, data_fields, data_dict, model_id) + populate_data_dict_epoch(regex, line, data_field, data_fields, + data_dict, model_id) regex = 'training epoch ([0-9]+) run time : ([0-9.]+)' data_field = 'training_run_time' - populate_data_dict_epoch(regex, line, data_field, data_fields, data_dict, model_id) + populate_data_dict_epoch(regex, line, data_field, data_fields, + data_dict, model_id) regex = 'training epoch ([0-9]+) mini-batch time statistics : ([0-9.]+)s mean, ([0-9.]+)s max, ([0-9.]+)s min, ([0-9.]+)s stdev' is_match = re.search(regex, line) if is_match: + print('extract_data: is_mini-batch time statistics={is_match}'.format( + is_match=is_match)) epoch_id = is_match.group(1) mean_value = float(is_match.group(2)) max_value = float(is_match.group(3)) @@ -129,53 +153,66 @@ def extract_data(output_file_name, data_fields, should_log): if data_field in data_fields: if model_id not in data_dict[data_field].keys(): data_dict[data_field][model_id] = {} + print('extract_data: mean_value={mv}'.format(mv=mean_value)) data_dict[data_field][model_id][epoch_id] = mean_value data_field = 'training_max' if data_field in data_fields: if model_id not in data_dict[data_field].keys(): data_dict[data_field][model_id] = {} + print('extract_data: max_value={mv}'.format(mv=max_value)) data_dict[data_field][model_id][epoch_id] = max_value data_field = 'training_min' if data_field in data_fields: if model_id not in data_dict[data_field].keys(): data_dict[data_field][model_id] = {} + print('extract_data: min_value={mv}'.format(mv=min_value)) data_dict[data_field][model_id][epoch_id] = min_value data_field = 'training_stdev' if data_field in data_fields: if model_id not in data_dict[data_field].keys(): data_dict[data_field][model_id] = {} + print('extract_data: stdev={sv}'.format(sv=stdev_value)) data_dict[data_field][model_id][epoch_id] = stdev_value regex = 'test categorical accuracy : ([0-9.]+)' data_field = 'test_accuracy' - populate_data_dict_overall(regex, line, data_field, data_fields, data_dict, model_id) + populate_data_dict_overall(regex, line, data_field, data_fields, + data_dict, model_id) output_file.close() if should_log: + print('extract_data: Extracted Data below:') pprint.pprint(data_dict) return data_dict # Skeleton #################################################################### -def skeleton(cluster, dir_name, executable, model_folder, model_name, data_fields, should_log, compiler_name=None, weekly=False): - if compiler_name == None: + +def skeleton(cluster, dir_name, executable, model_folder, model_name, + data_fields, should_log, compiler_name=None, weekly=False): + if compiler_name is None: output_file_name = '%s/bamboo/integration_tests/output/%s_output.txt' % (dir_name, model_name) error_file_name = '%s/bamboo/integration_tests/error/%s_error.txt' % (dir_name, model_name) else: - 
output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - command = get_command(cluster, dir_name, model_folder, model_name, executable, output_file_name, error_file_name, compiler_name, weekly=weekly) - run_lbann(command, model_name, output_file_name, error_file_name, should_log) # Don't need return value + output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (dir_name, model_name, compiler_name) + error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (dir_name, model_name, compiler_name) + command = get_command( + cluster, dir_name, model_folder, model_name, executable, + output_file_name, error_file_name, compiler_name, weekly=weekly) + run_lbann(command, model_name, output_file_name, + error_file_name, should_log) # Don't need return value return extract_data(output_file_name, data_fields, should_log) # Misc. functions ############################################################ + # csv_dict[row_header][column_header] = float def csv_to_dict(csv_path): - with open(csv_path, 'r') as csv_file: - reader = csv.reader(csv_file, skipinitialspace=True) - column_headers = reader.next() - values = {} - for row in reader: - row_header = row[0] - values[row_header] = dict(zip(column_headers[1:], map(float, row[1:]))) - return values + with open(csv_path, 'r') as csv_file: + reader = csv.reader(csv_file, skipinitialspace=True) + column_headers = reader.next() + values = {} + for row in reader: + row_header = row[0] + values[row_header] = dict( + zip(column_headers[1:], map(float, row[1:]))) + return values diff --git a/bamboo/integration_tests/conftest.py b/bamboo/integration_tests/conftest.py index 4039eeb7dac..da2ffc127be 100644 --- a/bamboo/integration_tests/conftest.py +++ b/bamboo/integration_tests/conftest.py @@ -1,31 +1,15 @@ -import pytest, os, re, subprocess +import sys +sys.path.insert(0, '../common_python') +import tools +import pytest, re, subprocess -def pytest_addoption(parser): - cluster = re.sub('[0-9]+', '', subprocess.check_output('hostname'.split()).strip()) - default_dirname = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() - default_exes = {} - default_exes['default'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if cluster in ['catalyst', 'quartz']: - default_exes['clang4'] = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) #'%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - #default_exes['gcc4'] = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_rel/build/model_zoo/lbann' % (default_dirname, cluster) - default_exes['gcc7'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) #'%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - default_exes['intel18'] = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) #'%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - - default_exes['clang4_debug'] = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) #'%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) - #default_exes['gcc4_debug'] = 
'%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_debug/build/model_zoo/lbann' % (default_dirname, cluster) - default_exes['gcc7_debug'] = '%s/build/gnu.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) #'%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) - default_exes['intel18_debug'] = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) #'%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) - - if cluster == 'ray': - default_exes['clang4'] = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - default_exes['gcc4'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) #'%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_rel/build/model_zoo/lbann' % (default_dirname, cluster) - default_exes['clang4_debug'] = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - default_exes['gcc4_debug'] = '%s/build/gnu.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) #'%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_debug/build/model_zoo/lbann' % (default_dirname, cluster) - - if cluster in ['surface', 'pascal']: - default_exes['gcc4'] = default_exes['default'] - default_exes['gcc4_debug'] = '%s/build/gnu.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) +def pytest_addoption(parser): + cluster = re.sub('[0-9]+', '', subprocess.check_output( + 'hostname'.split()).strip()) + default_dirname = subprocess.check_output( + 'git rev-parse --show-toplevel'.split()).strip() + default_exes = tools.get_default_exes(default_dirname, cluster) parser.addoption('--cluster', action='store', default=cluster, help='--cluster= to specify the cluster being run on, for the purpose of determining which commands to use. 
Default the current cluster') @@ -40,26 +24,32 @@ def pytest_addoption(parser): # For local testing only parser.addoption('--exe', action='store', help='--exe=') + @pytest.fixture def cluster(request): return request.config.getoption('--cluster') + @pytest.fixture def debug(request): return request.config.getoption('--debug') + @pytest.fixture def dirname(request): return request.config.getoption('--dirname') + @pytest.fixture def exes(request): return request.config.getoption('--exes') + @pytest.fixture def weekly(request): return request.config.getoption('--weekly') + @pytest.fixture def exe(request): return request.config.getoption('--exe') diff --git a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv index 2234e14d5ab..32551e8e70b 100644 --- a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 353.48, 7.07, 21.57, 1.24, 4.21, 0.00 -alexnet_weekly, 882.26, 1.78, 4.68, 0.95, 0.21, 2.49 -cache_alexnet, 623.30, 1.27, 4.98, 0.66, 2.24, 0.57 -lenet_mnist, 33.55, 0.04, 0.09, 0.04, 0.01, 98.96 +alexnet_nightly, 56.00, 1.20, 5.00, 0.80, 0.40, 0.00 +alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +lenet_mnist, 88.00, 0.12, 0.40, 0.10, 0.09, 98.40 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc4/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/gcc4/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 80c12b2b0ed..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/gcc4/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207480 -1, 0.194710 -2, 0.193224 -3, 0.192867 -4, 0.192758 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc4/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/gcc4/expected_performance.csv deleted file mode 100644 index 639e20aa5f4..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/gcc4/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 63.18, 1.27, 3.11, 0.79, 0.55, 0.00 -alexnet_weekly, 565.30, 1.14, 3.83, 0.76, 0.30, 3.11 -cache_alexnet, 623.30, 1.27, 4.98, 0.66, 2.24, 0.57 -lenet_mnist, 15.61, 0.02, 0.08, 0.02, 0.01, 98.96 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv index b315574f51d..d3ac7caa6b4 100644 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 71.14, 1.43, 3.20, 0.98, 0.51, 0.00 -alexnet_weekly, 691.96, 1.40, 4.53, 1.09, 0.22, 1.05 -cache_alexnet, 623.30, 1.27, 4.98, 0.66, 2.24, 0.57 -lenet_mnist, 15.51, 0.02, 0.06, 0.02, 0.01, 99.00 +alexnet_nightly, 57.00, 1.11, 
4.80, 0.37, 1.20, 0.00 +alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +lenet_mnist, 64.00, 0.10, 0.40, 0.08, 0.04, 98.92 diff --git a/bamboo/integration_tests/expected_values/catalyst/intel18/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/intel18/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 003794fd557..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/intel18/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.675652, 0.608574 -1, 0.590008, 0.590008 -2, 0.587484, 0.587484 -3, 0.586305, 0.586305 -4, 0.585585, 0.585585 -5, 0.585036, 0.585036 -6, 0.584688, 0.584688 -7, 0.584348, 0.584348 -8, 0.584041, 0.584041 -9, 0.583865, 0.583865 -10, 0.583665, 0.583665 -11, 0.583521, 0.583521 -12, 0.583303, 0.583303 -13, 0.58328, 0.58328 -14, 0.5832, 0.5832 -15, 0.583134, 0.583134 -16, 0.583052, 0.583052 -17, 0.583039, 0.583039 -18, 0.582954, 0.582954 -19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/catalyst/intel18/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/intel18/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 80c12b2b0ed..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/intel18/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207480 -1, 0.194710 -2, 0.193224 -3, 0.192867 -4, 0.192758 diff --git a/bamboo/integration_tests/expected_values/catalyst/intel18/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/intel18/expected_performance.csv deleted file mode 100644 index 4fc534169fe..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/intel18/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 49.54, 0.96, 3.21, 1.00, 0.62, 0.00 -alexnet_weekly, 402.50, 0.82, 3.36, 0.47, 0.34, 3.27 -cache_alexnet, 623.30, 1.27, 4.98, 0.66, 2.24, 0.57 -lenet_mnist, 20.02, 0.03, 0.09, 0.03, 0.01, 98.91 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc4/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv similarity index 100% rename from bamboo/integration_tests/expected_values/catalyst/gcc4/expected_conv_autoencoder_imagenet_objective_functions.csv rename to bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv new file mode 100644 index 00000000000..8bcf25bb71d --- /dev/null +++ b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv @@ -0,0 +1,6 @@ +Epoch_number, training_objective_function +0, 0.207514 +1, 0.194710 +2, 0.193221 +3, 0.192864 +4, 0.192755 diff --git 
a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv new file mode 100644 index 00000000000..cca3451efd2 --- /dev/null +++ b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv @@ -0,0 +1,5 @@ +Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy +alexnet_nightly, 51.00, 1.20, 4.00, 0.50, 0.40, 0.17 +alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +lenet_mnist, 9.00, 0.01, 6.00, 0.01, 0.40, 98.40 diff --git a/bamboo/integration_tests/expected_values/ray/gcc4/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/ray/gcc4/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 28c4d8c9e98..00000000000 --- a/bamboo/integration_tests/expected_values/ray/gcc4/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.608574, 0.608574 -1, 0.590008, 0.590008 -2, 0.587484, 0.587484 -3, 0.586305, 0.586305 -4, 0.585585, 0.585585 -5, 0.585036, 0.585036 -6, 0.584688, 0.584688 -7, 0.584348, 0.584348 -8, 0.584041, 0.584041 -9, 0.583865, 0.583865 -10, 0.583665, 0.583665 -11, 0.583521, 0.583521 -12, 0.583303, 0.583303 -13, 0.58328, 0.58328 -14, 0.5832, 0.5832 -15, 0.583134, 0.583134 -16, 0.583052, 0.583052 -17, 0.583039, 0.583039 -18, 0.582954, 0.582954 -19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/ray/gcc4/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/ray/gcc4/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 96a9ed9a8b7..00000000000 --- a/bamboo/integration_tests/expected_values/ray/gcc4/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207587 -1, 0.194595 -2, 0.193141 -3, 0.192808 -4, 0.192716 diff --git a/bamboo/integration_tests/expected_values/ray/gcc4/expected_performance.csv b/bamboo/integration_tests/expected_values/ray/gcc4/expected_performance.csv deleted file mode 100644 index e3331534d90..00000000000 --- a/bamboo/integration_tests/expected_values/ray/gcc4/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 47.42, 0.95, 3.34, 0.54, 0.59, 0.00 -alexnet_weekly, 623.30, 1.27, 4.98, 0.66, 2.24, 0.57 -cache_alexnet, 623.30, 1.27, 4.98, 0.66, 2.24, 0.57 -lenet_mnist, 260.85, 0.31, 0.88, 0.28, 0.03, 98.66 diff --git a/bamboo/integration_tests/expected_values/surface/gcc4/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/surface/gcc4/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 32d30822dce..00000000000 --- a/bamboo/integration_tests/expected_values/surface/gcc4/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.608574, 0.608574 -1, 0.590008, 0.590008 -2, 0.587484, 0.587484 -3, 0.586305, 0.586305 -4, 0.585585, 0.585585 -5, 0.585036, 0.585036 -6, 0.584688, 0.584688 -7, 0.584348, 0.584348 -8, 0.584041, 0.584041 
-9, 0.583865, 0.583865 -10, 0.583665, 0.583665 -11, 0.583521, 0.583521 -12, 0.583303, 0.583303 -13, 0.58328, 0.58328 -14, 0.5832, 0.5832 -15, 0.583134, 0.583134 -16, 0.583052, 0.583052 -17, 0.583039, 0.583039 -18, 0.582954, 0.582954 -19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/surface/gcc4/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/surface/gcc4/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 96a9ed9a8b7..00000000000 --- a/bamboo/integration_tests/expected_values/surface/gcc4/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207587 -1, 0.194595 -2, 0.193141 -3, 0.192808 -4, 0.192716 diff --git a/bamboo/integration_tests/expected_values/surface/gcc4/expected_performance.csv b/bamboo/integration_tests/expected_values/surface/gcc4/expected_performance.csv deleted file mode 100644 index 0e8c11d0edc..00000000000 --- a/bamboo/integration_tests/expected_values/surface/gcc4/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 39.60, 0.80, 5.15, 0.37, 0.69, 0.00 -alexnet_weekly, 623.30, 1.27, 7.37, 0.66, 2.24, 0.15 -cache_alexnet, 623.30, 1.27, 4.98, 0.66, 2.24, 0.57 -lenet_mnist, 21.91, 0.04, 1.95, 0.04, 0.07, 98.66 diff --git a/bamboo/integration_tests/test_integration_autoencoders.py b/bamboo/integration_tests/test_integration_autoencoders.py index 4fbe0172d0f..5f021ce6f53 100644 --- a/bamboo/integration_tests/test_integration_autoencoders.py +++ b/bamboo/integration_tests/test_integration_autoencoders.py @@ -1,36 +1,47 @@ import pytest import common_code -def error_if(f, f_symbol, data_field, actual_values, expected_values, model_name, errors, all_values, frequency_str): + +def error_if(f, f_symbol, data_field, actual_values, expected_values, + model_name, errors, all_values, frequency_str): d = actual_values[data_field] for model_id in sorted(d.keys()): for epoch_id in sorted(d[model_id].keys()): actual_value = d[model_id][epoch_id] expected_value = expected_values[epoch_id][data_field + frequency_str] - if actual_value == None: + if actual_value is None: errors.append('d[%s][%s] == None' % (model_id, epoch_id)) - if expected_value == None: + if expected_value is None: errors.append('d[%s]([%s] == None' % (model_id, epoch_id)) if f(actual_value, expected_value): - errors.append('%f %s %f %s Model %s Epoch %s %s' % (actual_value, f_symbol, expected_value, model_name, model_id, epoch_id, data_field)) - all_values.append('%f %s Model %s Epoch %s %s' % (actual_value, model_name, model_id, epoch_id, data_field)) + errors.append('%f %s %f %s Model %s Epoch %s %s' % ( + actual_value, f_symbol, expected_value, model_name, model_id, + epoch_id, data_field)) + all_values.append('%f %s Model %s Epoch %s %s' % ( + actual_value, model_name, model_id, epoch_id, data_field)) + -def run_tests(actual_objective_functions, model_name, dir_name, cluster, should_log, compiler_name, frequency_str=''): - expected_objective_functions = common_code.csv_to_dict('%s/bamboo/integration_tests/expected_values/%s/%s/expected_%s_objective_functions.csv' % (dir_name, cluster, compiler_name, model_name)) +def run_tests(actual_objective_functions, model_name, dir_name, cluster, + should_log, compiler_name, frequency_str=''): + expected_objective_functions = common_code.csv_to_dict( + 
'%s/bamboo/integration_tests/expected_values/%s/%s/expected_%s_objective_functions.csv' % (dir_name, cluster, compiler_name, model_name)) errors = [] all_values = [] tolerance = 0.05 # Are we within tolerance * expected_value? - outside_tolerance = lambda x,y: abs(x - y) > abs(tolerance * y) - error_if(outside_tolerance, '!=', 'training_objective_function', actual_objective_functions, expected_objective_functions, model_name, errors, all_values, frequency_str) + outside_tolerance = lambda x, y: abs(x - y) > abs(tolerance * y) + error_if(outside_tolerance, '!=', 'training_objective_function', + actual_objective_functions, expected_objective_functions, + model_name, errors, all_values, frequency_str) print('Errors for: %s %s (%d)' % (model_name, compiler_name, len(errors))) for error in errors: print(error) if should_log: - print('All values for: %s %s (%d)' % (model_name, compiler_name, len(all_values))) + print('All values for: %s %s (%d)' % (model_name, compiler_name, + len(all_values))) for value in all_values: print(value) assert errors == [] @@ -39,81 +50,53 @@ def run_tests(actual_objective_functions, model_name, dir_name, cluster, should_ 'training_objective_function' ] -def skeleton_autoencoder_mnist(cluster, dir_name, executables, compiler_name): - if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) - model_folder = 'models/autoencoder_mnist' - model_name = 'conv_autoencoder_mnist' - should_log=False - actual_objective_functions = common_code.skeleton(cluster, dir_name, executables[compiler_name], model_folder, model_name, DATA_FIELDS, should_log, compiler_name=compiler_name) - run_tests(actual_objective_functions, model_name, dir_name, cluster, should_log, compiler_name) - -def skeleton_autoencoder_imagenet(cluster, dir_name, executables, compiler_name, weekly): - if cluster == 'surface': - pytest.skip('skeleton_autoencoder_imagenet does not run on surface') + +def skeleton_autoencoder_imagenet(cluster, dir_name, executables, compiler_name, + weekly): + if cluster in ['surface', 'pascal']: + e = 'skeleton_autoencoder_imagenet: does not run on GPU' + print('Skip - ' + e) + pytest.skip(e) if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_autoencoder_imagenet: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) model_folder = 'models/autoencoder_imagenet' model_name = 'conv_autoencoder_imagenet' should_log = False - actual_objective_functions = common_code.skeleton(cluster, dir_name, executables[compiler_name], model_folder, model_name, DATA_FIELDS, should_log, compiler_name=compiler_name, weekly=weekly) + actual_objective_functions = common_code.skeleton( + cluster, dir_name, executables[compiler_name], model_folder, model_name, + DATA_FIELDS, should_log, compiler_name=compiler_name, weekly=weekly) frequency_str = '_nightly' if weekly: frequency_str = '_weekly' - run_tests(actual_objective_functions, model_name, dir_name, cluster, should_log, compiler_name, frequency_str) - -def test_integration_autoencoder_mnist_clang4(cluster, dirname, exes): - if cluster in ['catalyst', 'quartz']: - pytest.skip('FIXME') - # Catalyst Errors: - # 0.219298 != 0.207480 conv_autoencoder_mnist Model 0 Epoch 0 training_objective_function - skeleton_autoencoder_mnist(cluster, dirname, exes, 'clang4') - -def test_integration_autoencoder_imagenet_clang4(cluster, dirname, exes, weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'clang4', 
weekly) - -def test_integration_autoencoder_mnist_gcc4(cluster, dirname, exes): - if cluster in ['catalyst', 'quartz', 'surface']: - pytest.skip('FIXME') - # Catalyst Errors: - # 0.219298 != 0.207480 conv_autoencoder_mnist Model 0 Epoch 0 training_objective_function - # Surface Errors: - # 0.053411 != 0.207587 conv_autoencoder_mnist Model 0 Epoch 0 training_objective_function - # 0.026719 != 0.194595 conv_autoencoder_mnist Model 0 Epoch 1 training_objective_function - # 0.024882 != 0.193141 conv_autoencoder_mnist Model 0 Epoch 2 training_objective_function - # 0.023039 != 0.192808 conv_autoencoder_mnist Model 0 Epoch 3 training_objective_function - # 0.023243 != 0.192716 conv_autoencoder_mnist Model 0 Epoch 4 training_objective_function - skeleton_autoencoder_mnist(cluster, dirname, exes, 'gcc4') + run_tests(actual_objective_functions, model_name, dir_name, cluster, + should_log, compiler_name, frequency_str) + + +def test_integration_autoencoder_imagenet_clang4(cluster, dirname, exes, + weekly): + skeleton_autoencoder_imagenet(cluster, dirname, exes, 'clang4', weekly) + def test_integration_autoencoder_imagenet_gcc4(cluster, dirname, exes, weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc4', weekly) - -def test_integration_autoencoder_mnist_gcc7(cluster, dirname, exes): - if cluster in ['catalyst', 'quartz']: - pytest.skip('FIXME') - # Catalyst Errors: - # 0.219383 != 0.207514 conv_autoencoder_mnist Model 0 Epoch 0 training_objective_function - skeleton_autoencoder_mnist(cluster, dirname, exes, 'gcc7') + skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc4', weekly) + def test_integration_autoencoder_imagenet_gcc7(cluster, dirname, exes, weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc7', weekly) - -def test_integration_autoencoder_mnist_intel18(cluster, dirname, exes): - skeleton_autoencoder_mnist(cluster, dirname, exes, 'intel18') + skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc7', weekly) + -def test_integration_autoencoder_imagenet_intel18(cluster, dirname, exes, weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'intel18', weekly) +def test_integration_autoencoder_imagenet_intel18(cluster, dirname, exes, + weekly): + skeleton_autoencoder_imagenet(cluster, dirname, exes, 'intel18', weekly) -# Run with python -m pytest -s test_integration_autoencoder.py -k 'test_integration_autoencoder_mnist_exe' --exe= -def test_integration_autoencoder_mnist_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} - skeleton_autoencoder_mnist(cluster, dirname, exes, 'exe', True) # Run with python -m pytest -s test_integration_autoencoder.py -k 'test_integration_autoencoder_imagenet_exe' --exe= def test_integration_autoencoder_imagenet_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_integration_autoencoder_imagenet_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_autoencoder_imagenet(cluster, dirname, exes, 'exe', True) diff --git a/bamboo/integration_tests/test_integration_debug.py b/bamboo/integration_tests/test_integration_debug.py index 1744e3243d4..c205dffb24c 100644 --- a/bamboo/integration_tests/test_integration_debug.py +++ b/bamboo/integration_tests/test_integration_debug.py @@ -2,15 +2,20 @@ sys.path.insert(0, '../common_python') import tools import pytest -import os import common_code -def skeleton_mnist_debug(cluster, dir_name, executables, 
compiler_name, weekly, debug, should_log=False): + +def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly, + debug, should_log=False): # If weekly or debug are true, then run the test. if (not weekly) and (not debug): - pytest.skip('Not doing weekly or debug testing') + e = 'skeleton_mnist_debug: Not doing weekly or debug testing' + print('Skip - ' + e) + pytest.skip(e) if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_mnist_debug: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) model_name = 'lenet_mnist' output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) @@ -24,14 +29,22 @@ def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly, output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name) assert output_value == 0 -def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, debug, should_log=False): + +def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, + debug, should_log=False): # If weekly or debug are true, then run the test. if (not weekly) and (not debug): - pytest.skip('Not doing weekly or debug testing') + e = 'skeleton_cifar_debug: Not doing weekly or debug testing' + print('Skip - ' + e) + pytest.skip(e) if cluster == 'ray': - pytest.skip('cifar not operational on Ray') + e = 'skeleton_cifar_debug: cifar not operational on Ray' + print('Skip - ' + e) + pytest.skip(e) if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_cifar_debug: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) model_name = 'autoencoder_cifar10' output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) @@ -46,40 +59,54 @@ def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name) assert output_value == 0 + def test_integration_mnist_clang4_debug(cluster, dirname, exes, weekly, debug): skeleton_mnist_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug) + def test_integration_cifar_clang4_debug(cluster, dirname, exes, weekly, debug): skeleton_cifar_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug) + def test_integration_mnist_gcc4_debug(cluster, dirname, exes, weekly, debug): skeleton_mnist_debug(cluster, dirname, exes, 'gcc4_debug', weekly, debug) + def test_integration_cifar_gcc4_debug(cluster, dirname, exes, weekly, debug): skeleton_cifar_debug(cluster, dirname, exes, 'gcc4_debug', weekly, debug) + def test_integration_mnist_gcc7_debug(cluster, dirname, exes, weekly, debug): skeleton_mnist_debug(cluster, dirname, exes, 'gcc7_debug', weekly, debug) + def test_integration_cifar_gcc7_debug(cluster, dirname, exes, weekly, debug): skeleton_cifar_debug(cluster, dirname, exes, 'gcc7_debug', weekly, debug) + def test_integration_mnist_intel18_debug(cluster, dirname, exes, weekly, debug): skeleton_mnist_debug(cluster, dirname, exes, 'intel18_debug', weekly, debug) + def test_integration_cifar_intel18_debug(cluster, dirname, 
exes, weekly, debug): skeleton_cifar_debug(cluster, dirname, exes, 'intel18_debug', weekly, debug) + # Run with python -m pytest -s test_integration_debug.py -k 'test_integration_mnist_exe' --exe= def test_integration_mnist_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_integration_mnist_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_mnist_debug(cluster, dirname, exes, 'exe', True, True) + # Run with python -m pytest -s test_integration_debug.py -k 'test_integration_cifar_exe' --exe= def test_integration_cifar_exe(cluster, dirname, exe): if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + e = 'test_integration_cifar_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_cifar_debug(cluster, dirname, exes, 'exe', True, True) diff --git a/bamboo/integration_tests/test_integration_io_buffers.py b/bamboo/integration_tests/test_integration_io_buffers.py deleted file mode 100644 index 9132b36ba83..00000000000 --- a/bamboo/integration_tests/test_integration_io_buffers.py +++ /dev/null @@ -1,125 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os, sys -import common_code - -def skeleton_io_buffers(cluster, dir_name, executables, compiler_name, weekly): - if not weekly: - pytest.skip('Not doing weekly testing') - if cluster == 'surface': - pytest.skip('skeleton_io_buffers does not run on surface') - if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) - max_mb = 300 - # Printing output from 6*6*2=72 runs of LBANN makes the logs too slow. - # Output from run_lbann is still printed - if there is a failure. - should_log = False - partitioned = 'mnist_partitioned_io' - distributed = 'mnist_distributed_io' - model_names = [partitioned, distributed] - accuracies = {} - errors = [] - all_values = [] - fatal_errors = [] - overall_min_partitioned_accuracy = float('inf') - overall_min_distributed_accuracy = float('inf') - for mini_batch_size in [300, 150, 100, 75, 60, 50]: - num_models = max_mb / mini_batch_size - for procs_per_model in [1, 2, 3, 4, 5, 6]: - num_ranks = procs_per_model * num_models - for model_name in model_names: - output_file_name = '%s/bamboo/integration_tests/output/%s_%d_%d_output.txt' % (dir_name, model_name, mini_batch_size, procs_per_model) - error_file_name = '%s/bamboo/integration_tests/error/%s_%d_%d_error.txt' % (dir_name, model_name, mini_batch_size, procs_per_model) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=2, - num_processes=num_ranks, dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', mini_batch_size=mini_batch_size, - model_folder='tests', model_name=model_name, num_epochs=5, - optimizer_name='adagrad', - processes_per_model=procs_per_model, - output_file_name=output_file_name, error_file_name=error_file_name) - try: - common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) # Don't need return value - accuracy_dict = common_code.extract_data(output_file_name, ['test_accuracy'], should_log) - accuracies[model_name] = accuracy_dict['test_accuracy'] - except Exception: - # We want to keep running to see if any other mini_batch_size & procs_per_model combination crashes. - # However, it is now pointless to compare accuracies. 
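Note: the accuracy comparison in this deleted test, like the outside_tolerance lambda in run_tests above, is a relative-tolerance check; restated standalone for reference:

    def outside_tolerance(actual, expected, tolerance=0.05):
        # True when actual differs from expected by more than 5% of expected
        return abs(actual - expected) > abs(tolerance * expected)

    # outside_tolerance(98.0, 98.4) -> False (within 5%, passes)
    # outside_tolerance(88.0, 98.4) -> True  (flagged as an error)
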
- fatal_errors.append('Crashed running %s with mini_batch_size=%d, procs_per_model=%d' % (model_name, mini_batch_size, procs_per_model)) - # End model name loop - if fatal_errors == []: - partitioned_num_models = len(accuracies[partitioned].keys()) - distributed_num_models = len(accuracies[distributed].keys()) - assert partitioned_num_models == distributed_num_models - - min_partitioned_accuracy = float('inf') - min_distributed_accuracy = float('inf') - for model_num in sorted(accuracies[partitioned].keys()): - partitioned_accuracy = accuracies[partitioned][model_num]['overall'] - distributed_accuracy = accuracies[distributed][model_num]['overall'] - if partitioned_accuracy < min_partitioned_accuracy: - min_partitioned_accuracy = partitioned_accuracy - if distributed_accuracy < min_distributed_accuracy: - min_distributed_accuracy = distributed_accuracy - tolerance = 0.05 - # Are we within tolerance * expected_value? - if abs(partitioned_accuracy - distributed_accuracy) > abs(tolerance * min(partitioned_accuracy, distributed_accuracy)): - errors.append('partitioned = %f != %f = distributed; model_num=%s mini_batch_size=%d procs_per_model=%d' % (partitioned_accuracy, distributed_accuracy, model_num, mini_batch_size, procs_per_model)) - all_values.append('partitioned = %f, %f = distributed; model_num=%s mini_batch_size=%d procs_per_model=%d' % (partitioned_accuracy, distributed_accuracy, model_num, mini_batch_size, procs_per_model)) - # End model_num loop - if min_partitioned_accuracy < overall_min_partitioned_accuracy: - overall_min_partitioned_accuracy = min_partitioned_accuracy - if min_distributed_accuracy < overall_min_distributed_accuracy: - overall_min_distributed_accuracy = min_distributed_accuracy - # End fatal_errors == [] block - # End procs_per_model loop - # End mini_batch_size loop - for fatal_error in fatal_errors: - print(fatal_error) - assert fatal_errors == [] - # If there were no fatal errors, archive the accuracies. 
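Note: the archiving idiom in the block above (and reused in test_integration_performance below) gates on the lbannusr account and an archiving-enabled Bamboo plan before appending one record per build. A condensed sketch; the helper name archive_record is hypothetical:

    import os

    def archive_record(archive_file, values):
        # Only archive from the lbannusr account under an archiving-enabled plan.
        if os.environ.get('LOGNAME') != 'lbannusr':
            return
        plan = os.environ.get('bamboo_planKey')
        if plan not in ['LBANN-NIGHTD', 'LBANN-WD']:
            return
        with open(archive_file, 'a') as archive:
            archive.write(', '.join(str(v) for v in values) + '\n')
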
- if os.environ['LOGNAME'] == 'lbannusr': - key = 'bamboo_planKey' - if key in os.environ: - plan = os.environ[key] - if plan in ['LBANN-NIGHTD', 'LBANN-WD']: - archive_file = '/usr/workspace/wsb/lbannusr/archives/%s/%s/%s/io_buffers.txt' % (plan, cluster, compiler_name) - with open(archive_file, 'a') as archive: - archive.write('%s, %f, %f\n' % (os.environ['bamboo_buildNumber'], overall_min_partitioned_accuracy, overall_min_distributed_accuracy)) - else: - print('The plan %s does not have archiving activated' % plan) - else: - print('%s is not in os.environ' % key) - else: - print('os.environ["LOGNAME"]=%s' % os.environ['LOGNAME']) - - print('Errors for: partitioned_and_distributed %s (%d)' % (compiler_name, len(errors))) - for error in errors: - print(error) - if should_log: - print('All values for: partitioned_and_distributed %s (%d)' % (compiler_name, len(all_values))) - for value in all_values: - print(value) - assert errors == [] - -def test_integration_io_buffers_clang4(cluster, dirname, exes, weekly): - skeleton_io_buffers(cluster, dirname, exes, 'clang4', weekly) - -def test_integration_io_buffers_gcc4(cluster, dirname, exes, weekly): - skeleton_io_buffers(cluster, dirname, exes, 'gcc4', weekly) - -def test_integration_io_buffers_gcc7(cluster, dirname, exes, weekly): - skeleton_io_buffers(cluster, dirname, exes, 'gcc7', weekly) - -def test_integration_io_buffers_intel18(cluster, dirname, exes, weekly): - skeleton_io_buffers(cluster, dirname, exes, 'intel18', weekly) - -# Run with python -m pytest -s test_integration_io_buffers.py -k 'test_integration_io_buffers_exe' --exe= -def test_integration_performance_io_buffers_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} - skeleton_io_buffers(cluster, dirname, exes, 'exe', True) diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py index da5e6472762..a171184ba5e 100644 --- a/bamboo/integration_tests/test_integration_performance.py +++ b/bamboo/integration_tests/test_integration_performance.py @@ -2,7 +2,9 @@ import operator, os import common_code -def error_if(f, f_symbol, data_field, actual_values, expected_values, model_name, errors, all_values, frequency_str): + +def error_if(f, f_symbol, data_field, actual_values, expected_values, + model_name, errors, all_values, frequency_str): d = actual_values[data_field] if f_symbol == '<': # Every time a value is smaller, update archive_value @@ -17,40 +19,60 @@ def error_if(f, f_symbol, data_field, actual_values, expected_values, model_name actual_value = d[model_id][epoch_id] expected_value = expected_values[model_name + frequency_str][data_field] - if actual_value == None: - errors.append('d[%s][%s] == None' % (model_id, epoch_id)) - if expected_value == None: - errors.append('d[%s]([%s] == None' % (model_id, epoch_id)) - - if f(actual_value, expected_value): - errors.append('%f %s %f %s Model %s Epoch %s %s' % (actual_value, f_symbol, expected_value, model_name, model_id, epoch_id, data_field)) - all_values.append('%f %s Model %s Epoch %s %s' % (actual_value, model_name, model_id, epoch_id, data_field)) - - if f(actual_value, archive_value): - archive_value = actual_value + if actual_value is None: + errors.append('actual_value: d[%s][%s] is None' % (model_id, epoch_id)) + else: + print('actual_value={av}'.format(av=actual_value)) + if expected_value is None: + errors.append( + 'expected_value: d[%s][%s] is None' % (model_id, epoch_id)) + else: + 
print('expected_value={ev}'.format(ev=expected_value)) + + if (actual_value is not None) and (expected_value is not None): + if f(actual_value, expected_value): + errors.append('%f %s %f %s Model %s Epoch %s %s' % ( + actual_value, f_symbol, expected_value, model_name, model_id, + epoch_id, data_field)) + all_values.append('%f %s Model %s Epoch %s %s' % ( + actual_value, model_name, model_id, epoch_id, data_field)) + + if f(actual_value, archive_value): + archive_value = actual_value + else: + print('archiving: either actual_value or expected_value is None.') return archive_value -def run_tests(actual_performance, model_name, dir_name, should_log, compiler_name, cluster, frequency_str=''): - expected_performance = common_code.csv_to_dict('%s/bamboo/integration_tests/expected_values/%s/%s/expected_performance.csv' % (dir_name, cluster, compiler_name)) + +def run_tests(actual_performance, model_name, dir_name, should_log, + compiler_name, cluster, frequency_str=''): + expected_performance = common_code.csv_to_dict( + '%s/bamboo/integration_tests/expected_values/%s/%s/expected_performance.csv' % (dir_name, cluster, compiler_name)) errors = [] all_values = [] greater_than = operator.gt less_than = operator.lt max_run_time = error_if(greater_than, '>', 'training_run_time', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_mean = error_if(greater_than, '>', 'training_mean', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_max = error_if(greater_than, '>', 'training_max', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_min = error_if(greater_than, '>', 'training_min', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_stdev = error_if(greater_than, '>', 'training_stdev', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) + max_mean = error_if(greater_than, '>', 'training_mean', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) + max_max = error_if(greater_than, '>', 'training_max', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) + max_min = error_if(greater_than, '>', 'training_min', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) + max_stdev = error_if(greater_than, '>', 'training_stdev', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) min_accuracy = error_if(less_than, '<', 'test_accuracy', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) + archival_string = '%s, %f, %f, %f, %f, %f, %f\n' % ( + os.environ['bamboo_buildNumber'], max_run_time, max_mean, max_max, max_min, + max_stdev, min_accuracy) + print('archival_string: ' + archival_string) if os.environ['LOGNAME'] == 'lbannusr': key = 'bamboo_planKey' if key in os.environ: plan = os.environ[key] if plan in ['LBANN-NIGHTD', 'LBANN-WD']: archive_file = '/usr/workspace/wsb/lbannusr/archives/%s/%s/%s/performance_%s.txt' % (plan, cluster, compiler_name, model_name) + print('Archive file: ' + archive_file) with open(archive_file, 'a') as archive: - archive.write('%s, %f, %f, %f, %f, %f, %f\n' % (os.environ['bamboo_buildNumber'], max_run_time, max_mean, max_max, max_min, max_stdev, min_accuracy)) + print('Archiving to file.') + archive.write(archival_string) else: print('The plan %s does not have archiving activated' % plan) else: @@ 
-62,7 +84,8 @@ def run_tests(actual_performance, model_name, dir_name, should_log, compiler_nam for error in errors: print(error) if should_log: - print('All values for: %s %s (%d)' % (model_name, compiler_name, len(all_values))) + print('All values for: %s %s (%d)' % ( + model_name, compiler_name, len(all_values))) for value in all_values: print(value) assert errors == [] @@ -76,133 +99,154 @@ def run_tests(actual_performance, model_name, dir_name, should_log, compiler_nam 'test_accuracy' ] -def skeleton_performance_lenet_mnist(cluster, dir_name, executables, compiler_name): + +def skeleton_performance_lenet_mnist(cluster, dir_name, executables, + compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_performance_lenet_mnist: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) executable = executables[compiler_name] model_name = 'lenet_mnist' model_folder = 'models/' + model_name - should_log = False - actual_performance = common_code.skeleton(cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, should_log, compiler_name=compiler_name) - run_tests(actual_performance, model_name, dir_name, should_log, compiler_name, cluster) + should_log = True + actual_performance = common_code.skeleton( + cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, + should_log, compiler_name=compiler_name) + run_tests(actual_performance, model_name, dir_name, should_log, + compiler_name, cluster) + -def skeleton_performance_alexnet(cluster, dir_name, executables, compiler_name, weekly): +def skeleton_performance_alexnet(cluster, dir_name, executables, compiler_name, + weekly): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_performance_alexnet: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) executable = executables[compiler_name] model_name = 'alexnet' model_folder = 'models/' + model_name - should_log = False - actual_performance = common_code.skeleton(cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, should_log, compiler_name=compiler_name, weekly=weekly) + should_log = True + actual_performance = common_code.skeleton( + cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, + should_log, compiler_name=compiler_name, weekly=weekly) frequency_str = '_nightly' if weekly: frequency_str = '_weekly' - run_tests(actual_performance, model_name, dir_name, should_log, compiler_name, cluster, frequency_str) + run_tests(actual_performance, model_name, dir_name, should_log, + compiler_name, cluster, frequency_str) + -def skeleton_performance_full_alexnet(cluster, dir_name, executables, compiler_name, weekly): +def skeleton_performance_full_alexnet(cluster, dir_name, executables, + compiler_name, weekly): if not weekly: - pytest.skip('Not doing weekly testing') + e = 'skeleton_performance_full_alexnet: Not doing weekly testing' + print('Skip - ' + e) + pytest.skip(e) if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_performance_full_alexnet: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) executable = executables[compiler_name] if not os.path.exists(executable): pytest.skip('Executable does not exist: %s' % executable) model_name = 'full_alexnet' - should_log = False + should_log = True output_file_name = 
'%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - if (cluster in ['catalyst', 'surface']): + if cluster in ['catalyst', 'surface']: command = 'salloc %s/bamboo/integration_tests/%s.sh > %s' % (dir_name, model_name, output_file_name) elif cluster == 'ray': - pytest.skip('Ray is unsupported for skeleton_performance_full_alexnet') + e = 'skeleton_performance_full_alexnet: Ray is unsupported for skeleton_performance_full_alexnet' + print('Skip - ' + e) + pytest.skip(e) else: raise Exception('Unsupported Cluster %s' % cluster) - common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) # Don't need return value - actual_performance = common_code.extract_data(output_file_name, DATA_FIELDS, should_log) - run_tests(actual_performance, model_name, dirname, should_log, compiler_name, cluster) + common_code.run_lbann(command, model_name, output_file_name, error_file_name, + should_log) # Don't need return value + actual_performance = common_code.extract_data(output_file_name, DATA_FIELDS, + should_log) + run_tests(actual_performance, model_name, dir_name, should_log, compiler_name, + cluster) + def test_integration_performance_lenet_mnist_clang4(cluster, dirname, exes): - if cluster in ['catalyst', 'quartz']: - pytest.skip('FIXME') - # Catalyst Errors: - # 0.104416 > 0.090000 lenet_mnist Model 0 Epoch 0 training_max - # 98.770000 < 98.960000 lenet_mnist Model 0 Epoch overall test_accuracy skeleton_performance_lenet_mnist(cluster, dirname, exes, 'clang4') - + + def test_integration_performance_alexnet_clang4(cluster, dirname, exes, weekly): skeleton_performance_alexnet(cluster, dirname, exes, 'clang4', weekly) -def test_integration_performance_full_alexnet_clang4(cluster, dirname, exes, weekly): + +def test_integration_performance_full_alexnet_clang4(cluster, dirname, exes, + weekly): skeleton_performance_full_alexnet(cluster, dirname, exes, 'clang4', weekly) - + + def test_integration_performance_lenet_mnist_gcc4(cluster, dirname, exes): - if cluster in ['catalyst', 'quartz', 'surface']: - pytest.skip('FIXME') - # Catalyst Errors: - # 15.634300 > 15.610000 lenet_mnist Model 0 Epoch 3 training_run_time - # 15.655200 > 15.610000 lenet_mnist Model 0 Epoch 4 training_run_time - # 98.770000 < 98.960000 lenet_mnist Model 0 Epoch overall test_accuracy - # Surface Errors: - # [surface145:mpi_rank_0][error_sighandler] Caught error: Segmentation fault (signal 11) - # srun: error: surface145: task 0: Segmentation fault (core dumped) skeleton_performance_lenet_mnist(cluster, dirname, exes, 'gcc4') + def test_integration_performance_alexnet_gcc4(cluster, dirname, exes, weekly): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # [surface59:mpi_rank_0][error_sighandler] Caught error: Segmentation fault (signal 11) - # srun: error: surface59: task 0: Segmentation fault (core dumped) skeleton_performance_alexnet(cluster, dirname, exes, 'gcc4', weekly) + def test_integration_performance_full_alexnet_gcc4(cluster, dirname, exes, weekly): skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc4', weekly) + def test_integration_performance_lenet_mnist_gcc7(cluster, dirname, exes): - if cluster in ['catalyst', 'quartz']: - pytest.skip('FIXME') - # Catalyst Errors: - # 15.522700 > 15.510000 lenet_mnist Model 0 Epoch 4 training_run_time - # 98.950000 < 99.000000 lenet_mnist Model 0 Epoch overall test_accuracy 
skeleton_performance_lenet_mnist(cluster, dirname, exes, 'gcc7') + def test_integration_performance_alexnet_gcc7(cluster, dirname, exes, weekly): - if cluster in ['catalyst', 'quartz']: - pytest.skip('FIXME') - # Catalyst Errors: - # 0.546884 > 0.510000 alexnet Model 0 Epoch 17 training_stdev skeleton_performance_alexnet(cluster, dirname, exes, 'gcc7', weekly) -def test_integration_performance_full_alexnet_gcc7(cluster, dirname, exes, weekly): + +def test_integration_performance_full_alexnet_gcc7(cluster, dirname, exes, + weekly): skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc7', weekly) + def test_integration_performance_lenet_mnist_intel18(cluster, dirname, exes): skeleton_performance_lenet_mnist(cluster, dirname, exes, 'intel18') -def test_integration_performance_alexnet_intel18(cluster, dirname, exes, weekly): + +def test_integration_performance_alexnet_intel18(cluster, dirname, exes, + weekly): skeleton_performance_alexnet(cluster, dirname, exes, 'intel18', weekly) -def test_integration_performance_full_alexnet_intel18(cluster, dirname, exes, weekly): + +def test_integration_performance_full_alexnet_intel18(cluster, dirname, exes, + weekly): skeleton_performance_full_alexnet(cluster, dirname, exes, 'intel18', weekly) # Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_lenet_mnist_exe' --exe= def test_integration_performance_lenet_mnist_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_integration_performance_lenet_mnist_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_performance_lenet_mnist(cluster, dirname, exes, 'exe') + # Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_alexnet_exe' --exe= def test_integration_performance_alexnet_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_integration_performance_alexnet_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_performance_alexnet(cluster, dirname, exes, 'exe', True) + # Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_full_alexnet_exe' --exe= def test_integration_performance_full_alexnet_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_integration_performance_full_alexnet_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_performance_full_alexnet(cluster, dirname, exes, 'exe', True) diff --git a/bamboo/unit_tests/conftest.py b/bamboo/unit_tests/conftest.py index 5e5cce7d2f7..eda975da95a 100644 --- a/bamboo/unit_tests/conftest.py +++ b/bamboo/unit_tests/conftest.py @@ -1,22 +1,14 @@ -import pytest, os, re, subprocess +import sys +sys.path.insert(0, '../common_python') +import tools +import pytest, re, subprocess def pytest_addoption(parser): - cluster = re.sub('[0-9]+', '', subprocess.check_output('hostname'.split()).strip()) - default_dirname = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() - default_exes = {} - default_exes['default'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if cluster in ['catalyst', 'quartz']: - default_exes['clang4'] = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) 
#'%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - #default_exes['gcc4'] = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_rel/build/model_zoo/lbann' % (default_dirname, cluster) - default_exes['gcc7'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) #'%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - default_exes['intel18'] = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) #'%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - - if cluster == 'ray': - default_exes['gcc4'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) #'%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_rel/build/model_zoo/lbann' % (default_dirname, cluster) - default_exes['clang4'] = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - - if cluster in ['surface', 'pascal']: - default_exes['gcc4'] = default_exes['default'] + cluster = re.sub('[0-9]+', '', subprocess.check_output( + 'hostname'.split()).strip()) + default_dirname = subprocess.check_output( + 'git rev-parse --show-toplevel'.split()).strip() + default_exes = tools.get_default_exes(default_dirname, cluster) parser.addoption('--cluster', action='store', default=cluster, help='--cluster= to specify the cluster being run on, for the purpose of determining which commands to use. Defaults to the current cluster') diff --git a/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext b/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext index c2c6477837b..77a1c7ed256 100644 --- a/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext +++ b/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext @@ -1,19 +1,17 @@ model { - name: "sequential_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 num_epochs: 3 num_parallel_readers: 0 - procs_per_model: 0 - num_gpus: -1 + procs_per_trainer: 0 ################################################### # Objective function ################################################### objective_function { - cross_entropy {} + layer_term { layer: "cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -24,7 +22,11 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "categorical accuracy" + layer: "accuracy" + unit: "%" + } } ################################################### @@ -52,13 +54,25 @@ model { layer { name: "data" + children: "image label" data_layout: "data_parallel" - input { - io_buffer: "partitioned" - } + input {} + } + layer { + parents: "data" + name: "image" + data_layout: "data_parallel" + split {} + } + layer { + parents: "data" + name: "label" + data_layout: "data_parallel" + split {} } layer { + parents: "image" name: "ip1" data_layout: "model_parallel" fully_connected { @@ -68,12 +82,14 @@ model { } layer { + parents: "ip1" name: "relu1" data_layout: "model_parallel" relu {} } layer { + parents: "relu1" name: "ip2" data_layout: "model_parallel" fully_connected { @@ -83,18 +99,24 @@ model { } layer { + parents: "ip2" name: "prob" - data_layout: "model_parallel" + data_layout: "data_parallel" softmax {} } layer { - name: "target" + parents: "prob label" + name: "cross_entropy" data_layout: "data_parallel" - target { - io_buffer: "partitioned" - shared_data_reader: true - } + cross_entropy {} + } + + layer { + parents: 
"prob label" + name: "accuracy" + data_layout: "data_parallel" + categorical_accuracy {} } } diff --git a/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext b/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext index d265acf7276..c89c171566f 100644 --- a/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext +++ b/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext @@ -1,19 +1,17 @@ model { - name: "sequential_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 num_epochs: 3 num_parallel_readers: 0 - procs_per_model: 0 - num_gpus: -1 + procs_per_trainer: 0 ################################################### # Objective function ################################################### objective_function { - cross_entropy {} + layer_term { layer: "cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -24,7 +22,11 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "categorical accuracy" + layer: "accuracy" + unit: "%" + } } ################################################### @@ -52,13 +54,25 @@ model { layer { name: "data" + children: "image label" data_layout: "data_parallel" - input { - io_buffer: "partitioned" - } + input {} + } + layer { + parents: "data" + name: "image" + data_layout: "data_parallel" + split {} + } + layer { + parents: "data" + name: "label" + data_layout: "data_parallel" + split {} } layer { + parents: "image" name: "ip1" data_layout: "model_parallel" fully_connected { @@ -68,12 +82,14 @@ model { } layer { + parents: "ip1" name: "relu1" data_layout: "model_parallel" relu {} } layer { + parents: "relu1" name: "ip3" data_layout: "model_parallel" fully_connected { @@ -83,11 +99,13 @@ model { } layer { + parents: "ip3" name: "relu3" data_layout: "model_parallel" relu {} } layer { + parents: "relu3" name: "ip2" data_layout: "model_parallel" fully_connected { @@ -97,18 +115,24 @@ model { } layer { + parents: "ip2" name: "prob" - data_layout: "model_parallel" + data_layout: "data_parallel" softmax {} } layer { - name: "target" + parents: "prob label" + name: "cross_entropy" data_layout: "data_parallel" - target { - io_buffer: "partitioned" - shared_data_reader: true - } + cross_entropy {} + } + + layer { + parents: "prob label" + name: "accuracy" + data_layout: "data_parallel" + categorical_accuracy {} } } diff --git a/bamboo/unit_tests/prototext/opt_sgd.prototext b/bamboo/unit_tests/prototext/opt_sgd.prototext index 3ab5afd6406..8d066780476 100644 --- a/bamboo/unit_tests/prototext/opt_sgd.prototext +++ b/bamboo/unit_tests/prototext/opt_sgd.prototext @@ -1,8 +1,7 @@ optimizer { sgd { learn_rate: 0.01 - momentum: 0.9 - decay_rate: 0 + momentum: 0.9 nesterov: false - } + } } diff --git a/bamboo/unit_tests/test_unit_check_proto_models.py b/bamboo/unit_tests/test_unit_check_proto_models.py index 539d69d0b44..353fca3143a 100644 --- a/bamboo/unit_tests/test_unit_check_proto_models.py +++ b/bamboo/unit_tests/test_unit_check_proto_models.py @@ -2,11 +2,14 @@ sys.path.insert(0, '../common_python') import tools import pytest -import os, re, subprocess, sys +import os + def skeleton_models(cluster, dir_name, executables, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_models: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) opt = 'sgd' node_count = 1 time_limit = 1 @@ -16,17 +19,14 @@ def skeleton_models(cluster, dir_name, executables, 
compiler_name): for file_name in files: if file_name.endswith('.prototext') and "model" in file_name: model_path = subdir + '/' + file_name - print('Attempting model setup for: ' + file_name ) + print('Attempting model setup for: ' + file_name) data_filedir_default = None data_filedir_train_default=None data_filename_train_default=None data_filedir_test_default=None data_filename_test_default=None data_reader_path=None - if 'motif' in file_name: - print('Skipping %s because motifs are deprecated' % model_path) - continue - elif 'mnist' in file_name: + if 'mnist' in file_name: data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST' data_reader_name = 'mnist' elif 'adversarial' in file_name: @@ -38,6 +38,9 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): data_reader_path = '%s/model_zoo/models/gan/mnist/discriminator_data.prototext' % (dir_name) data_reader_name = None elif 'triplet' in file_name: + # Disabling triplet test. + print('Skipping triplet tests.') + continue data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/' data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/train/train_list_8h.nfl.npz' data_filedir_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/' @@ -58,7 +61,7 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): data_filename_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt' data_reader_name = 'imagenet' node_count = 2 - if(cluster == 'ray'): + if cluster == 'ray': time_limit = 3 if 'resnet50' in file_name: node_count = 8 @@ -70,26 +73,35 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): data_filedir_default = '/p/lscratchh/brainusr/datasets/tinyshakespeare/' data_reader_name = 'ascii' else: - print("Shared lbannusr account doesn't have access to dataset this model requires") + print( + "No access to the dataset that model={m} requires.".format( + m=file_name)) continue - if (cluster == 'ray') and (data_reader_name in ['cifar10', 'ascii']): + if (cluster == 'ray') and \ + (data_reader_name in ['cifar10', 'ascii']): print('Skipping %s because data is not available on ray' % model_path) - elif (cluster == 'ray') or (cluster == 'pascal') and ('conv_autoencoder' in file_name) or ('gan' in subdir): + elif ((cluster == 'ray') or (cluster == 'pascal')) and \ + (('conv_autoencoder' in file_name) or ('gan' in subdir)): print('Skipping %s because unpooling/noise is not implemented on gpu' % model_path) else: output_file_name = '%s/bamboo/unit_tests/output/check_proto_models_%s_%s_output.txt' % (dir_name, file_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/check_proto_models_%s_%s_error.txt' % (dir_name, file_name, compiler_name) cmd = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=node_count, - partition='pbatch', time_limit=time_limit, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], + num_nodes=node_count, + partition='pbatch', time_limit=time_limit, + dir_name=dir_name, data_filedir_default=data_filedir_default, data_filedir_train_default=data_filedir_train_default, data_filename_train_default=data_filename_train_default, data_filedir_test_default=data_filedir_test_default, data_filename_test_default=data_filename_test_default, - data_reader_name=data_reader_name, data_reader_path=data_reader_path, - exit_after_setup=True, 
model_path=model_path, optimizer_name=opt, - output_file_name=output_file_name, error_file_name=error_file_name) + data_reader_name=data_reader_name, + data_reader_path=data_reader_path, + exit_after_setup=True, model_path=model_path, + optimizer_name=opt, + output_file_name=output_file_name, + error_file_name=error_file_name) if os.system(cmd) != 0: print("Error detected in " + model_path) #defective_models.append(file_name) @@ -98,31 +110,35 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): working_models.append(cmd) num_defective = len(defective_models) if num_defective != 0: - print('Working models: %d. Defective models: %d', len(working_models), num_defective) + print('Working models: %d. Defective models: %d' % ( + len(working_models), num_defective)) print('Errors for: The following models exited with errors %s' % compiler_name) for model in defective_models: print(model) assert num_defective == 0 + def test_unit_models_clang4(cluster, dirname, exes): skeleton_models(cluster, dirname, exes, 'clang4') + def test_unit_models_gcc4(cluster, dirname, exes): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 8 == 0 skeleton_models(cluster, dirname, exes, 'gcc4') + def test_unit_models_gcc7(cluster, dirname, exes): skeleton_models(cluster, dirname, exes, 'gcc7') + def test_unit_models_intel18(cluster, dirname, exes): skeleton_models(cluster, dirname, exes, 'intel18') + # Run with python -m pytest -s test_unit_check_proto_models.py -k 'test_unit_models_exe' --exe= def test_unit_models_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') + if exe is None: + e = 'test_unit_models_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) exes = {'exe' : exe} skeleton_models(cluster, dirname, exes, 'exe') diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index 2b0912c5200..25ea6614e3b 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -4,10 +4,16 @@ import pytest import os -def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, compiler_name): + +def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, + compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_checkpoint_lenet_shared: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) exe = executables[compiler_name] + + # No checkpointing, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( @@ -23,6 +29,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, compiler_na sys.exit(1) os.system('mv ckpt ckpt_baseline') + # Run to checkpoint, printing weights to files. 
output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_checkpoint_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( @@ -37,6 +44,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, compiler_na sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error') sys.exit(1) + # Pick up from checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_restart_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_restart_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( @@ -55,10 +63,16 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, compiler_na os.system('rm -rf ckpt*') assert diff_test == 0 -def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, compiler_name): + +def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, + compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_checkpoint_lenet_distributed: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) exe = executables[compiler_name] + + # No checkpointing, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( @@ -74,6 +88,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, compil sys.exit(1) os.system('mv ckpt ckpt_baseline') + # Run to checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_checkpoint_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( @@ -88,6 +103,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, compil sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error') sys.exit(1) + # Pick up from checkpoint, printing weights to files. 
output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_restart_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_restart_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( @@ -106,26 +122,33 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, compil os.system('rm -rf ckpt*') assert diff_test == 0 + def test_unit_checkpoint_lenet_clang4(cluster, exes, dirname): skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'clang4') skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'clang4') + def test_unit_checkpoint_lenet_gcc4(cluster, exes, dirname): skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc4') skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc4') + def test_unit_checkpoint_lenet_gcc7(cluster, exes, dirname): skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc7') skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc7') + def test_unit_checkpoint_lenet_intel18(cluster, exes, dirname): skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'intel18') skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_exe' --exe= def test_unit_checkpoint_lenet_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_checkpoint_lenet_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'exe') skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_clamp.py b/bamboo/unit_tests/test_unit_layer_clamp.py index 6ac7278ab30..8cd7d579374 100644 --- a/bamboo/unit_tests/test_unit_layer_clamp.py +++ b/bamboo/unit_tests/test_unit_layer_clamp.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_clamp(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_clamp: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_clamp_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_clamp_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='clamp', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='clamp', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_clamp_clang4(cluster, exes, dirname): skeleton_layer_clamp(cluster, exes, dirname, 'clang4') + def test_unit_layer_clamp_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_clamp(cluster, exes, dirname, 'gcc4') + def test_unit_layer_clamp_gcc7(cluster, exes, dirname): skeleton_layer_clamp(cluster, exes, dirname, 'gcc7') + def 
test_unit_layer_clamp_intel18(cluster, exes, dirname): skeleton_layer_clamp(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_clamp.py -k 'test_unit_layer_clamp_exe' --exe= def test_unit_layer_clamp_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_clamp_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_clamp(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py index 41bdb9d985f..e72bca4fb51 100644 --- a/bamboo/unit_tests/test_unit_layer_covariance.py +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_covariance: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_covariance_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_covariance_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='covariance', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='covariance', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_covariance_clang4(cluster, exes, dirname): skeleton_layer_covariance(cluster, exes, dirname, 'clang4') + def test_unit_layer_covariance_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_covariance(cluster, exes, dirname, 'gcc4') + def test_unit_layer_covariance_gcc7(cluster, exes, dirname): skeleton_layer_covariance(cluster, exes, dirname, 'gcc7') + def test_unit_layer_covariance_intel18(cluster, exes, dirname): skeleton_layer_covariance(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_covariance.py -k 'test_unit_layer_covariance_exe' --exe= def test_unit_layer_covariance_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_covariance_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_covariance(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_elu.py b/bamboo/unit_tests/test_unit_layer_elu.py index a121bfcb50f..66b10d1fc5b 100644 --- a/bamboo/unit_tests/test_unit_layer_elu.py +++ b/bamboo/unit_tests/test_unit_layer_elu.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_elu(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_elu: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = 
'%s/bamboo/unit_tests/output/layer_elu_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_elu_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='elu', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='elu', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_elu_clang4(cluster, exes, dirname): skeleton_layer_elu(cluster, exes, dirname, 'clang4') + def test_unit_layer_elu_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_elu(cluster, exes, dirname, 'gcc4') + def test_unit_layer_elu_gcc7(cluster, exes, dirname): skeleton_layer_elu(cluster, exes, dirname, 'gcc7') + def test_unit_layer_elu_intel18(cluster, exes, dirname): skeleton_layer_elu(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_elu.py -k 'test_unit_layer_elu_exe' --exe= def test_unit_layer_elu_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_elu_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_elu(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_identity.py b/bamboo/unit_tests/test_unit_layer_identity.py index b26f4248d69..86568e946d5 100644 --- a/bamboo/unit_tests/test_unit_layer_identity.py +++ b/bamboo/unit_tests/test_unit_layer_identity.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_identity(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_identity: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_identity_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_identity_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='identity', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='identity', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_identity_clang4(cluster, exes, dirname): skeleton_layer_identity(cluster, exes, dirname, 'clang4') + def test_unit_layer_identity_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_identity(cluster, exes, dirname, 'gcc4') + def test_unit_layer_identity_gcc7(cluster, exes, dirname): skeleton_layer_identity(cluster, exes, dirname, 'gcc7') + def 
test_unit_layer_identity_intel18(cluster, exes, dirname): skeleton_layer_identity(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_identity.py -k 'test_unit_layer_identity_exe' --exe= def test_unit_layer_identity_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_identity_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_identity(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_l1_norm.py b/bamboo/unit_tests/test_unit_layer_l1_norm.py index 1c1ab406106..9abcc2652ce 100644 --- a/bamboo/unit_tests/test_unit_layer_l1_norm.py +++ b/bamboo/unit_tests/test_unit_layer_l1_norm.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_l1_norm(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_l1_norm: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_l1_norm_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_l1_norm_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='l1_norm', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='l1_norm', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_l1_norm_clang4(cluster, exes, dirname): skeleton_layer_l1_norm(cluster, exes, dirname, 'clang4') + def test_unit_layer_l1_norm_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_l1_norm(cluster, exes, dirname, 'gcc4') + def test_unit_layer_l1_norm_gcc7(cluster, exes, dirname): skeleton_layer_l1_norm(cluster, exes, dirname, 'gcc7') + def test_unit_layer_l1_norm_intel18(cluster, exes, dirname): skeleton_layer_l1_norm(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_l1_norm.py -k 'test_unit_layer_l1_norm_exe' --exe= def test_unit_layer_l1_norm_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_l1_norm_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_l1_norm(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_l2_norm2.py b/bamboo/unit_tests/test_unit_layer_l2_norm2.py index 29233e9ce18..cdbad231498 100644 --- a/bamboo/unit_tests/test_unit_layer_l2_norm2.py +++ b/bamboo/unit_tests/test_unit_layer_l2_norm2.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_l2_norm2: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = 
'%s/bamboo/unit_tests/output/layer_l2_norm2_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_l2_norm2_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='l2_norm2', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='l2_norm2', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_l2_norm2_clang4(cluster, exes, dirname): skeleton_layer_l2_norm2(cluster, exes, dirname, 'clang4') + def test_unit_layer_l2_norm2_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc4') + def test_unit_layer_l2_norm2_gcc7(cluster, exes, dirname): skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc7') + def test_unit_layer_l2_norm2_intel18(cluster, exes, dirname): skeleton_layer_l2_norm2(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_l2_norm2.py -k 'test_unit_layer_l2_norm2_exe' --exe= def test_unit_layer_l2_norm2_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_l2_norm2_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_l2_norm2(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu.py b/bamboo/unit_tests/test_unit_layer_leaky_relu.py index d934987e76a..6c90b34ce78 100644 --- a/bamboo/unit_tests/test_unit_layer_leaky_relu.py +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_leaky_relu(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_leaky_relu: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_leaky_relu_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_leaky_relu_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='leaky_relu', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='leaky_relu', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_leaky_relu_clang4(cluster, exes, dirname): skeleton_layer_leaky_relu(cluster, exes, dirname, 'clang4') + def test_unit_layer_leaky_relu_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_leaky_relu(cluster, exes, dirname, 'gcc4') + def 
test_unit_layer_leaky_relu_gcc7(cluster, exes, dirname): skeleton_layer_leaky_relu(cluster, exes, dirname, 'gcc7') + def test_unit_layer_leaky_relu_intel18(cluster, exes, dirname): skeleton_layer_leaky_relu(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_leaky_relu.py -k 'test_unit_layer_leaky_relu_exe' --exe= def test_unit_layer_leaky_relu_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_leaky_relu_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_leaky_relu(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py index bda8dab5b98..9a47d55754d 100644 --- a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_log_sigmoid: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_log_sigmoid_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_log_sigmoid_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='log_sigmoid', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='log_sigmoid', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_log_sigmoid_clang4(cluster, exes, dirname): skeleton_layer_log_sigmoid(cluster, exes, dirname, 'clang4') + def test_unit_layer_log_sigmoid_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_log_sigmoid(cluster, exes, dirname, 'gcc4') + def test_unit_layer_log_sigmoid_gcc7(cluster, exes, dirname): skeleton_layer_log_sigmoid(cluster, exes, dirname, 'gcc7') + def test_unit_layer_log_sigmoid_intel18(cluster, exes, dirname): skeleton_layer_log_sigmoid(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_log_sigmoid.py -k 'test_unit_layer_log_sigmoid_exe' --exe= def test_unit_layer_log_sigmoid_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_log_sigmoid_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_log_sigmoid(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index 749cd34dc22..85a20790d31 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name): if compiler_name not in 
executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_log_softmax: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_log_softmax_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_log_softmax_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], + num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='log_softmax', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='log_softmax', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_log_softmax_clang4(cluster, exes, dirname): skeleton_layer_log_softmax(cluster, exes, dirname, 'clang4') + def test_unit_layer_log_softmax_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc4') + def test_unit_layer_log_softmax_gcc7(cluster, exes, dirname): skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc7') + def test_unit_layer_log_softmax_intel18(cluster, exes, dirname): skeleton_layer_log_softmax(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_log_softmax.py -k 'test_unit_layer_log_softmax_exe' --exe= def test_unit_layer_log_softmax_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_log_softmax_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_log_softmax(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py index 62768e6afe8..c21544ed295 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_mean_absolute_error(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_mean_absolute_error: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_mean_absolute_error_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_mean_absolute_error_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='mean_absolute_error', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='mean_absolute_error', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 
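Every test touched by this diff now follows the same print-then-skip idiom: the reason string is printed before pytest.skip is called, since pytest only reports skip reasons in its summary when invoked with -rs, and printing keeps the reason visible in raw Bamboo logs. If further deduplication is wanted later, the idiom could be hoisted into the shared tools module; a minimal sketch, where the helper name print_skip is hypothetical and not part of this diff:

import pytest

def print_skip(reason):
    # Echo the skip reason to stdout so it appears in Bamboo logs
    # even without pytest's -rs flag, then skip the current test.
    print('Skip - ' + reason)
    pytest.skip(reason)

A call site such as test_unit_layer_log_softmax_exe would then reduce to: if exe is None: print_skip('test_unit_layer_log_softmax_exe: Non-local testing').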
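Likewise, the skeleton_layer_* functions in these unit-test modules (including the relu, selu, and sigmoid tests that follow) differ only in the model name used for the output/error paths and for tools.get_command. A single parameterized driver could serve all of them; a minimal sketch under that assumption, where skeleton_layer_test is a hypothetical name and tools.get_command is assumed to keep the keyword signature used throughout these tests:

import os
import sys
sys.path.insert(0, '../common_python')
import tools
import pytest

def skeleton_layer_test(cluster, executables, dir_name, compiler_name,
                        layer_name):
    # Generic form of the per-layer skeletons: only layer_name varies.
    if compiler_name not in executables:
        e = 'skeleton_layer_%s: default_exes[%s] does not exist' % (
            layer_name, compiler_name)
        print('Skip - ' + e)
        pytest.skip(e)
    output_file_name = '%s/bamboo/unit_tests/output/layer_%s_%s_output.txt' % (
        dir_name, layer_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/layer_%s_%s_error.txt' % (
        dir_name, layer_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        num_processes=2, dir_name=dir_name,
        data_filedir_default='', data_reader_name='synthetic',
        model_folder='tests/layer_tests', model_name=layer_name,
        optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code = os.system(command)
    assert return_code == 0

With such a driver, test_unit_layer_sigmoid_gcc7 would collapse to skeleton_layer_test(cluster, exes, dirname, 'gcc7', 'sigmoid').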
+ def test_unit_layer_mean_absolute_error_clang4(cluster, exes, dirname): skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'clang4') + def test_unit_layer_mean_absolute_error_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'gcc4') + def test_unit_layer_mean_absolute_error_gcc7(cluster, exes, dirname): skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'gcc7') + def test_unit_layer_mean_absolute_error_intel18(cluster, exes, dirname): skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_mean_absolute_error_exe' --exe= def test_unit_layer_mean_absolute_error_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_mean_absolute_error_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_relu.py b/bamboo/unit_tests/test_unit_layer_relu.py index 0b66c9fabb2..c904cce301f 100644 --- a/bamboo/unit_tests/test_unit_layer_relu.py +++ b/bamboo/unit_tests/test_unit_layer_relu.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_relu(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_relu: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_relu_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_relu_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], + num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='relu', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='relu', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_relu_clang4(cluster, exes, dirname): skeleton_layer_relu(cluster, exes, dirname, 'clang4') + def test_unit_layer_relu_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_relu(cluster, exes, dirname, 'gcc4') + def test_unit_layer_relu_gcc7(cluster, exes, dirname): skeleton_layer_relu(cluster, exes, dirname, 'gcc7') + def test_unit_layer_relu_intel18(cluster, exes, dirname): skeleton_layer_relu(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_relu.py -k 'test_unit_layer_relu_exe' --exe= def test_unit_layer_relu_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_relu_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_relu(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_selu.py b/bamboo/unit_tests/test_unit_layer_selu.py index 
5fb4cef8d1e..b32f8c9eb71 100644 --- a/bamboo/unit_tests/test_unit_layer_selu.py +++ b/bamboo/unit_tests/test_unit_layer_selu.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_selu(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_selu: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_selu_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_selu_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], + num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='selu', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='selu', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_selu_clang4(cluster, exes, dirname): skeleton_layer_selu(cluster, exes, dirname, 'clang4') + def test_unit_layer_selu_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_selu(cluster, exes, dirname, 'gcc4') + def test_unit_layer_selu_gcc7(cluster, exes, dirname): skeleton_layer_selu(cluster, exes, dirname, 'gcc7') + def test_unit_layer_selu_intel18(cluster, exes, dirname): skeleton_layer_selu(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_selu.py -k 'test_unit_layer_selu_exe' --exe= def test_unit_layer_selu_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_selu_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_selu(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid.py b/bamboo/unit_tests/test_unit_layer_sigmoid.py index 2c0cc2d3d4e..268526b7644 100644 --- a/bamboo/unit_tests/test_unit_layer_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_sigmoid.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_sigmoid(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_sigmoid: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_sigmoid_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_sigmoid_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], + num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='sigmoid', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='sigmoid', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) 
assert return_code == 0 + def test_unit_layer_sigmoid_clang4(cluster, exes, dirname): skeleton_layer_sigmoid(cluster, exes, dirname, 'clang4') + def test_unit_layer_sigmoid_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_sigmoid(cluster, exes, dirname, 'gcc4') + def test_unit_layer_sigmoid_gcc7(cluster, exes, dirname): skeleton_layer_sigmoid(cluster, exes, dirname, 'gcc7') + def test_unit_layer_sigmoid_intel18(cluster, exes, dirname): skeleton_layer_sigmoid(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_sigmoid.py -k 'test_unit_layer_sigmoid_exe' --exe= def test_unit_layer_sigmoid_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_sigmoid_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_sigmoid(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py index dd1742a551c..dd4c3add193 100644 --- a/bamboo/unit_tests/test_unit_layer_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_softmax(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_softmax: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_softmax_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_softmax_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], + num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='softmax', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='softmax', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_softmax_clang4(cluster, exes, dirname): skeleton_layer_softmax(cluster, exes, dirname, 'clang4') + def test_unit_layer_softmax_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_softmax(cluster, exes, dirname, 'gcc4') + def test_unit_layer_softmax_gcc7(cluster, exes, dirname): skeleton_layer_softmax(cluster, exes, dirname, 'gcc7') + def test_unit_layer_softmax_intel18(cluster, exes, dirname): skeleton_layer_softmax(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_softmax_exe' --exe= def test_unit_layer_softmax_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_softmax_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_softmax(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_softplus.py b/bamboo/unit_tests/test_unit_layer_softplus.py index bc7d5605988..0c017c6f93e 100644 --- 
a/bamboo/unit_tests/test_unit_layer_softplus.py +++ b/bamboo/unit_tests/test_unit_layer_softplus.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_softplus(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_softplus: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_softplus_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_softplus_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='softplus', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='softplus', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_softplus_clang4(cluster, exes, dirname): skeleton_layer_softplus(cluster, exes, dirname, 'clang4') + def test_unit_layer_softplus_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_softplus(cluster, exes, dirname, 'gcc4') + def test_unit_layer_softplus_gcc7(cluster, exes, dirname): skeleton_layer_softplus(cluster, exes, dirname, 'gcc7') + def test_unit_layer_softplus_intel18(cluster, exes, dirname): skeleton_layer_softplus(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_softplus.py -k 'test_unit_layer_softplus_exe' --exe= def test_unit_layer_softplus_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_softplus_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_softplus(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_softsign.py b/bamboo/unit_tests/test_unit_layer_softsign.py index 667efb172c3..a7bed251425 100644 --- a/bamboo/unit_tests/test_unit_layer_softsign.py +++ b/bamboo/unit_tests/test_unit_layer_softsign.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_softsign(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_softsign: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_softsign_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_softsign_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='softsign', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='softsign', + optimizer_name='sgd', output_file_name=output_file_name, 
error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_softsign_clang4(cluster, exes, dirname): skeleton_layer_softsign(cluster, exes, dirname, 'clang4') + def test_unit_layer_softsign_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_softsign(cluster, exes, dirname, 'gcc4') + def test_unit_layer_softsign_gcc7(cluster, exes, dirname): skeleton_layer_softsign(cluster, exes, dirname, 'gcc7') + def test_unit_layer_softsign_intel18(cluster, exes, dirname): skeleton_layer_softsign(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_softsign.py -k 'test_unit_layer_softsign_exe' --exe= def test_unit_layer_softsign_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_softsign_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_softsign(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_squared_difference.py b/bamboo/unit_tests/test_unit_layer_squared_difference.py index 201267757d7..a05bbcc5082 100644 --- a/bamboo/unit_tests/test_unit_layer_squared_difference.py +++ b/bamboo/unit_tests/test_unit_layer_squared_difference.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_squared_difference(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_squared_difference: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_squared_difference_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_squared_difference_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='squared_difference', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='squared_difference', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_squared_difference_clang4(cluster, exes, dirname): skeleton_layer_squared_difference(cluster, exes, dirname, 'clang4') + def test_unit_layer_squared_difference_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_squared_difference(cluster, exes, dirname, 'gcc4') + def test_unit_layer_squared_difference_gcc7(cluster, exes, dirname): skeleton_layer_squared_difference(cluster, exes, dirname, 'gcc7') + def test_unit_layer_squared_difference_intel18(cluster, exes, dirname): skeleton_layer_squared_difference(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_squared_difference.py -k 'test_unit_layer_squared_difference_exe' --exe= def test_unit_layer_squared_difference_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 
'test_unit_layer_squared_difference_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_squared_difference(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_tessellate.py b/bamboo/unit_tests/test_unit_layer_tessellate.py index 25e30770c63..575bd894f89 100644 --- a/bamboo/unit_tests/test_unit_layer_tessellate.py +++ b/bamboo/unit_tests/test_unit_layer_tessellate.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_tessellate(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_tessellate: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_tessellate_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_tessellate_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='tessellate', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='tessellate', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_tessellate_clang4(cluster, exes, dirname): skeleton_layer_tessellate(cluster, exes, dirname, 'clang4') + def test_unit_layer_tessellate_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_tessellate(cluster, exes, dirname, 'gcc4') + def test_unit_layer_tessellate_gcc7(cluster, exes, dirname): skeleton_layer_tessellate(cluster, exes, dirname, 'gcc7') + def test_unit_layer_tessellate_intel18(cluster, exes, dirname): skeleton_layer_tessellate(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_layer_tessellate.py -k 'test_unit_layer_tessellate_exe' --exe= def test_unit_layer_tessellate_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_tessellate_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_tessellate(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py index 4b476aedf5b..0db001567d5 100644 --- a/bamboo/unit_tests/test_unit_layer_variance.py +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -4,38 +4,46 @@ import pytest import os + def skeleton_layer_variance(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_layer_variance: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/layer_variance_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/layer_variance_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, 
dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], + num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='variance', optimizer_name='sgd', + model_folder='tests/layer_tests', model_name='variance', + optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_layer_variance_clang4(cluster, exes, dirname): skeleton_layer_variance(cluster, exes, dirname, 'clang4') + def test_unit_layer_variance_gcc4_check(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_layer_variance(cluster, exes, dirname, 'gcc4') + def test_unit_layer_variance_gcc7(cluster, exes, dirname): skeleton_layer_variance(cluster, exes, dirname, 'gcc7') + def test_unit_layer_variance_intel18(cluster, exes, dirname): skeleton_layer_variance(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_variance_exe' --exe= def test_unit_layer_variance_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_layer_variance_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_layer_variance(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index 8bad2453fae..4b8491e248f 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -4,11 +4,21 @@ import pytest import os, sys + def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_lbann2_reload: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) + lbann2 = executables[compiler_name] + '2' + + # Delete directories / files if they happen to be around from the + # previous build. + os.system('rm -rf ckpt') + os.system('rm -rf lbann2_*') + - lbann2 = executables[compiler_name] + '2' + # No checkpointing, printing weights to files. model_path = '{../../model_zoo/models/lenet_mnist/model_lenet_mnist.prototext,../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext}' output_file_name = '%s/bamboo/unit_tests/output/lbann2_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/lbann2_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) @@ -22,6 +32,7 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): num_epochs=2, output_file_name=output_file_name, error_file_name=error_file_name) + os.mkdir('lbann2_ckpt') return_code = os.system(command) if return_code != 0: @@ -30,6 +41,7 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): os.system('mv lbann2_ckpt lbann2_nockpt') + # Run to checkpoint, printing weights to files. 
output_file_name = '%s/bamboo/unit_tests/output/lbann2_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/lbann2_checkpoint_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( @@ -42,9 +54,11 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): error_file_name=error_file_name) return_code_ckpt_1 = os.system(command) if return_code_ckpt_1 != 0: - sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error') + sys.stderr.write( + 'LeNet (checkpoint) execution failed, exiting with error') sys.exit(1) + # Pick up from checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/lbann2_restart_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/lbann2_restart_%s_error.txt' % (dir_name, compiler_name) os.mkdir('lbann2_ckpt') @@ -59,30 +73,76 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): error_file_name=error_file_name) return_code_ckpt_2 = os.system(command) if return_code_ckpt_2 != 0: - sys.stderr.write('LBANN2 LeNet weight reload failed, exiting with error') + sys.stderr.write( + 'LBANN2 LeNet weight reload failed, exiting with error') sys.exit(1) os.system('rm lbann2_ckpt/model0-epoch*') os.system('rm lbann2_nockpt/model0-epoch*') - diff_test = os.system('diff -rq lbann2_ckpt/ lbann2_nockpt/') + + diff_result = os.system('diff -rq lbann2_ckpt/ lbann2_nockpt/') + allow_epsilon_diff = False + if allow_epsilon_diff and (diff_result != 0): + equal_within_epsilon = True + ckpt_files = os.listdir('lbann2_ckpt') + for file_name in ckpt_files: + ckpt_file = open('lbann2_ckpt/' + file_name, 'r') + no_ckpt_file = open('lbann2_nockpt/' + file_name, 'r') + for ckpt_line in ckpt_file: + no_ckpt_line = next(no_ckpt_file) + if ckpt_line != no_ckpt_line: + error_string = ('ckpt_line={ckpt_line},' + ' nockpt_line={no_ckpt_line}').format( + ckpt_line=ckpt_line, no_ckpt_line=no_ckpt_line) + try: + ckpt_values = list(map(float, ckpt_line.split())) + no_ckpt_values = list(map(float, no_ckpt_line.split())) + num = len(ckpt_values) + if len(no_ckpt_values) == num: + for i in range(num): + if abs(ckpt_values[i] - no_ckpt_values[i]) > 0.5: + # Not equal within epsilon. + equal_within_epsilon = False + print(error_string) + else: + # Length of lists don't match. + equal_within_epsilon = False + print(error_string) + except ValueError: + # Non-numerical diff. 
+ equal_within_epsilon = False + print(error_string) + if equal_within_epsilon: + diff_result = 0 os.system('rm -rf ckpt') os.system('rm -rf lbann2_*') - assert diff_test == 0 + assert diff_result == 0 + def test_unit_lbann2_reload_clang4(cluster, exes, dirname): + if cluster == 'catalyst': # STILL ERRORS + pytest.skip('FIXME') skeleton_lbann2_reload(cluster, exes, dirname, 'clang4') + def test_unit_lbann2_reload_gcc4(cluster, exes, dirname): skeleton_lbann2_reload(cluster, exes, dirname, 'gcc4') + def test_unit_lbann2_reload_gcc7(cluster, exes, dirname): + if cluster in ['catalyst', 'pascal']: # STILL ERRORS + pytest.skip('FIXME') skeleton_lbann2_reload(cluster, exes, dirname, 'gcc7') + def test_unit_lbann2_reload_intel18(cluster, exes, dirname): skeleton_lbann2_reload(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_lbann2_reload.py -k 'test_unit_lbann2_reload_exe' --exe= def test_unit_lbann2_reload_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_lbann2_reload_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_lbann2_reload(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_lbann_invocation.py b/bamboo/unit_tests/test_unit_lbann_invocation.py index efaf7db4686..a002db49be4 100644 --- a/bamboo/unit_tests/test_unit_lbann_invocation.py +++ b/bamboo/unit_tests/test_unit_lbann_invocation.py @@ -1,10 +1,9 @@ import sys sys.path.insert(0, '../common_python') import tools -import pytest import os, sys -def test_unit_no_params_bad(cluster, exes, dirname): +def test_unit_no_params_bad(cluster, exes): exe = exes['gcc4'] sys.stderr.write('TESTING: run lbann with no params; lbann should throw exception\n') command = tools.get_command( @@ -12,7 +11,8 @@ def test_unit_no_params_bad(cluster, exes, dirname): return_code = os.system(command) assert return_code != 0 -def test_unit_one_model_bad(cluster, exes, dirname): + +def test_unit_one_model_bad(cluster, exes): exe = exes['gcc4'] sys.stderr.write('TESTING: run lbann with no optimizer or reader; lbann should throw exception\n') model_path = 'prototext/model_mnist_simple_1.prototext' @@ -22,7 +22,8 @@ def test_unit_one_model_bad(cluster, exes, dirname): return_code = os.system(command) assert return_code != 0 -def test_unit_two_models_bad(cluster, exes, dirname): + +def test_unit_two_models_bad(cluster, exes): exe = exes['gcc4'] sys.stderr.write('TESTING: run lbann with two models but no optimizer or reader; lbann should throw exception\n') model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' @@ -32,7 +33,8 @@ def test_unit_two_models_bad(cluster, exes, dirname): return_code = os.system(command) assert return_code != 0 -def test_unit_two_models_bad2(cluster, exes, dirname): + +def test_unit_two_models_bad2(cluster, exes): exe = exes['gcc4'] sys.stderr.write('TESTING: run lbann with two models with missing {; lbann should throw exception\n') model_path='prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' @@ -42,7 +44,8 @@ def test_unit_two_models_bad2(cluster, exes, dirname): return_code = os.system(command) assert return_code != 0 -def test_unit_missing_optimizer(cluster, exes, dirname): + +def test_unit_missing_optimizer(cluster, exes): exe = exes['gcc4'] sys.stderr.write('TESTING: run lbann with two models, reader, but no optimizer; lbann should throw exception\n') 
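For reference, the tolerance fallback added to `skeleton_lbann2_reload` above (the `allow_epsilon_diff` branch) amounts to the following standalone check — a sketch restating the patch's logic, where the directory names and the 0.5 tolerance come from the patch but the function itself is hypothetical:

```python
import os

def dirs_equal_within_epsilon(ckpt_dir='lbann2_ckpt',
                              nockpt_dir='lbann2_nockpt',
                              epsilon=0.5):
    # Compare the dumped weight files line by line; numeric values may
    # differ by at most epsilon. A non-numeric mismatch or a mismatch
    # in the number of values counts as a real difference.
    for file_name in os.listdir(ckpt_dir):
        with open(os.path.join(ckpt_dir, file_name)) as ckpt_file, \
             open(os.path.join(nockpt_dir, file_name)) as nockpt_file:
            for ckpt_line, nockpt_line in zip(ckpt_file, nockpt_file):
                if ckpt_line == nockpt_line:
                    continue
                try:
                    a = list(map(float, ckpt_line.split()))
                    b = list(map(float, nockpt_line.split()))
                except ValueError:
                    return False  # non-numeric difference
                if len(a) != len(b):
                    return False
                if any(abs(x - y) > epsilon for x, y in zip(a, b)):
                    return False
    return True
```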
model_path='{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' @@ -54,7 +57,8 @@ def test_unit_missing_optimizer(cluster, exes, dirname): return_code = os.system(command) assert return_code != 0 -def test_unit_missing_reader(cluster, exes, dirname): + +def test_unit_missing_reader(cluster, exes): exe = exes['gcc4'] sys.stderr.write('TESTING: run lbann with two models, reader, but no reader; lbann should throw exception\n') model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' @@ -65,14 +69,16 @@ def test_unit_missing_reader(cluster, exes, dirname): return_code = os.system(command) assert return_code != 0 -def test_unit_bad_params(cluster, exes, dirname): + +def test_unit_bad_params(cluster, exes): exe = exes['gcc4'] sys.stderr.write('TESTING: run lbann with ill-formed param (missing -) lbann should throw exception\n') (command_allocate, command_run, _, _) = tools.get_command(cluster=cluster, executable=exe, return_tuple=True) return_code = os.system('%s%s %s -exit_after_setup --reader=prototext/data_reader_mnist.prototext --model={prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext} --optimizer=prototext/opt_sgd.prototext' % (command_allocate, command_run, exe)) assert return_code != 0 -def test_unit_should_work(cluster, exes, dirname): + +def test_unit_should_work(cluster, exes): exe = exes['gcc4'] sys.stderr.write('TESTING: run lbann with two models, reader, and optimizer; lbann should NOT throw exception\n') model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' diff --git a/bamboo/unit_tests/test_unit_mnist_conv_graph.py b/bamboo/unit_tests/test_unit_mnist_conv_graph.py index 3437f461273..65a7bd54ad0 100644 --- a/bamboo/unit_tests/test_unit_mnist_conv_graph.py +++ b/bamboo/unit_tests/test_unit_mnist_conv_graph.py @@ -4,40 +4,53 @@ import pytest import os + def skeleton_mnist_conv_graph(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_mnist_conv_graph: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/mnist_conv_graph_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/mnist_conv_graph_%s_error.txt' % (dir_name, compiler_name) + if compiler_name == 'gcc7': + tl = 240 + else: + tl = None command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=1, - dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', model_name='mnist_conv_graph', + cluster=cluster, executable=executables[compiler_name], + num_nodes=1, time_limit=tl, num_processes=1, + dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='mnist', model_folder='tests', + model_name='mnist_conv_graph', optimizer_name='adam', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_mnist_conv_graph_clang4(cluster, exes, dirname): skeleton_mnist_conv_graph(cluster, exes, dirname, 'clang4') + def test_unit_mnist_conv_graph_gcc4(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 35584 == 0 skeleton_mnist_conv_graph(cluster, exes, dirname, 
'gcc4') + def test_unit_mnist_conv_graph_gcc7(cluster, exes, dirname): skeleton_mnist_conv_graph(cluster, exes, dirname, 'gcc7') + def test_unit_mnist_conv_graph_intel18(cluster, exes, dirname): skeleton_mnist_conv_graph(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_conv_graph.py -k 'test_unit_mnist_conv_graph_exe' --exe= def test_unit_mnist_conv_graph_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_mnist_conv_graph_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_mnist_conv_graph(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py index 4390693d99b..0d4d3994837 100644 --- a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py +++ b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py @@ -4,38 +4,47 @@ import pytest import os + def skeleton_mnist_ridge_regression(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_mnist_ridge_regression: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/mnist_ridge_regression_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/mnist_ridge_regression_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=1, dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', data_reader_name='mnist', - model_folder='tests', model_name='mnist_ridge_regression', optimizer_name='adam', + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=1, dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='mnist', + model_folder='tests', model_name='mnist_ridge_regression', + optimizer_name='adam', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_mnist_ridge_regression_clang4(cluster, exes, dirname): skeleton_mnist_ridge_regression(cluster, exes, dirname, 'clang4') + def test_unit_mnist_ridge_regression_gcc4(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc4') + def test_unit_mnist_ridge_regression_gcc7(cluster, exes, dirname): skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc7') + def test_unit_mnist_ridge_regression_intel18(cluster, exes, dirname): skeleton_mnist_ridge_regression(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_mnist_ridge_regression_exe' --exe= def test_unit_mnist_ridge_regression_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_mnist_ridge_regression_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_mnist_ridge_regression(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py index e67ec7e8cb7..8718c0e5802 100644 --- 
a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py +++ b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py @@ -4,38 +4,47 @@ import pytest import os + def skeleton_mnist_softmax_classifier(cluster, executables, dir_name, compiler_name): if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) + e = 'skeleton_mnist_softmax_classifier: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/mnist_softmax_classifier_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/mnist_softmax_classifier_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=1, dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', data_reader_name='mnist', - model_folder='tests', model_name='mnist_softmax_classifier', optimizer_name='adam', + cluster=cluster, executable=executables[compiler_name], num_nodes=1, + num_processes=1, dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='mnist', + model_folder='tests', model_name='mnist_softmax_classifier', + optimizer_name='adam', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) assert return_code == 0 + def test_unit_mnist_softmax_classifier_clang4(cluster, exes, dirname): skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'clang4') + def test_unit_mnist_softmax_classifier_gcc4(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc4') + def test_unit_mnist_softmax_classifier_gcc7(cluster, exes, dirname): skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc7') + def test_unit_mnist_softmax_classifier_intel18(cluster, exes, dirname): skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'intel18') + # Run with python -m pytest -s test_unit_softmax_classifier.py -k 'test_unit_mnist_softmax_classifier_exe' --exe= def test_unit_mnist_softmax_classifier_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} + if exe is None: + e = 'test_unit_mnist_softmax_classifier_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'exe') diff --git a/cmake/configure_files/lbann_config.hpp.in b/cmake/configure_files/lbann_config.hpp.in index bdf17666422..76b50bc920c 100644 --- a/cmake/configure_files/lbann_config.hpp.in +++ b/cmake/configure_files/lbann_config.hpp.in @@ -31,6 +31,7 @@ #cmakedefine LBANN_HAS_ALUMINUM #cmakedefine LBANN_ALUMINUM_MPI_PASSTHROUGH #cmakedefine LBANN_HAS_CONDUIT +#cmakedefine LBANN_HAS_PYTHON #cmakedefine LBANN_DETERMINISTIC @@ -42,6 +43,9 @@ #cmakedefine LBANN_SYS_SENDFILE_OK +#cmakedefine LBANN_HAS_STD_ANY +#cmakedefine LBANN_HAS_STD_MAKE_UNIQUE + // Define the LBANN datatype namespace lbann { diff --git a/cmake/configure_files/lbann_module.lua.in b/cmake/configure_files/lbann_module.lua.in new file mode 100644 index 00000000000..754d2c6106d --- /dev/null +++ b/cmake/configure_files/lbann_module.lua.in @@ -0,0 +1,69 @@ +-- LMod module file for LBANN + +-- CMAKE_INSTALL_PREFIX: @CMAKE_INSTALL_PREFIX@ +-- CMAKE_BUILD_TYPE: @CMAKE_BUILD_TYPE@ +-- CXX Compiler: @CMAKE_CXX_COMPILER@ +-- CXX 
FLAGS: @CMAKE_CXX_FLAGS@ +-- CXX FLAGS_DEBUG: @CMAKE_CXX_FLAGS_DEBUG@ +-- CXX FLAGS_RELWITHDEBINFO: @CMAKE_CXX_FLAGS_RELWITHDEBINFO@ +-- CXX FLAGS_RELEASE: @CMAKE_CXX_FLAGS_RELEASE@ +-- LBANN_GNU_LINUX: @LBANN_GNU_LINUX@ +-- LBANN_HAS_HYDROGEN: @LBANN_HAS_HYDROGEN@ +-- LBANN_HAS_OPENCV: @LBANN_HAS_OPENCV@ +-- LBANN_HAS_CEREAL: @LBANN_HAS_CEREAL@ +-- LBANN_HAS_CUDA: @LBANN_HAS_CUDA@ +-- LBANN_HAS_CUDNN: @LBANN_HAS_CUDNN@ +-- LBANN_HAS_NCCL2: @LBANN_HAS_NCCL2@ +-- LBANN_HAS_PROTOBUF: @LBANN_HAS_PROTOBUF@ +-- LBANN_HAS_CNPY: @LBANN_HAS_CNPY@ +-- LBANN_HAS_TBINF: @LBANN_HAS_TBINF@ +-- LBANN_HAS_VTUNE: @LBANN_HAS_VTUNE@ +-- LBANN_NVPROF: @LBANN_NVPROF@ +-- LBANN_HAS_DOXYGEN: @LBANN_HAS_DOXYGEN@ +-- LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@ +-- LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@ +-- LBANN_HAS_CONDUIT: @LBANN_HAS_CONDUIT@ +-- LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@ + +help( +[[ +LBANN version @LBANN_VERSION@. Livermore Big Artificial Neural Network +Toolkit. A distributed memory, HPC-optimized, model and data parallel +training toolkit for deep neural networks. +]]) + +whatis("Package: LBANN") +whatis("Version: @LBANN_VERSION@") +whatis("Description: Livermore Big Artificial Neural Network Toolkit. A distributed memory, HPC-optimized, model and data parallel training toolkit for deep neural networks.") +whatis("URL: https://github.com/llnl/lbann") +whatis("CMAKE_INSTALL_PREFIX: @CMAKE_INSTALL_PREFIX@") +whatis("CMAKE_BUILD_TYPE: @CMAKE_BUILD_TYPE@") +whatis("CXX Compiler: @CMAKE_CXX_COMPILER@") +whatis("CXX FLAGS: @CMAKE_CXX_FLAGS@") +whatis("CXX FLAGS_DEBUG: @CMAKE_CXX_FLAGS_DEBUG@") +whatis("CXX FLAGS_RELWITHDEBINFO: @CMAKE_CXX_FLAGS_RELWITHDEBINFO@") +whatis("CXX FLAGS_RELEASE: @CMAKE_CXX_FLAGS_RELEASE@") +whatis("LBANN_GNU_LINUX: @LBANN_GNU_LINUX@") +whatis("LBANN_HAS_HYDROGEN: @LBANN_HAS_HYDROGEN@") +whatis("LBANN_HAS_OPENCV: @LBANN_HAS_OPENCV@") +whatis("LBANN_HAS_CEREAL: @LBANN_HAS_CEREAL@") +whatis("LBANN_HAS_CUDA: @LBANN_HAS_CUDA@") +whatis("LBANN_HAS_CUDNN: @LBANN_HAS_CUDNN@") +whatis("LBANN_HAS_NCCL2: @LBANN_HAS_NCCL2@") +whatis("LBANN_HAS_PROTOBUF: @LBANN_HAS_PROTOBUF@") +whatis("LBANN_HAS_CNPY: @LBANN_HAS_CNPY@") +whatis("LBANN_HAS_TBINF: @LBANN_HAS_TBINF@") +whatis("LBANN_HAS_VTUNE: @LBANN_HAS_VTUNE@") +whatis("LBANN_NVPROF: @LBANN_NVPROF@") +whatis("LBANN_HAS_DOXYGEN: @LBANN_HAS_DOXYGEN@") +whatis("LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@") +whatis("LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@") +whatis("LBANN_HAS_CONDUIT: @LBANN_HAS_CONDUIT@") +whatis("LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@") + +prepend_path("PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@") +prepend_path("LD_LIBRARY_PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@") +prepend_path("PYTHONPATH","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") + +pushenv("LBANN_DIR","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_DIR@") + diff --git a/cmake/configure_files/python_config.ini.in b/cmake/configure_files/python_config.ini.in new file mode 100644 index 00000000000..3ed3de25aef --- /dev/null +++ b/cmake/configure_files/python_config.ini.in @@ -0,0 +1,3 @@ +[Paths] +lbann_pb2.py = @_LBANN_PB2_PY@ +lbann_exe = @_LBANN_EXE@ diff --git a/cmake/configure_files/setup.py.in b/cmake/configure_files/setup.py.in new file mode 100644 index 00000000000..bd6dae0516b --- /dev/null +++ b/cmake/configure_files/setup.py.in @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +import os.path +import setuptools + +# Variables from CMake +version = '@LBANN_VERSION@' +src_dir = '@_LBANN_PYTHON_DIR@' +config_file = '@_PYTHON_CONFIG_INI@' + +# 
Get relative paths +# Note: setuptools does not accept absolute paths +current_dir = os.path.dirname(os.path.abspath(__file__)) +src_dir = os.path.relpath(os.path.abspath(src_dir), current_dir) +config_file = os.path.relpath(os.path.abspath(config_file), current_dir) + +# Setup package +setuptools.setup( + name='lbann', + description='LBANN: Livermore Big Artificial Neural Network', + version=version, + url='https://github.com/LLNL/lbann', + author='Lawrence Livermore National Security, LLC.', + license='Apache 2.0', + packages=setuptools.find_packages(src_dir), + package_dir={'': src_dir}, + data_files=[('lbann', [config_file])], + install_requires=['graphviz>=0.10.1', + 'matplotlib>=2.0.2', + 'numpy>=1.16.0', + 'onnx>=1.3.0', + 'pandas>=0.24.1', + 'protobuf>=3.6.1', + 'texttable>=1.4.0']) diff --git a/cmake/modules/FindPython.cmake b/cmake/modules/FindPython.cmake new file mode 100644 index 00000000000..62c7945174f --- /dev/null +++ b/cmake/modules/FindPython.cmake @@ -0,0 +1,90 @@ +# Detect Python interpreter and Python C API +# +# This makes several improvements over the FindPython.cmake module +# that comes included with CMake: +# - The stock version ignores user-provided hints if it thinks it has +# found a newer Python version. This is a problem if a virtual +# environment doesn't override the 'python<major>.<minor>' +# executable since that executable will take +# precedence. User-provided hints now take precedence. +# - Python C API objects are deduced by querying the Python +# interpreter rather than directly looking for files. This is +# helpful if a virtual environment doesn't create all the +# necessary copies or symlinks. +# +# Hint variables +# +# Python_EXECUTABLE +# Python_ROOT_DIR +# +# Exports the following variables +# +# Python_FOUND +# Python_EXECUTABLE +# Python_VERSION +# Python_VERSION_MAJOR +# Python_VERSION_MINOR +# Python_VERSION_PATCH +# Python_INCLUDE_DIRS +# Python_LIBRARIES +# + +set(Python_FOUND FALSE) + +# Find executable +if (NOT Python_EXECUTABLE) + if (Python_ROOT_DIR) + set(_HINT "${Python_ROOT_DIR}/bin") + endif (Python_ROOT_DIR) + find_program(Python_EXECUTABLE + NAMES python3 python + HINTS "${_HINT}") +endif (NOT Python_EXECUTABLE) +if (NOT Python_EXECUTABLE) + message(WARNING "Could not find Python executable") + return() +endif (NOT Python_EXECUTABLE) + +# Get version +execute_process( + COMMAND "${Python_EXECUTABLE}" "-c" + "import sys; sys.stdout.write('.'.join([str(x) for x in sys.version_info[:3]]))" + OUTPUT_VARIABLE Python_VERSION) +string(REGEX MATCHALL "[0-9]+" _VERSION_PARSED "${Python_VERSION}") +list(GET _VERSION_PARSED 0 Python_VERSION_MAJOR) +list(GET _VERSION_PARSED 1 Python_VERSION_MINOR) +list(GET _VERSION_PARSED 2 Python_VERSION_PATCH) + +# Find Python C API +execute_process( + COMMAND "${Python_EXECUTABLE}" "-c" + "import sys; from distutils.sysconfig import get_python_inc; sys.stdout.write(get_python_inc())" + OUTPUT_VARIABLE Python_INCLUDE_DIRS) +execute_process( + COMMAND "${Python_EXECUTABLE}" "-c" + "import sys; from distutils.sysconfig import get_config_var; sys.stdout.write(get_config_var('LIBDIR'))" + OUTPUT_VARIABLE _LIB_DIR) +if (BUILD_SHARED_LIBS) + set(_GLOB_EXPR "${_LIB_DIR}/libpython*${CMAKE_SHARED_LIBRARY_SUFFIX}") +ELSE (BUILD_SHARED_LIBS) + set(_GLOB_EXPR "${_LIB_DIR}/libpython*${CMAKE_STATIC_LIBRARY_SUFFIX}") +endif (BUILD_SHARED_LIBS) +FILE(GLOB _GLOB_RESULT "${_GLOB_EXPR}") +get_filename_component(Python_LIBRARIES "${_GLOB_RESULT}" ABSOLUTE) + +# Handle the find_package arguments +include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args( + Python + REQUIRED_VARS Python_EXECUTABLE Python_INCLUDE_DIRS Python_LIBRARIES + Python_VERSION_MAJOR Python_VERSION_MINOR Python_VERSION_PATCH + VERSION_VAR Python_VERSION) + +# Build the imported target +if (NOT TARGET Python::Python) + add_library(Python::Python INTERFACE IMPORTED) + set_property(TARGET Python::Python + PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${Python_INCLUDE_DIRS}") + set_property(TARGET Python::Python + PROPERTY INTERFACE_LINK_LIBRARIES "${Python_LIBRARIES}") +endif (NOT TARGET Python::Python) diff --git a/cmake/modules/SetupCXX.cmake b/cmake/modules/SetupCXX.cmake index eeac1e6336e..ef2e1d9415c 100644 --- a/cmake/modules/SetupCXX.cmake +++ b/cmake/modules/SetupCXX.cmake @@ -74,7 +74,8 @@ endif () # Initialize C++ flags lbann_check_and_append_flag(CMAKE_CXX_FLAGS - -fPIC -g -Wall -Wextra -Wno-unused-parameter -Wnon-virtual-dtor -Wshadow) + -fPIC -g -Wall -Wextra -Wno-unused-parameter -Wnon-virtual-dtor -Wshadow + -Wno-deprecated-declarations) # Disable all optimization in debug for better viewing under debuggers # (cmake already adds -g) @@ -150,3 +151,16 @@ endif () # Check if we can use Linux's sys/sendfile.h check_include_file_cxx(sys/sendfile.h LBANN_SYS_SENDFILE_OK) + +# Testing for std::any +include(CheckCXXSourceCompiles) +set(_ANY_TEST_CODE + "#include <any> +int main(int, char* argv[]) { std::any x; }") +check_cxx_source_compiles("${_ANY_TEST_CODE}" LBANN_HAS_STD_ANY) + +set(_MAKE_UNIQUE_TEST_CODE + "#include <memory> +int main(int, char* argv[]) { auto x = std::make_unique<double>(); }") +check_cxx_source_compiles( + "${_MAKE_UNIQUE_TEST_CODE}" LBANN_HAS_STD_MAKE_UNIQUE) diff --git a/cmake/modules/SetupProtobuf.cmake b/cmake/modules/SetupProtobuf.cmake index 51fac7f9f33..cfe37b87b30 100644 --- a/cmake/modules/SetupProtobuf.cmake +++ b/cmake/modules/SetupProtobuf.cmake @@ -41,7 +41,7 @@ else () if(NOT Protobuf_FOUND) find_package(Protobuf "${PROTOBUF_MIN_VERSION}" CONFIG QUIET REQUIRED) endif () - message("Found Protobuf: ${Protobuf_DIR}") + message(STATUS "Found Protobuf: ${Protobuf_DIR}") endif () if (NOT Protobuf_FOUND) diff --git a/containers/README.md b/containers/README.md index 224510e9156..c0bceafbb54 100644 --- a/containers/README.md +++ b/containers/README.md @@ -1,42 +1,42 @@ ## Singularity - + [Singularity](http://singularity.lbl.gov/) - + First build a Singularity container with the lbann.def file: ``` sudo singularity build --writable lbann.img lbann.def ``` *Note: Building the image requires root access.* - + *Note: --writable allows users to make changes inside the container (Required for LC).* -This will create a container called lbann.img which can be used to invoke lbann on any system with singularity and openmpi installed. +This will create a container called lbann.img which can be used to invoke lbann on any system with singularity and openmpi installed. ### Customizing Configuration in lbann.def -Singularity is designed to take advantage of underlying HPC resources. The lbann.def file in this directory specifically installs packages necessary for infiniband interconnects (lines 15-19). It builds openmpi outside of the spack step to ensure it is built with infiniband support (lines 37-55). Experienced users should modify these sections to match with the underlying resources they intend to run on. This defintion file also builds gcc version 4.9.3, and uses it to build openmpi and lbann (lines 33-35). This is also customized to run on specific LC resources, and can be modified depending on the users system.
+Singularity is designed to take advantage of underlying HPC resources. The lbann.def file in this directory specifically installs packages necessary for infiniband interconnects (lines 15-19). It builds openmpi outside of the spack step to ensure it is built with infiniband support (lines 37-55). Experienced users should modify these sections to match the underlying resources they intend to run on. This definition file also builds gcc version 4.9.3, and uses it to build openmpi and lbann (lines 33-35). This is also customized to run on specific LC resources, and can be modified depending on the user's system. ### Running LBANN with Singularity To run LBANN, use mpirun and Singularity's exec command: ``` salloc -N2 - mpirun -np 4 singularity exec -B /p:/p lbann.img /lbann/spack_builds/singularity_optimizied_test/model_zoo/lbann mpirun -np 4 singularity exec -B /p:/p lbann.img /lbann/spack_builds/singularity/model_zoo/lbann --model=/lbann/model_zoo/tests/model_mnist_distributed_io.prototext --reader=/lbann/model_zoo/data_readers/data_reader_mnist.prototext --optimizer=/lbann/ model_zoo/optimizers/opt_adagrad.prototext +mpirun -np 4 singularity exec -B /p:/p lbann.img /lbann/spack_builds/singularity_optimizied_test/model_zoo/lbann mpirun -np 4 singularity exec -B /p:/p lbann.img /lbann/spack_builds/singularity/model_zoo/lbann --model=/lbann/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext --reader=/lbann/model_zoo/data_readers/data_reader_mnist.prototext --optimizer=/lbann/ model_zoo/optimizers/opt_adagrad.prototext ``` *Note: The singularity -B flag binds directories from the surrounding filesystem to the container. Be sure to include any necessary files using this command (i.e., model prototext files, datasets, etc.). Alternatively, system admins are capable of allowing a singularity container to utilize the host's filesystem. This is done by changing MOUNT HOSTFS in the singularity config file.* ## Docker - + [Docker](https://www.docker.com/) - + First build a Docker image with the Dockerfile. From whichever directory contains the Dockerfile: ``` docker build -t dockban . ``` - + *Note: The -t flag specifies an identifying tag for this image. "dockban" can be changed to any desired tag.* - + ### Customizing Configuration in Dockerfile - The Dockerfile container defintion is less complicated than its Singularity counterpart. gcc 7.1.0 is built and registered with spack in lines 19-21. Users can change this, as well as LBANN specific build options in spack (line 22). For instance, to add gpu support a user can add "+gpu" to this line. - + The Dockerfile container definition is less complicated than its Singularity counterpart. gcc 7.1.0 is built and registered with spack in lines 19-21. Users can change this, as well as LBANN-specific build options in spack (line 22). For instance, to add gpu support a user can add "+gpu" to this line. + ### Running LBANN with Docker -This LBANN build also uses openmpi, so lbann can be launched with mpirun here as well. However, this example will just show the single process invocation. +This LBANN build also uses openmpi, so lbann can be launched with mpirun here as well. However, this example will just show the single-process invocation. Start a docker container from the previously created image, and attach to it.
Make sure to bind any necessary directories using -v: ``` @@ -44,5 +44,5 @@ docker run -it -v $HOME/MNIST:/MNIST dockban ``` Run LBANN as you would outside of a container: ``` -./spack_build/docker_build/model_zoo/lbann --model=model_zoo/models/lenet_mnist/model_lenet_mnist.prototext --reader=model_zoo/data_readers/data_reader_mnist.prototext --optimizer=model_zoo/optimizers/opt_sgd.prototext +./spack_build/docker_build/model_zoo/lbann --model=model_zoo/models/lenet_mnist/model_lenet_mnist.prototext --reader=model_zoo/data_readers/data_reader_mnist.prototext --optimizer=model_zoo/optimizers/opt_sgd.prototext ``` diff --git a/docs/BuildRSTDocs.py b/docs/BuildRSTDocs.py new file mode 100644 index 00000000000..5a783e493c6 --- /dev/null +++ b/docs/BuildRSTDocs.py @@ -0,0 +1,355 @@ +#from RSTDocsFlavorText import * + +import xml.etree.ElementTree as etree +import os, runpy + +rst_docs_globals = runpy.run_path("RSTDocsFlavorText.py") +lbann_rst_headers = rst_docs_globals["lbann_rst_headers"] +lbann_rst_flavor_text = rst_docs_globals["lbann_rst_flavor_text"] + +# Some globals cuz lazy +xml_root_dir = 'doxy_out/xml/' + +def strip_template(class_name): + ind = class_name.find('<') + if ind > 0: + return class_name[0:ind] + return class_name + +# This will return a list of length 2, [longest_namespace, class_name] +# E.g., split_namespace("A::B::C::myclass") will return ["A::B::C","myclass"]. +# +# If no namespace, the first entry will be empty string. +def split_namespace(class_name): + ind = class_name.rfind(':') + if ind > 0: + return class_name[0:ind-1], class_name[ind+1:] + return "", class_name + +def strip_namespace(class_name): + ind = class_name.rfind(':') + if ind > 0: + return class_name[ind+1:] + return class_name + +def get_known_subdirs(topdir, all_dirs): + subdirs = [] + for d in all_dirs: + if d == topdir: continue + commonprefix = os.path.commonprefix([topdir, d]) + if commonprefix == topdir: + if os.path.dirname(d) == topdir: + subdirs.append(d) + return subdirs + +def is_abstract_class_from_element(class_element): + abstract = class_element.get('abstract') + if abstract is None or abstract == 'no': + return False + return abstract == 'yes' + +def is_abstract_class(class_name, xml_file): + class_tree = etree.parse(xml_file) + class_root = class_tree.getroot() + + class_element = class_root.find('compounddef') + if class_element.findtext('compoundname') != class_name: + raise Exception('bad compoundname') + return is_abstract_class_from_element(class_element) + +def is_base_class_from_element(class_element): + base_element = class_element.find('basecompoundref') + if base_element is not None: + return False + return True + +def is_base_class(class_name, xml_file): + class_tree = etree.parse(xml_file) + class_root = class_tree.getroot() + + class_element = class_root.find('compounddef') + if class_element.findtext('compoundname') != class_name: + raise Exception('bad compoundname') + return is_base_class_from_element(class_element) + +def get_class_directory_from_element(class_element): + loc = class_element.find('location') + if loc is None: + raise Exception("Class has no location") + filename = loc.get('bodyfile') + if filename is None: + filename = loc.get('file') + if filename is None: + raise Exception("No file or bodyfile in location") + return os.path.dirname(filename) + +def get_class_directory(class_name, xml_file): + class_tree = etree.parse(xml_file) + class_root = class_tree.getroot() + class_name = strip_template(class_name) + + class_element = 
class_root.find('compounddef') + if class_element.findtext('compoundname') != class_name: + raise Exception('compoundname "' + class_element.findtext('compoundname') + '" does not match "' + class_name + '"') + return get_class_directory_from_element(class_element) + +def is_base_class_rel_to_dir(class_name, xml_file): + class_tree = etree.parse(xml_file) + class_root = class_tree.getroot() + + class_element = class_root.find('compounddef') + if class_element.findtext('compoundname') != class_name: + raise Exception('bad compoundname') + + this_class_dir = get_class_directory_from_element(class_element) + base_element = class_element.find('basecompoundref') + + if base_element is None: + return True + + base_name = base_element.text + base_class_refid = base_element.get('refid') + if base_class_refid is None: # Base class not found + return True + + base_class_xml = os.path.join( + xml_root_dir,base_class_refid+'.xml') + base_class_dir = get_class_directory(base_name, base_class_xml) + + return base_class_dir != this_class_dir + +def is_public_class_from_element(class_element): + return class_element.get('prot') == 'public' + +def is_public_class(class_name, xml_file): + class_tree = etree.parse(xml_file) + class_root = class_tree.getroot() + + class_element = class_root.find('compounddef') + if class_element.findtext('compoundname') != class_name: + raise Exception('bad compoundname') + + return is_public_class_from_element(class_element) + +# Write a simple RST file for a class +def write_class_rst_file(class_name, breathe_project_name, *args, **kwargs): + namespace = kwargs.get('namespace', '') + display_name = kwargs.get('display_name', class_name) + description = kwargs.get('description', '') + header_string = kwargs.get('header_string', '') + output_dir = kwargs.get('output_dir', os.getcwd()) + output_filename = kwargs.get('output_filename', '') + subclasses = kwargs.get('subclasses', {}) + + # Handle defaults more rigorously + if namespace == '': + namespace, class_name = split_namespace(class_name) + if not namespace == '': + namespace = namespace + '::' + + # Possibly rebuild the structure since breathe needs namespace + # information + full_class_name = namespace + class_name + + if output_filename == '': + output_filename = class_name + '.rst' + + if header_string == '': + header_string = "Documentation of "+display_name + + equal_string = '=' * (len(header_string) + 5) + + output_file = os.path.join(output_dir,output_filename) + with open(output_file, 'w') as f: + f.write(header_string + '\n') + f.write(equal_string + '\n\n') + if description != '': + f.write(description + '\n\n') + else: + f.write('\n') + f.write('.. doxygenclass:: ' + full_class_name + '\n') + f.write(' :project: ' + breathe_project_name + '\n') + f.write(' :members:\n\n') + if len(subclasses) > 0: + f.write('.. toctree::\n') + f.write(' :maxdepth: 1\n') + f.write(' :caption: Derived Classes\n\n') + for sc, sc_out_dir in subclasses.items(): + sc_no_ns = strip_namespace(sc) + if sc_out_dir == output_dir: + sc_rst_path = sc_no_ns + else: + sc_rst_path = os.path.join( + os.path.relpath(sc_out_dir, output_dir), + sc_no_ns); + f.write(' ' + sc_no_ns + ' <' + sc_rst_path + '>\n') + return + +# Adds things from rhs into lhs. 
Keys are anything, values are lists +def merge_dir_class_maps(lhs, rhs): + for d, cls in rhs.items(): + if d not in lhs: + lhs[d] = cls + else: + lhs[d] += cls + +# Writes a file called "strip_namespace(class_name).rst" in "output_dir" +def process_class(class_name, xml_file, output_root_dir): + + # Get the XML tree for this class + class_tree = etree.parse(xml_file) + class_root = class_tree.getroot() + + # Get the description of this class + compounds_in_file = class_root.findall('compounddef') + if len(compounds_in_file) > 1: + raise Exception("Found multiple compounds in file: "+xml_file) + + compound = compounds_in_file[0] + + # Ensure there's nothing funky in the file. + if compound.findtext('compoundname') != class_name: + raise Exception("Found unexpected compounddef \"" + + compound.findtext('compoundname') + + "\" in file: " + xml_file) + if compound.get('kind') != "class": + raise Exception("\"" + class_name + "\" does not have kind=\"class\". " + + "File: " + xml_file) + + # Build the output directory path + class_dir = get_class_directory_from_element(compound) + output_dir = os.path.relpath(class_dir, "../include/lbann/") + # Add the base prefix + file_output_dir = os.path.normpath( + os.path.join(output_root_dir, output_dir)) + if not os.path.exists(file_output_dir): + os.makedirs(file_output_dir) + + # Build output for all derived classes + subclasses = {} + output_dir_class_map = {} + for derived in compound.iter('derivedcompoundref'): + derived_name = strip_template(derived.text) + derived_xml = os.path.join(xml_root_dir, + derived.get('refid') + ".xml") + + sc_out_dir, sc_dir_class_map = process_class( + derived_name, derived_xml, output_root_dir) + + merge_dir_class_maps(output_dir_class_map, sc_dir_class_map) + subclasses[derived_name] = sc_out_dir + + # Write the RST for this class + header_string = "Documentation of " + class_name + write_class_rst_file(class_name, "lbann", + output_dir=file_output_dir, + subclasses=subclasses) + + # Add this class to the map + if file_output_dir not in output_dir_class_map: + output_dir_class_map[file_output_dir] = [class_name] + else: + output_dir_class_map[file_output_dir].append(class_name) + + return file_output_dir, output_dir_class_map + +# +# Actual code starts here +# Let's see if I can write everything +# + +# Set the XML output directory relative to this directory +xml_root_dir = 'doxy_out/xml/' +index_tree = etree.parse(xml_root_dir + 'index.xml') +index_root = index_tree.getroot() + +# Set the RST output directory relative to this directory +rst_base_dir = "lbann" + +# Find all classes in the index +class_to_file_map = {} +for neighbor in index_root.iter('compound'): + if neighbor.get('kind') == 'class': + class_to_file_map[neighbor.findtext('name')] \ + = os.path.join(xml_root_dir,neighbor.get('refid') + '.xml') + +# Build all of the class documentation +dir_all_class_map = {} +for cls, fn in class_to_file_map.items(): + if is_base_class(cls, fn) and is_public_class(cls, fn): + out_dir, all_out_dirs = process_class(cls, fn, rst_base_dir) + merge_dir_class_maps(dir_all_class_map, all_out_dirs) + +# Write the high-level files, one file per directory except where +# noted below +ignore_dirs = [ os.path.join(rst_base_dir,d) for d in + ['data_distributions','utils/impl']] + +# Remove the ignored dirs from the map +for d in ignore_dirs: + if d in dir_all_class_map: + del dir_all_class_map[d] + +all_dirs = list(dir_all_class_map.keys()) +for d in all_dirs: + dir_without_base = os.path.relpath(d, rst_base_dir) + if 
dir_without_base in lbann_rst_headers: + header_string = lbann_rst_headers[dir_without_base] + else: + header_string = os.path.basename(dir_without_base) + + equal_string = '=' * (len(header_string) + 5) + + if dir_without_base in lbann_rst_flavor_text: + flavor_text = lbann_rst_flavor_text[dir_without_base] + else: + flavor_text = None + + abstract_classes = [] + concrete_classes = [] + for c in dir_all_class_map[d]: + if is_abstract_class(c, class_to_file_map[c]): + abstract_classes.append(strip_namespace(c)) + else: + concrete_classes.append(strip_namespace(c)) + + abstract_classes.sort() + concrete_classes.sort() + + subdirs = [os.path.basename(d) for d in get_known_subdirs(d, all_dirs)] + subdirs.sort() + + if dir_without_base == '.': + filename = os.path.join(rst_base_dir, "lbann.rst") + else: + filename = os.path.join(d,os.path.basename(d)+'_dir.rst') + + with open(filename, 'w') as f: + f.write(header_string+'\n') + f.write(equal_string+'\n\n') + if flavor_text is not None: + f.write(flavor_text+'\n') + + if len(abstract_classes) > 0: + f.write('\n') + f.write('.. toctree::'+'\n') + f.write(' :maxdepth: 1'+'\n') + f.write(' :caption: Abstract Classes\n\n') + for cls in abstract_classes: + f.write(' class '+cls+' <'+cls+'>\n') + + if len(concrete_classes) > 0: + f.write('\n') + f.write('.. toctree::'+'\n') + f.write(' :maxdepth: 1'+'\n') + f.write(' :caption: Concrete Classes\n\n') + for cls in concrete_classes: + f.write(' class '+cls+' <'+cls+'>\n') + + if len(subdirs) > 0: + f.write('\n') + f.write('.. toctree::'+'\n') + f.write(' :maxdepth: 1'+'\n') + f.write(' :caption: Subdirectories\n\n') + for sdir in subdirs: + f.write(' '+sdir+'/'+sdir+'_dir\n')
diff --git a/docs/BuildingLBANN.md b/docs/BuildingLBANN.md deleted file mode 100644 index 4643c7d3354..00000000000 --- a/docs/BuildingLBANN.md +++ /dev/null @@ -1,533 +0,0 @@

# Building LBANN

## Download

LBANN source code can be obtained from the [Github repo](https://github.com/LLNL/lbann).

## Dependencies

The following packages and tools are required to build LBANN. All packages listed below may be installed using [Spack](https://github.com/llnl/spack). See below for more details on using Spack to build a complete LBANN environment.

The following basic tools are **required**.

+ A C++11-compliant compiler.
+ OpenMP, version 3.0 or newer.
+ An MPI-3.0 implementation.
+ [CEREAL](https://github.com/USCiLab/cereal) is used to handle complex serialization tasks.
+ [CMake](https://cmake.org), version 3.9 or newer.

The following LLNL-maintained packages are **required**.

+ [Hydrogen](https://github.com/llnl/elemental) is a fork of the [Elemental](https://github.com/elemental/elemental) distributed dense linear-algebra library and it may be installed via [Spack](https://github.com/llnl/spack) using the package name "hydrogen". If CUDA support is enabled in Hydrogen, LBANN will inherit this support.

The following third-party packages are **required**.

+ [CNPY](https://github.com/rogersce/cnpy.git) is used to ingest data in NumPy format. In principle this should be optional, but at the time of writing, LBANN will not build without it.
+ [OpenCV](https://github.com/opencv/opencv) is used to preprocess image data. For performance reasons, it is recommended to build OpenCV with [JPEG-turbo](https://github.com/libjpeg-turbo/libjpeg-turbo) for JPEG format support.
+ [ProtoBuf](https://github.com/protocolbuffers/protobuf) is used to express models in a portable format.
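As a rough sketch, and assuming the Spack package names `cereal`, `cnpy`, `opencv`, and `protobuf` correspond to the libraries above (an assumption, not verified here), the required third-party packages could be installed in one shot:
```bash
# Hypothetical one-shot install; exact specs, variants, and versions
# depend on your Spack configuration and system.
spack install cereal cnpy opencv protobuf
```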
The following LLNL-maintained packages are **optional**.

+ [Aluminum](https://github.com/llnl/aluminum) is a communication library optimized for machine learning and interaction with GPUs. We cannot recommend its use strongly enough. It can be built using [Spack](https://github.com/llnl/spack).
+ [CONDUIT](https://github.com/llnl/conduit) is used to ingest structured data produced by scientific simulations.

The following third-party packages are **optional**.

+ [CUDA](https://developer.nvidia.com/cuda-toolkit). The development team currently uses CUDA version 9.2. Building with CUDA support requires that Hydrogen has been built with CUDA support (see below).
  + [cuDNN](https://developer.nvidia.com/cudnn) is required if building LBANN with CUDA support. It is freely available as a binary distribution from NVIDIA.
+ [HWLOC](https://www.open-mpi.org/projects/hwloc/). HWLOC enables LBANN to make certain optimizations based on the hardware topology. Its use is strongly recommended.
+ NVTX. LBANN supports some improved annotations for NVPROF using NVTX. NVTX is provided as part of the CUDA toolkit.
+ VTune. LBANN supports some improved annotations for VTune.

## Building with [Spack](https://github.com/llnl/spack)

### Setup Spack and local base tools

1. Download and install [Spack](https://github.com/llnl/spack). Additionally, set up shell support as discussed [here](https://spack.readthedocs.io/en/latest/module_file_support.html#id2).

   ```bash
   . ${SPACK_ROOT}/share/spack/setup-env.sh
   ```

2. Set up your compiler and external software environment. For example, on LLNL's LC machines, one might load the following modules:
   ```bash
   ml gcc/7.3.0 mvapich2/2.3 cuda/10.0.130 # Pascal
   ```
   or
   ```bash
   ml gcc/7.3.1 cuda/9.2.148 spectrum-mpi/rolling-release # Lassen / Sierra
   ```

   + Note: to unload unwanted modules you can execute `ml` with package names prepended with a dash, e.g.: `ml -intel`. To unload all currently loaded modules, use `ml purge`.

### Building & Installing LBANN as a user

This section is a work in progress. For now, follow the developer instructions below. We are working to simplify this process.

### Building & Installing LBANN as a developer

Developers of LBANN will often need to interact with the source code and/or advanced configuration options for Aluminum, Hydrogen, and LBANN while the other dependencies remain constant. The Spack installation instructions below set up a Spack environment with the remaining dependencies, requiring the developer to build Aluminum, Hydrogen, and LBANN separately, by whatever means they choose.

1. Establish a Spack environment and install software dependencies. Note that there are four environments to pick from along two axes:

   1. developers or users
   2. x86_64 and ppc64le

   For example, if you are a developer and want to build inside the git repo, use the following instructions:
   ```bash
   export LBANN_HOME=/path/to/lbann/git/repo
   export LBANN_BUILD_DIR=/path/to/a/build/directory
   export LBANN_INSTALL_DIR=/path/to/an/install/directory
   cd ${LBANN_BUILD_DIR}
   spack env create -d . ${LBANN_HOME}/spack_environments/developer_release_<arch>_cuda_spack.yaml # where <arch> = x86_64 | ppc64le
   spack install
   spack env loads # Spack creates a file named loads that has all of the correct modules
   source loads
   unset LIBRARY_PATH
   ```

   + Note that the environments provided here have a set of external packages and compilers that are installed on an LLNL LC CZ system. Please update these for your system environment. Alternatively, you can create baseline versions of the user-level Spack configuration files and remove the externals and compilers from the `spack.yaml` file. More details are provided [here](spack_environment.md).

   + Note that the initial build of all of the standard packages in Spack will take a while.

   + Note that the Spack module files set the `LIBRARY_PATH` environment variable. This behavior allows autotools-based builds to pick up the correct libraries but interferes with the way that CMake sets up RPATHs. To correctly establish the RPATH, please unset the variable as noted above, or explicitly pass the RPATH fields to CMake using a command such as:
   ```bash
   cmake -DCMAKE_INSTALL_RPATH=$(sed 's/:/;/g' <<< "${LIBRARY_PATH}") \
         -DCMAKE_BUILD_RPATH=$(sed 's/:/;/g' <<< "${LIBRARY_PATH}") \
         ...
   ```

2. Build LBANN locally from source and build Hydrogen and Aluminum using the superbuild. See below for a list and descriptions of all CMake flags known to LBANN's "Superbuild" build system. A representative CMake command line that expects the `LBANN_HOME`, `LBANN_BUILD_DIR`, and `LBANN_INSTALL_DIR` environment variables might be:
   ```bash
   cd ${LBANN_BUILD_DIR}
   cmake \
     -G Ninja \
     -D LBANN_SB_BUILD_ALUMINUM=ON \
     -D ALUMINUM_ENABLE_MPI_CUDA=OFF \
     -D ALUMINUM_ENABLE_NCCL=ON \
     -D LBANN_SB_BUILD_HYDROGEN=ON \
     -D Hydrogen_ENABLE_CUDA=ON \
     -D LBANN_SB_BUILD_LBANN=ON \
     -D CMAKE_BUILD_TYPE:STRING=Release \
     -D LBANN_WITH_CUDA:BOOL=ON \
     -D LBANN_WITH_NVPROF:BOOL=ON \
     -D LBANN_DATATYPE:STRING=float \
     -D LBANN_WITH_TOPO_AWARE:BOOL=ON \
     -D LBANN_WITH_ALUMINUM:BOOL=ON \
     -D LBANN_WITH_CONDUIT:BOOL=ON \
     -D LBANN_WITH_CUDNN:BOOL=ON \
     -D LBANN_WITH_NCCL:BOOL=ON \
     -D LBANN_WITH_SOFTMAX_CUDA:BOOL=ON \
     -D LBANN_SEQUENTIAL_INITIALIZATION:BOOL=OFF \
     -D LBANN_WITH_TBINF=OFF \
     -D LBANN_WITH_VTUNE:BOOL=OFF \
     -D CMAKE_INSTALL_PREFIX:PATH=${LBANN_INSTALL_DIR} \
     ${LBANN_HOME}/superbuild

   ninja
   ```

## Building with [CMake](https://cmake.org)

LBANN uses [CMake](https://cmake.org) for its build system and a version newer than or equal to 3.9.0 is required. LBANN development is done primarily on UNIX-based platforms. As such, the build is tested regularly on Linux-based machines, occasionally on OSX, and never on Windows machines.

It is required that LBANN be built out-of-source. That is, CMake must not be invoked in a directory containing a CMakeLists.txt file.

### LBANN CMake options

The following options are exposed in the CMake build system.

+ `LBANN_WITH_ALUMINUM` (Default: `OFF`): Use the Aluminum communication package. This will be set to `ON` automatically if Hydrogen was built with Aluminum.

+ `LBANN_WITH_CNPY` (Default: `ON`): Build with support for CNPY for reading Numpy data.

+ `LBANN_WITH_CONDUIT` (Default: `OFF`): Build with support for CONDUIT.

+ `LBANN_WITH_NVPROF` (Default: `OFF`): Build with extra annotations for NVPROF.
+ `LBANN_WITH_TOPO_AWARE` (Default: `ON`): Use HWLOC for topology-aware choices.

+ `LBANN_WITH_TBINF` (Default: `ON`): Enable the Tensorboard interface.

+ `LBANN_WITH_VTUNE` (Default: `OFF`): Build with extra annotations for VTune.

+ `LBANN_DETERMINISTIC` (Default: `OFF`): Force as much of the code as possible to be deterministic. This is not a guarantee, as certain operations in third-party libraries cannot be forced into a deterministic mode, especially for CUDA-enabled builds.

+ `LBANN_SEQUENTIAL_INITIALIZATION` (Default: `OFF`): Force sequentially consistent initialization of data structures.

+ `LBANN_WARNINGS_AS_ERRORS` (Default: `OFF`): Promote compiler warnings to errors. This should be used by developers only. Developers are encouraged to build with this `ON` prior to merging any code into the repository.

+ `LBANN_USE_PROTOBUF_MODULE` (Default: `OFF`): Search for Protobuf using CMake's `FindProtobuf.cmake` module instead of the Protobuf config file. This is useful on platforms with differently architected compute nodes or when the config method is inexplicably failing.

The following variables may also be set:

+ `LBANN_DATATYPE` (Default: `float`): The datatype to use for training. Currently this must be `float` or `double`.

The following variable has been deprecated and removed:

+ `LBANN_WITH_CUDA`. The "CUDA-ness" of LBANN is now tied 1:1 with the "CUDA-ness" of Hydrogen. At present, it seems like unnecessary overhead to support the situation in which Hydrogen has CUDA support but LBANN doesn't want to use it, until a compelling use-case reveals itself.

### Controlling dependency resolution

The following variables may be set with CMake to identify dependencies that are not installed into the "typical" locations that CMake searches by default. They may be either exported into the environment used by CMake using whatever mechanisms are allowed by the shell, or passed to CMake as a cache variable (e.g., `cmake -DPKG_DIR=/path/to/pkg`). The latter option is recommended.

+ `Aluminum_DIR` or `ALUMINUM_DIR` or `AL_DIR`: The path to _either_ the Aluminum installation prefix _or_ the AluminumConfig.cmake file. If Hydrogen has not been built with Aluminum support, set `LBANN_WITH_ALUMINUM=ON` to enable Aluminum support.
+ `CEREAL_DIR`: The path to _either_ the CEREAL installation prefix _or_ the cereal-config.cmake file.
+ `CNPY_DIR`: The path to the CNPY installation prefix. Must set `LBANN_WITH_CNPY=ON` to enable CNPY support.
+ `Conduit_DIR` or `CONDUIT_DIR`: The path to _either_ the CONDUIT installation prefix _or_ the conduit.cmake file. Must set `LBANN_WITH_CONDUIT=ON` to enable CONDUIT support.
  + `HDF5_DIR`: The path to _either_ the HDF5 installation prefix _or_ the hdf5_config.cmake file. There is a known issue with CONDUIT that it may link to HDF5 but not properly export that dependency.
+ `HWLOC_DIR`: The path to the HWLOC installation prefix. Must set `LBANN_WITH_HWLOC=ON` to enable HWLOC support.
+ `Hydrogen_DIR` or `HYDROGEN_DIR`: The path to _either_ the Hydrogen installation prefix _or_ the HydrogenConfig.cmake file.
+ `NVTX_DIR`: The path to the prefix of NVTX. This should not be used except in circumstances in which one might want to link to a different NVTX installation than the CUDA toolkit. Under normal circumstances, if CUDA was found without issue, NVTX should be as well.
+ `OpenCV_DIR` or `OPENCV_DIR`: The path to _either_ the OpenCV installation prefix _or_ the OpenCVConfig.cmake file.
+ `Protobuf_DIR` or `PROTOBUF_DIR`: The path to _either_ the Protobuf installation prefix _or_ the protobuf-config.cmake file.
+ `VTUNE_DIR`: The path to the prefix of the VTune (or Intel compiler suite) installation.

Compilers, including CUDA compilers, are found using the default CMake mechanisms, as are OpenMP and MPI. Thus, the process of finding these tools can be manipulated using the usual CMake mechanisms and/or cache variables as [documented by CMake](https://cmake.org/documentation).

Except where otherwise noted, this list attempts to address the first level of dependencies of LBANN, that is, those that are one edge away in the DAG. If deeper dependency issues appear, please consult the documentation of the packages that are causing the issues, as they may require additional CMake/environment flags to be set before resolving properly.

### Example CMake invocation

A sample CMake build for LBANN might look like the following.
```bash
cmake \
  -D LBANN_WITH_CUDA:BOOL=ON \
  -D LBANN_WITH_NVPROF:BOOL=ON \
  -D LBANN_DATATYPE:STRING=float \
  -D Hydrogen_DIR:PATH=/path/to/hydrogen \
  -D HWLOC_DIR:PATH=/path/to/hwloc \
  /path/to/lbann
```

## Building an entire ecosystem with the "Superbuild"

__WARNING__: This is primarily for developer convenience and is not meant to be robust to all possible use-cases for LBANN.

LBANN includes CMake `ExternalProject` definitions for a large portion of its dependency graph. The following dependencies are supported. These are one or two edges from LBANN in the dependency DAG.

+ Aluminum
+ CNPY
+ CONDUIT
+ [CUB](https://github.com/nvlabs/cub). This is used by Hydrogen for efficiently managing GPU memory.
+ [HDF5](https://www.hdfgroup.org/solutions/hdf5). This is a dependency of CONDUIT.
+ Hydrogen
+ [JPEG-turbo](https://github.com/libjpeg-turbo/libjpeg-turbo). This is a dependency of OpenCV.
+ [OpenBLAS](https://github.com/xianyi/OpenBLAS.git). This is an optional dependency of Hydrogen. It is recommended if your system does not have a system-optimized BLAS distribution (e.g., Intel's MKL).
+ OpenCV
+ Protobuf

The following dependencies are known to exist but for some reason or another are not supported by the superbuild framework.

+ cuDNN is a freely available binary package from NVIDIA.
+ NCCL is a freely available binary package from NVIDIA. Inspired users may also build it from source from its [github repository](https://github.com/nvidia/nccl).
+ HWLOC is often installed by default, especially on large supercomputers. Certain components may require superuser access to configure, but these features are not used by LBANN. If it is not available, ask the system administrators, consult the package manager, install using Spack, or build from [source](https://www.open-mpi.org/projects/hwloc/).

The superbuild system is itself a CMake project rooted in `$LBANN_HOME/superbuild` (distinct from the LBANN CMake project rooted in `$LBANN_HOME`). Options that control the superbuild system are prefixed with `LBANN_SB_`; other options that appear in a CMake invocation for the superbuild are either interpreted on a sub-project basis or forwarded to certain sub-projects.
### Choosing packages to build in the Superbuild

The superbuild system is _constructive_ or _additive_; that is, it will only build the packages that it is asked to build. Any required package that is not requested is assumed to exist on the system by the time it is needed by whichever package requires it. For example, if HDF5 is provided by the system administrators on a system, it does not need to be built, and CONDUIT can be built by pointing its build at the system HDF5.

Packages are included in a superbuild by passing `LBANN_SB_BUILD_<PKG>` options to CMake _for each package_ that it should build, including LBANN itself. E.g.,
```bash
cmake \
  -DLBANN_SB_BUILD_ALUMINUM=ON \
  -DLBANN_SB_BUILD_HYDROGEN=ON \
  -DLBANN_SB_BUILD_LBANN=ON \
  /path/to/lbann/superbuild
```
will invoke the superbuild to build Aluminum, Hydrogen, and LBANN _only_. Acceptable values for `<PKG>` are `ALUMINUM`, `CNPY`, `CONDUIT`, `CUB`, `HDF5`, `HYDROGEN`, `JPEG_TURBO`, `OPENCV`, `PROTOBUF`, and `LBANN`.

### Forwarding options to sub-projects

The subprojects are largely pre-configured to "do the right thing" for building LBANN. However, there are some variables that users of the superbuild system may need to control. These are exposed as regular CMake options in the individual projects' CMakeLists and can be viewed by running, e.g.,

```bash
cmake -L superbuild/<PKG>/CMakeLists.txt
```

Several significant CMake flags are automatically forwarded from the superbuild CMake to subprojects. These are generally "typical" CMake flags (but not all; if something is missing, please open [an issue](https://github.com/llnl/lbann/issues)). Some examples are

+ `CMAKE_INSTALL_PREFIX`
+ `CMAKE_BUILD_TYPE`
+ `CMAKE_<LANG>_COMPILER`
+ `CMAKE_<LANG>_FLAGS`

To accommodate developers working on edge-cases with these dependencies, any flag may be forwarded to any CMake-built package using the following syntax: `LBANN_SB_FWD_<PKG>_<OPTION>`
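For example, a hypothetical invocation of this form, assuming the placeholder pattern `LBANN_SB_FWD_<PKG>_<OPTION>` reconstructed above and reusing Hydrogen's `Hydrogen_ENABLE_CUDA` option from the earlier example, might look like this sketch:
```bash
# Hypothetical: build Hydrogen via the superbuild and forward one of
# Hydrogen's own CMake options through the LBANN_SB_FWD_ prefix.
cmake \
  -D LBANN_SB_BUILD_HYDROGEN=ON \
  -D LBANN_SB_FWD_HYDROGEN_Hydrogen_ENABLE_CUDA=ON \
  /path/to/lbann/superbuild
```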