diff --git a/.gitignore b/.gitignore index f7fbdf6..2ae7281 100644 --- a/.gitignore +++ b/.gitignore @@ -33,9 +33,16 @@ wrench*/* # outputs +data/sgbatch/* tmp *.pdf +*.png *.csv +*.txt + +# logs and tests +logs +test # test/profiling outputs *.ipynb diff --git a/.vscode/settings.json b/.vscode/settings.json index f405ae6..1d410ae 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -62,8 +62,12 @@ "typeindex": "cpp", "typeinfo": "cpp", "valarray": "cpp", - "variant": "cpp" + "variant": "cpp", + "codecvt": "cpp", + "iomanip": "cpp", + "strstream": "cpp", + "cfenv": "cpp" }, "cmake.configureOnOpen": false, - "cmake.sourceDirectory": "${workspaceFolder}/sgbatch" + "cmake.sourceDirectory": "${workspaceFolder}" } \ No newline at end of file diff --git a/sgbatch/CMakeLists.txt b/CMakeLists.txt similarity index 81% rename from sgbatch/CMakeLists.txt rename to CMakeLists.txt index 204eaad..2cfd9c7 100644 --- a/sgbatch/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/") find_package(SimGrid REQUIRED) -find_package(Boost COMPONENTS program_options REQUIRED) +find_package(Boost COMPONENTS program_options regex REQUIRED) # include directories for dependencies and WRENCH libraries @@ -20,21 +20,28 @@ include_directories(src/ ${SimGrid_INCLUDE_DIR}/include /usr/local/include /opt/ set(SOURCE_FILES src/SimpleExecutionController.h src/SimpleExecutionController.cpp + src/SimpleSimulator.h src/SimpleSimulator.cpp src/JobSpecification.h + src/LRU_FileList.h + src/MonitorAction.h + src/MonitorAction.cpp + src/util/DefaultValues.h + src/util/Enums.h src/computation/CacheComputation.h src/computation/CacheComputation.cpp src/computation/StreamedComputation.h src/computation/StreamedComputation.cpp src/computation/CopyComputation.h src/computation/CopyComputation.cpp - src/SimpleSimulator.h - src/LRU_FileList.h ) # test files set(TEST_FILES src/JobSpecification.h + src/MonitorAction.h + src/util/DefaultValues.h + src/util/Enums.h src/computation/CacheComputation.h src/computation/StreamedComputation.h src/computation/CopyComputation.h @@ -44,32 +51,27 @@ set(TEST_FILES # wrench library and dependencies find_library(WRENCH_LIBRARY NAMES wrench) -find_library(PUGIXML_LIBRARY NAMES pugixml) find_library(GTEST_LIBRARY NAMES gtest) find_library(SimGrid_LIBRARY NAMES simgrid) # generating the executable -add_executable(sgbatch-sim ${SOURCE_FILES}) +add_executable(dc-sim ${SOURCE_FILES}) if (ENABLE_BATSCHED) -target_link_libraries(sgbatch-sim +target_link_libraries(dc-sim ${WRENCH_LIBRARY} - ${WRENCH_PEGASUS_WORKFLOW_PARSER_LIBRARY} ${SimGrid_LIBRARY} ${Boost_LIBRARIES} - ${PUGIXML_LIBRARY} -lzmq ) else() -target_link_libraries(sgbatch-sim +target_link_libraries(dc-sim ${WRENCH_LIBRARY} - ${WRENCH_PEGASUS_WORKFLOW_PARSER_LIBRARY} ${SimGrid_LIBRARY} ${Boost_LIBRARIES} - ${PUGIXML_LIBRARY} ) endif() -install(TARGETS sgbatch-sim DESTINATION bin) +install(TARGETS dc-sim DESTINATION bin) # generating unit tests add_executable(unit_tests EXCLUDE_FROM_ALL diff --git a/sgbatch/CMakeModules/FindSimGrid.cmake b/CMakeModules/FindSimGrid.cmake similarity index 100% rename from sgbatch/CMakeModules/FindSimGrid.cmake rename to CMakeModules/FindSimGrid.cmake diff --git a/README.md b/README.md index a926921..c4293d9 100644 --- a/README.md +++ b/README.md @@ -2,24 +2,91 @@ Simulator for the simulation of high energy physics workflows on distributed computing systems with caching. 
+
## Install instructions
-To install `git clone` this repository and execute the checkout script inside:
+
+### Option 1
+To get a fresh installation on your own system, `git clone` this repository and execute the checkout script inside it:
```bash
-source checkout.sh
+source checkout_scripts/checkout_with_sudo.sh
```
Mind that you will need super-user rights to do so, as well as having `cmake`, `git`, `clang` and `boost` installed on your system.
-This will install the executable `sgbatch-sim` for this simulator and all its software dependencies.
+This will install the executable `dc-sim` for this simulator and all its software dependencies.
+
+### Option 2
+Create a `conda` environment using the provided script:
+```bash
+checkout_scripts/install_conda_environment.sh
+```
+This automatically takes care of all required dependencies and includes them in the environment.
+
+To work with this environment interactively, you first have to initialize conda on your system. This can be done via:
+
+```bash
+<path-to-conda-installation>/bin/conda init
+```
+
+This adapts your `~/.bashrc` so that `conda` can be called directly. Afterwards, re-open your shell or `source ~/.bashrc`.
+
+To activate the environment, execute
+```bash
+conda activate dcsim-env
+```
+and deactivate it again with
+```bash
+conda deactivate
+```
+
+### Tips for Conda
+
+With a `conda` environment you can install the full software setup without super-user rights.
+More information on how to work with and develop in a `conda` environment can be found in the [Conda Documentation](https://docs.anaconda.com/).
+
+Furthermore, a complete conda environment can be packed into a tarball and exported to a different machine, e.g. a batch-system node. To do so, execute:
+
+```bash
+conda activate dcsim-env # in case you don't have it activated yet
+conda-pack
+```
+
+The created tarball `dcsim-env.tar.gz` can then be uploaded to a storage element and copied from there to a different machine.
+
## Usage
-When you have successfully installed the simulator you can run
+Once you have successfully installed the simulator or activated the conda environment, you can run
```bash
-sgbatch-sim --help
+dc-sim --help
```
-to see possible options.
+to see all possible execution options.
-Obligatory parameters are a platform file and a path and name for the resulting simulation output CSV-file:
+Mandatory parameters are a platform file and a path and name for the resulting simulation output CSV file:
```bash
-sgbatch-sim -p <platform-file> -o <output-file>
+dc-sim -p <platform-file> -o <output-file>
```
The platform file has to follow the [SimGrid-defined DTD](https://simgrid.org/doc/latest/Platform.html).
+Example files can be found in `data/platform-files`.
The output path can be any relative or absolute path on your file system where you are allowed to write.
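+For example, a simple test run with one of the shipped platform files (this mirrors the test command at the end of the conda installation script; the platform file and job count are only illustrative values) looks like:
+```bash
+dc-sim -p data/platform-files/sgbatch_scaletest.xml -o test.csv -n 60
+```
+Here `-n` sets the number of jobs to simulate and `-o` the path of the resulting output CSV file.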
+Instead of manually setting up all workflow parameters via command line options, +there is also the option to provide a JSON file, which contains all necessary information about a workflow by adding the option: +```bash +--workflow-configurations +``` +The workflow should contain the full information as it would be set via the command line, e.g.: +```json +{ + "name":"stream_and_compute_workflow", + "num_jobs": 60, + "infiles_per_job":10, + "average_flops":2164428000000, + "sigma_flops":216442800000, + "average_memory":2000000000, + "sigma_memory":200000000, + "average_infile_size":3600000000, + "sigma_infile_size":360000000, + "average_outfile_size":18000000000, + "sigma_outfile_size":1800000000, + "workflow_type":"streaming" +} +``` +It is also possible to give a list of workflow configuration files, which enables to simulate the execution of multiple workflows. +Example configuration files covering different workflow-types are given in `data/workflow-configs`. diff --git a/checkout.sh b/checkout_scripts/checkout_with_sudo.sh similarity index 58% rename from checkout.sh rename to checkout_scripts/checkout_with_sudo.sh index fb6e691..7ca6572 100755 --- a/checkout.sh +++ b/checkout_scripts/checkout_with_sudo.sh @@ -15,74 +15,93 @@ set -e this_file="$( [ ! -z "$ZSH_VERSION" ] && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" this_dir="$( cd "$( dirname "$this_file" )" && pwd )" +work_dir="$PWD" # checking out packages from git as prerequisites for WRENCH: # # 1) pugixml, docu: https://pugixml.org/docs/manual.html, git: https://github.com/zeux/pugixml echo "Installing C++ XML processing library pugixml..." -wget http://github.com/zeux/pugixml/releases/download/v1.11/pugixml-1.11.tar.gz -tar -xf pugixml-1.11.tar.gz -rm pugixml-1.11.tar.gz -pushd pugixml-1.11 +if [ ! -d "$work_dir/pugixml-1.12.1" ]; then + wget http://github.com/zeux/pugixml/releases/download/v1.12.1/pugixml-1.12.1.tar.gz + tar -xf pugixml-1.12.1.tar.gz + rm pugixml-1.12.1.tar.gz +fi +pushd pugixml-1.12 mkdir -p build cd build -cmake -DCMAKE_BUILD_TYPE=Debug .. +cmake .. +# cmake -DCMAKE_BUILD_TYPE=Debug .. make -j 6; sudo make install popd # 2) nlohmann json, docu: https://json.nlohmann.me/, git: https://github.com/nlohmann/json echo "Installing C++ JSON library..." -wget https://github.com/nlohmann/json/archive/refs/tags/v3.10.4.tar.gz -tar -xf v3.10.4.tar.gz -rm v3.10.4.tar.gz -pushd json-3.10.4 +if [ ! -d "$work_dir/json-3.11.2" ]; then + wget https://github.com/nlohmann/json/archive/refs/tags/v3.11.2.tar.gz + tar -xf v3.11.2.tar.gz + rm v3.11.2.tar.gz +fi +pushd json-3.11.2 mkdir -p build cd build -cmake -DCMAKE_BUILD_TYPE=Debug .. +cmake .. +# cmake -DCMAKE_BUILD_TYPE=Debug .. make -j 6; sudo make install popd # 3) googletest, docu & git: https://github.com/google/googletest echo "Installing C++ code testing library googletest..." -wget https://github.com/google/googletest/archive/refs/tags/release-1.11.0.tar.gz -tar -xf release-1.11.0.tar.gz -rm release-1.11.0.tar.gz -pushd googletest-release-1.11.0 +if [ ! -d "$work_dir/googletest-release-1.12.1" ]; then + wget https://github.com/google/googletest/archive/refs/tags/release-1.12.1.tar.gz + tar -xf release-1.12.1.tar.gz + rm release-1.12.1.tar.gz +fi +pushd googletest-release-1.12.1 mkdir -p build cd build -cmake -DCMAKE_BUILD_TYPE=Debug .. +cmake .. +# cmake -DCMAKE_BUILD_TYPE=Debug .. make -j 6; sudo make install popd # 4) simgrid, docu: https://simgrid.org/doc/latest/, git: https://framagit.org/simgrid/simgrid echo "Installing SimGrid..." 
-wget https://framagit.org/simgrid/simgrid/-/archive/v3.30/simgrid-v3.30.tar.gz -tar -xf simgrid-v3.30.tar.gz -rm simgrid-v3.30.tar.gz -pushd simgrid-v3.30 +if [ ! -d "$work_dir/simgrid-v3.32" ]; then + wget https://framagit.org/simgrid/simgrid/-/archive/v3.32/simgrid-v3.32.tar.gz + tar -xf simgrid-v3.32.tar.gz + rm simgrid-v3.32.tar.gz +fi +pushd simgrid-v3.32 mkdir -p build cd build -cmake -DCMAKE_BUILD_TYPE=Debug .. +cmake .. +# cmake -DCMAKE_BUILD_TYPE=Debug .. make -j 6; sudo make install popd # installing WRENCH 2.0: echo "Installing WRENCH..." -git clone --branch wrench-2.0 git@github.com:wrench-project/wrench.git -pushd wrench +if [ ! -d "$work_dir/wrench-2.1" ]; then + wget https://github.com/wrench-project/wrench/archive/refs/tags/v2.1.tar.gz + tar -xf v2.1.tar.gz + rm v2.1.tar.gz +fi +pushd wrench-2.1 mkdir -p build cd build -cmake -DCMAKE_BUILD_TYPE=Debug .. +cmake .. +# cmake -DCMAKE_BUILD_TYPE=Debug .. make -j 6; sudo make install # make -j 6 examples; sudo make install examples # needed additionally, since not done by default popd # install the sgbatch simulator echo "Installing the DistCacheSim simulator..." -pushd $this_dir/sgbatch +pushd $this_dir/../ mkdir -p build cd build -cmake -DCMAKE_BUILD_TYPE=Debug .. +cmake .. +# cmake -DCMAKE_BUILD_TYPE=Debug .. make -j 6; sudo make install popd diff --git a/checkout_scripts/install_conda_environment.sh b/checkout_scripts/install_conda_environment.sh new file mode 100755 index 0000000..181cf75 --- /dev/null +++ b/checkout_scripts/install_conda_environment.sh @@ -0,0 +1,109 @@ +#! /usr/bin/bash +ulimit -s unlimited + +NCORES=12 + +echo "INITIAL ENVIRONMENT START" +env +echo "INITIAL ENVIRONMENT END" +echo "" + +if [ -x "$(command -v conda)" ] +then + source $(pwd)/miniconda/bin/activate +else + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + chmod u+x Miniconda3-latest-Linux-x86_64.sh + ./Miniconda3-latest-Linux-x86_64.sh -b -p $(pwd)/miniconda + source $(pwd)/miniconda/bin/activate + conda config --set auto_activate_base false + conda config --add channels conda-forge + conda config --set channel_priority strict + conda update -n base conda -y + conda create -n dcsim-env cmake python=3.10 pip gcc gxx make gfortran boost git conda-pack -y + conda activate dcsim-env + python3 -m pip install pip setuptools numpy matplotlib scipy pandas --upgrade --no-input + conda env config vars set LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${CONDA_PREFIX}/lib64:${CONDA_PREFIX}/lib32 + conda env config vars set CMAKE_PREFIX_PATH=${CONDA_PREFIX} +fi + +conda activate dcsim-env + +echo "FINAL CONDA ENVIRONMENT START" +env +echo "FINAL CONDA ENVIRONMENT END" +echo "" + +echo "CONDA PACKAGES:" +conda list +echo "PIP PACKAGES:" +python -m pip list +echo "PIP PACKAGES OUTDATED:" +python -m pip list --outdated + +echo "INSTALLING SIMULATION SOFWARE PACKAGES" +mkdir -p CachingSimulation; cd CachingSimulation + +# pugixml +git clone https://github.com/zeux/pugixml.git +mkdir -p pugixml/build +pushd pugixml/build +git checkout tags/v1.12.1 +cmake -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} ../ +make -j${NCORES} +make install +popd + +# json +git clone https://github.com/nlohmann/json.git +mkdir -p json/build +pushd json/build +git checkout tags/v3.11.2 +cmake -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} ../ +make -j${NCORES} +make install +popd + +# googletest +git clone https://github.com/google/googletest.git +mkdir -p googletest/build +pushd googletest/build +git checkout tags/release-1.12.1 +cmake -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} ../ 
+make -j${NCORES} +make install +popd + +# simgrid +git clone https://framagit.org/simgrid/simgrid.git +mkdir -p simgrid/build +pushd simgrid/build +git checkout tags/v3.32 +cmake -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} ../ +make -j${NCORES} +make install +popd + +# wrench +git clone https://github.com/wrench-project/wrench.git +mkdir -p wrench/build +pushd wrench/build +git checkout tags/v.2.1 +cmake -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} ../ +make -j${NCORES} +make install +popd + +# DCSim +git clone https://github.com/HEPCompSim/DCSim.git +mkdir -p DCSim/build +pushd DCSim/build +git checkout extension/platform +cmake -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} ../ +make -j${NCORES} +make install +popd + +echo "RUNNING TEST COMMAND:" +cd DCSim +/usr/bin/time -v dc-sim -p data/platform-files/sgbatch_scaletest.xml -o test.csv -n 60 diff --git a/data/platform-files/ETPbatch.xml b/data/platform-files/ETPbatch.xml new file mode 100644 index 0000000..e74bc22 --- /dev/null +++ b/data/platform-files/ETPbatch.xml @@ -0,0 +1,250 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/platform-files/ETPbatch_reduced.xml b/data/platform-files/ETPbatch_reduced.xml new file mode 100644 index 0000000..6911ca2 --- /dev/null +++ b/data/platform-files/ETPbatch_reduced.xml @@ -0,0 +1,254 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/platform-files/ETPbatch_reduced_scaled.xml b/data/platform-files/ETPbatch_reduced_scaled.xml new file mode 100644 index 0000000..306fb50 --- /dev/null +++ b/data/platform-files/ETPbatch_reduced_scaled.xml @@ -0,0 +1,255 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/platform-files/ETPbatch_reduced_simple.xml b/data/platform-files/ETPbatch_reduced_simple.xml new file mode 100644 index 0000000..6822aa8 --- /dev/null +++ b/data/platform-files/ETPbatch_reduced_simple.xml @@ -0,0 +1,211 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/platform-files/ETPbatch_scaled.xml 
b/data/platform-files/ETPbatch_scaled.xml new file mode 100644 index 0000000..6254703 --- /dev/null +++ b/data/platform-files/ETPbatch_scaled.xml @@ -0,0 +1,254 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/platform-files/ETPbatch_simple.xml b/data/platform-files/ETPbatch_simple.xml new file mode 100644 index 0000000..181f904 --- /dev/null +++ b/data/platform-files/ETPbatch_simple.xml @@ -0,0 +1,210 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/platform-files/oneworker.xml b/data/platform-files/oneworker.xml new file mode 100644 index 0000000..dd5a108 --- /dev/null +++ b/data/platform-files/oneworker.xml @@ -0,0 +1,52 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/platform-files/sgbatch.xml b/data/platform-files/sgbatch.xml new file mode 100644 index 0000000..4a6d436 --- /dev/null +++ b/data/platform-files/sgbatch.xml @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/platform-files/sgbatch_nested.xml b/data/platform-files/sgbatch_nested.xml new file mode 100644 index 0000000..ace3f63 --- /dev/null +++ b/data/platform-files/sgbatch_nested.xml @@ -0,0 +1,93 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/sgbatch/data/platform-files/host_scaletest.xml b/data/platform-files/sgbatch_scaletest.xml similarity index 73% rename from sgbatch/data/platform-files/host_scaletest.xml rename to data/platform-files/sgbatch_scaletest.xml index 07cfd55..f04caa7 100644 --- a/sgbatch/data/platform-files/host_scaletest.xml +++ b/data/platform-files/sgbatch_scaletest.xml @@ -1,9 +1,14 @@ + + + + + @@ -11,6 +16,7 @@ + @@ -18,6 +24,7 @@ + @@ -25,39 +32,35 @@ + - + + + + - + - + - + - - - + diff --git a/data/platform-files/sgbatch_validation.xml b/data/platform-files/sgbatch_validation.xml new file mode 100644 index 0000000..0953109 --- /dev/null +++ b/data/platform-files/sgbatch_validation.xml @@ -0,0 +1,98 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/platform-files/twoworkers.xml b/data/platform-files/twoworkers.xml new file mode 100644 index 0000000..891becd --- /dev/null +++ b/data/platform-files/twoworkers.xml @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/workflow-configs/calc_workflow.json 
b/data/workflow-configs/calc_workflow.json new file mode 100644 index 0000000..0557a44 --- /dev/null +++ b/data/workflow-configs/calc_workflow.json @@ -0,0 +1,14 @@ +{ + "name":"calc_workflow", + "num_jobs": 60, + "infiles_per_job":0, + "average_flops":2164428000000, + "sigma_flops":216442800000, + "average_memory":2000000000, + "sigma_memory":200000000, + "average_infile_size":0, + "sigma_infile_size":0, + "average_outfile_size":18000000, + "sigma_outfile_size":1800000, + "workflow_type":"calculation" +} diff --git a/data/workflow-configs/copy_and_compute_workflow.json b/data/workflow-configs/copy_and_compute_workflow.json new file mode 100644 index 0000000..aba3b7c --- /dev/null +++ b/data/workflow-configs/copy_and_compute_workflow.json @@ -0,0 +1,14 @@ +{ + "name":"copy_and_compute_workflow", + "num_jobs": 60, + "infiles_per_job":10, + "average_flops":2164428000000, + "sigma_flops":216442800000, + "average_memory":2000000000, + "sigma_memory":200000000, + "average_infile_size":3600000000, + "sigma_infile_size":360000000, + "average_outfile_size":18000000000, + "sigma_outfile_size":1800000000, + "workflow_type":"copy" +} diff --git a/data/workflow-configs/mc_workflow.json b/data/workflow-configs/mc_workflow.json new file mode 100644 index 0000000..21174b3 --- /dev/null +++ b/data/workflow-configs/mc_workflow.json @@ -0,0 +1,14 @@ +{ + "name":"mc_workflow", + "num_jobs": 60, + "infiles_per_job":0, + "average_flops":2164428000000, + "sigma_flops":216442800000, + "average_memory":2000000000, + "sigma_memory":200000000, + "average_infile_size":0, + "sigma_infile_size":0, + "average_outfile_size":18000000000, + "sigma_outfile_size":1800000000, + "workflow_type":"calculation" +} diff --git a/data/workflow-configs/stream_and_compute_workflow.json b/data/workflow-configs/stream_and_compute_workflow.json new file mode 100644 index 0000000..5d5cb8c --- /dev/null +++ b/data/workflow-configs/stream_and_compute_workflow.json @@ -0,0 +1,14 @@ +{ + "name":"stream_and_compute_workflow", + "num_jobs": 60, + "infiles_per_job":10, + "average_flops":2164428000000, + "sigma_flops":216442800000, + "average_memory":2000000000, + "sigma_memory":200000000, + "average_infile_size":3600000000, + "sigma_infile_size":360000000, + "average_outfile_size":18000000000, + "sigma_outfile_size":1800000000, + "workflow_type":"streaming" +} diff --git a/sgbatch/data/platform-files/hosts.xml b/sgbatch/data/platform-files/hosts.xml deleted file mode 100644 index 7f728f0..0000000 --- a/sgbatch/data/platform-files/hosts.xml +++ /dev/null @@ -1,74 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/sgbatch/src/SimpleExecutionController.cpp b/sgbatch/src/SimpleExecutionController.cpp deleted file mode 100644 index 8fa8c7e..0000000 --- a/sgbatch/src/SimpleExecutionController.cpp +++ /dev/null @@ -1,301 +0,0 @@ -/** - * Copyright (c) 2020. . - * Generated with the wrench-init.in tool. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- */ -#include - -#include "SimpleExecutionController.h" -#include "JobSpecification.h" -#include "computation/StreamedComputation.h" -#include "computation/CopyComputation.h" - -XBT_LOG_NEW_DEFAULT_CATEGORY(simple_wms, "Log category for SimpleExecutionController"); - -/** - * @brief Create a SimpleExecutionController with a workload specification instance, a list of storage services and a list of compute services - * - * @param workload_spec: the workload specification - * @param htcondor_compute_service: an HTCondor compute service - * @param storage_services: set of storage services holding input files //! currently only remote storages needed - * @param hostname: host where the WMS runs - * @param outputdump_name: name of the file to dump simulation information - */ -SimpleExecutionController::SimpleExecutionController( - const std::map &workload_spec, - const std::set>& htcondor_compute_services, - const std::set>& storage_services, - const std::string& hostname, - const std::string& outputdump_name) : wrench::ExecutionController( - hostname, - "condor-simple") { - this->workload_spec = workload_spec; - this->htcondor_compute_services = htcondor_compute_services; - this->storage_services = storage_services; - this->filename = outputdump_name; -} - -/** - * @brief main method of the SimpleExecutionController daemon - * - * @return 0 on completion - * - * @throw std::runtime_error - */ -int SimpleExecutionController::main() { - - wrench::TerminalOutput::setThisProcessLoggingColor(wrench::TerminalOutput::COLOR_GREEN); - - /* initialize output-dump file */ - this->filedump.open(this->filename, ios::out | ios::trunc); - if (this->filedump.is_open()) { - this->filedump << "job.tag" << ",\t"; // << "job.ncpu" << ",\t" << "job.memory" << ",\t" << "job.disk" << ",\t"; - this->filedump << "machine.name" << ",\t"; - this->filedump << "job.start" << ",\t" << "job.end" << ",\t" << "job.computetime" << ",\t"; - this->filedump << "infiles.transfertime" << ",\t" << "infiles.size" << ",\t" << "outfiles.transfertime" << ",\t" << "outfiles.size" << std::endl; - this->filedump.close(); - - WRENCH_INFO("Wrote header of the output dump into file %s", this->filename.c_str()); - } - else { - throw std::runtime_error("Couldn't open output-file " + this->filename + " for dump!"); - } - - WRENCH_INFO("Starting on host %s", wrench::Simulation::getHostName().c_str()); - WRENCH_INFO("About to execute a workload of %lu jobs", workload_spec.size()); - - - // Create a job manager - this->job_manager = this->createJobManager(); - WRENCH_INFO("Created a job manager"); - - // Create a data movement manager - this->data_movement_manager = this->createDataMovementManager(); - WRENCH_INFO("Created a data manager"); - - - // Get the available compute services - // TODO: generalize to arbitrary numbers of HTCondorComputeServices - if (this->htcondor_compute_services.empty()) { - throw std::runtime_error("Aborting - No compute services available!"); - } - if (this->htcondor_compute_services.size() != 1) { - throw std::runtime_error("This example Simple HTCondor Scheduler requires a single compute service"); - } - - auto htcondor_compute_service = *(this->htcondor_compute_services.begin()); - WRENCH_INFO("Found %ld HTCondor Service(s) on %s", htcondor_compute_services.size(), htcondor_compute_service->getHostname().c_str()); - - - // Get the available storage services - // and split between workers and remote storages - std::set> worker_storage_services; - std::set> remote_storage_services; - for (auto storage : 
this->storage_services) { - std::string hostname = storage->getHostname(); - std::for_each(hostname.begin(), hostname.end(), [](char& c){c = std::tolower(c);}); - if (hostname.find("remote") != std::string::npos) { - remote_storage_services.insert(storage); - } else { - worker_storage_services.insert(storage); - } - } - // Check that the right remote_storage_service is passed for outputfile storage - // TODO: generalize to arbitrary numbers of remote storages - if (remote_storage_services.size() != 1) { - throw std::runtime_error("This example Simple Simulator requires a single remote_storage_service"); - } - auto remote_storage_service = *remote_storage_services.begin(); - WRENCH_INFO("Found %ld Remote Storage Service(s) on %s", remote_storage_services.size(), remote_storage_service->getHostname().c_str()); - - - // Create and submit all the jobs! - WRENCH_INFO("There are %ld jobs to schedule", this->workload_spec.size()); - for (auto job_name_spec: this->workload_spec) { - std::string job_name = job_name_spec.first; - auto job_spec = &this->workload_spec[job_name]; - - auto job = job_manager->createCompoundJob(job_name); - - // Combined read-input-file-and-run-computation actions - std::shared_ptr run_action; - if (! SimpleSimulator::use_blockstreaming) { - auto copy_computation = std::shared_ptr( - new CopyComputation(this->storage_services, job_spec->infiles, job_spec->total_flops) - ); - - run_action = job->addCustomAction( - "copycompute_" + job_name, - job_spec->total_mem, 1, - *copy_computation, - [](std::shared_ptr action_executor) { - WRENCH_INFO("Copy computation done") - } - ); - } else { - auto streamed_computation = std::shared_ptr( - new StreamedComputation(this->storage_services, job_spec->infiles, job_spec->total_flops) - ); - - run_action = job->addCustomAction( - "streaming_" + job_name, - job_spec->total_mem, 1, - *streamed_computation, - [](std::shared_ptr action_executor) { - WRENCH_INFO("Streaming computation done"); - // Do nothing - } - ); - } - - // Create the file write action - auto fw_action = job->addFileWriteAction( - "file_write_" + job_name, - job_spec->outfile, - job_spec->outfile_destination - ); - //TODO: Think of a determination of storage_service to hold output data - // auto fw_action = job->addCustomAction( - // "file_write_" + job_name, - // job_spec->total_mem, 0, - // [](std::shared_ptr action_executor) { - // // TODO: Which storage service should we write output on? - // // TODO: Probably random selection is fine, or just a fixed - // // TODO: one that's picked by the "user"? - // // TODO: Write the file at once - // }, - // [](std::shared_ptr action_executor) { - // WRENCH_INFO("Output file was successfully written!") - // // Do nothing - // } - // ); - - // Add necessary dependencies - job->addActionDependency(run_action, fw_action); - - // Submit the job for execution! - //TODO: generalize to arbitrary numbers of htcondor services - job_manager->submitJob(job, htcondor_compute_service); - WRENCH_INFO("Submitted job %s", job->getName().c_str()); - - } - - WRENCH_INFO("Done with creation/submission of all compound jobs"); - - - this->num_completed_jobs = 0; - while (this->num_completed_jobs != this->workload_spec.size()) { - // Wait for a workflow execution event, and process it - try { - this->waitForAndProcessNextEvent(); - } catch (wrench::ExecutionException &e) { - WRENCH_INFO("Error while getting next execution event (%s)... 
ignoring and trying again", (e.getCause()->toString().c_str())); - continue; - } - - if (this->abort || this->num_completed_jobs == this->workload_spec.size()) { - break; - } - } - - wrench::Simulation::sleep(10); - - WRENCH_INFO("--------------------------------------------------------") - if (this->num_completed_jobs == this->workload_spec.size()){ - WRENCH_INFO("Workload execution is complete!"); - } else{ - WRENCH_INFO("Workload execution is incomplete!") - } - - WRENCH_INFO("SimpleExecutionController daemon started on host %s terminating", wrench::Simulation::getHostName().c_str()); - - this->job_manager.reset(); - - return 0; -} - - -/** - * @brief Process a ExecutionEvent::COMPOUND_JOB_FAILURE - * Abort simulation once there is a failure. - * - * @param event: an execution event - */ -void SimpleExecutionController::processEventCompoundJobFailure(std::shared_ptr event) { - WRENCH_INFO("Notified that compound job %s has failed!", event->job->getName().c_str()); - WRENCH_INFO("Failure cause: %s", event->failure_cause->toString().c_str()); - WRENCH_INFO("As a SimpleExecutionController, I abort as soon as there is a failure"); - this->abort = true; -} - - -/** -* @brief Process a ExecutionEvent::COMPOUND_JOB_COMPLETION. -* This also writes out a dump of job information returned by the simulation. -* -* @param event: an execution event -*/ -void SimpleExecutionController::processEventCompoundJobCompletion(std::shared_ptr event) { - - /* Retrieve the job that this event is for */ - WRENCH_INFO("Notified that job %s with %ld actions has completed", event->job->getName().c_str(), event->job->getActions().size()); - - this->num_completed_jobs++; - - /* Figure out execution host. All actions run on the same host, so let's just pick an arbitrary one */ - std::string execution_host = (*(event->job->getActions().begin()))->getExecutionHistory().top().physical_execution_host; - - /* Remove all actions from memory and compute incremental output values in one loop */ - double incr_compute_time = 0.; - double incr_infile_transfertime = 0.; - double incr_infile_size = 0.; - double incr_outfile_transfertime = 0.; - double incr_outfile_size = 0.; - double start_date = DBL_MAX; - double end_date = 0; - - // Figure out timings - for (auto const &action : event->job->getActions()) { - double elapsed = action->getEndDate() - action->getStartDate(); - WRENCH_DEBUG("Running action: %s, elapsed in s: %.2f", action->getName().c_str(), elapsed); - start_date = std::min(start_date, action->getStartDate()); - end_date = std::max(end_date, action->getEndDate()); - // TODO: Better: Check for action type rather than doing string matching - if (action->getName().find("file_read_") != std::string::npos) { - incr_infile_transfertime += elapsed; - } else if (action->getName().find("copycompute_") != std::string::npos || action->getName().find("streaming_") != std::string::npos) { - incr_compute_time += elapsed; - } else if (action->getName().find("file_write_") != std::string::npos) { - incr_outfile_transfertime += elapsed; - } - } - - // Figure out file sizes - for (auto const &f : this->workload_spec[event->job->getName()].infiles) { - incr_infile_size += f->getSize(); - } - incr_outfile_size += this->workload_spec[event->job->getName()].outfile->getSize(); - - - /* Dump relevant information to file */ - this->filedump.open(this->filename, ios::out | ios::app); - if (this->filedump.is_open()) { - - this->filedump << event->job->getName() << ",\t"; //<< std::to_string(job->getMinimumRequiredNumCores()) << ",\t" << 
std::to_string(job->getMinimumRequiredMemory()) << ",\t" << /*TODO: find a way to get disk usage on scratch space */ << ",\t" ; - this->filedump << execution_host << ",\t"; - this->filedump << std::to_string(start_date) << ",\t" << std::to_string(end_date) << ",\t" << std::to_string(incr_compute_time) << ",\t" << std::to_string(incr_infile_transfertime) << ",\t" ; - this->filedump << std::to_string(incr_infile_size) << ",\t" << std::to_string(incr_outfile_transfertime) << ",\t" << std::to_string(incr_outfile_size) << std::endl; - - this->filedump.close(); - - WRENCH_INFO("Information for job %s has been dumped into file %s", event->job->getName().c_str(), this->filename.c_str()); - } - else { - throw std::runtime_error("Couldn't open output-file " + this->filename + " for dump!"); - } - -} diff --git a/sgbatch/src/SimpleSimulator.cpp b/sgbatch/src/SimpleSimulator.cpp deleted file mode 100644 index d5ce482..0000000 --- a/sgbatch/src/SimpleSimulator.cpp +++ /dev/null @@ -1,403 +0,0 @@ -/** - * Copyright (c) 2020. . - * Generated with the wrench-init.in tool. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - */ -#include -#include "SimpleSimulator.h" -#include "SimpleExecutionController.h" -#include "JobSpecification.h" - -#include -#include - -#include - -namespace po = boost::program_options; - -/** - * - * "Global" static variables. Some here are a bit ugly of course, but they should help - * with memory footprint by avoiding passing around / storing items that apply to - * all jobs. - */ -std::map, LRU_FileList> SimpleSimulator::global_file_map; -std::mt19937 SimpleSimulator::gen(42); // random number generator -bool SimpleSimulator::use_blockstreaming = true; // flag to chose between simulated job types: streaming or copy jobs -double SimpleSimulator::xrd_block_size = 1.*1000*1000*1000; // maximum size of the streamed file blocks in bytes for the XRootD-ish streaming -// TODO: The initialized below is likely bogus (at compile time?) 
-std::normal_distribution* SimpleSimulator::flops_dist; -std::normal_distribution* SimpleSimulator::mem_dist; -std::normal_distribution* SimpleSimulator::insize_dist; -std::normal_distribution* SimpleSimulator::outsize_dist; - - - - -/** - * @brief helper function to process simulation options and parameters - * - * @param argc - * @param argv - * - */ -po::variables_map process_program_options(int argc, char** argv) { - - // default values - double hitrate = 0.0; - - double average_flops = 2164.428*1000*1000*1000; - double sigma_flops = 0.1*average_flops; - double average_memory = 2.*1000*1000*1000; - double sigma_memory = 0.1*average_memory; - size_t infiles_per_job = 10; - double average_infile_size = 3600000000.; - double sigma_infile_size = 0.1*average_infile_size; - double average_outfile_size = 0.5*infiles_per_job*average_infile_size; - double sigma_outfile_size = 0.1*average_outfile_size; - - size_t duplications = 1; - - bool no_blockstreaming = false; - - double xrd_block_size = 1000.*1000*1000; - - po::options_description desc("Allowed options"); - desc.add_options() - ("help,h", "show brief usage message\n") - - ("platform,p", po::value()->value_name("")->required(), "platform description file, written in XML following the SimGrid-defined DTD") - ("hitrate,H", po::value()->default_value(hitrate), "initial fraction of staged input-files on caches at simulation start") - - ("njobs,n", po::value()->required(), "number of jobs to simulate") - ("flops", po::value()->default_value(average_flops), "amount of floating point operations jobs need to process") - ("sigma-flops", po::value()->default_value(sigma_flops), "jobs' distribution spread in FLOPS") - ("mem,m", po::value()->default_value(average_memory), "average size of memory needed for jobs to run") - ("sigma-mem", po::value()->default_value(sigma_memory), "jobs' sistribution spread in memory-needs") - ("ninfiles", po::value()->default_value(infiles_per_job), "number of input-files each job has to process") - ("insize", po::value()->default_value(average_infile_size), "average size of input-files jobs read") - ("sigma-insize", po::value()->default_value(sigma_infile_size), "jobs' distribution spread in input-file size") - ("outsize", po::value()->default_value(average_outfile_size), "average size of output-files jobs write") - ("sigma-outsize", po::value()->default_value(sigma_outfile_size), "jobs' distribution spread in output-file size") - - ("duplications,d", po::value()->default_value(duplications), "number of duplications of the workflow to feed into the simulation") - - ("no-streaming", po::bool_switch()->default_value(no_blockstreaming), "switch to turn on/off block-wise streaming of input-files") - - ("output-file,o", po::value()->value_name("")->required(), "path for the CSV file containing output information about the jobs in the simulation") - - ("xrd-blocksize,x", po::value()->default_value(xrd_block_size), "size of the blocks XRootD uses for data streaming") - ; - - po::variables_map vm; - po::store( - po::parse_command_line(argc, argv, desc), - vm - ); - - if (vm.count("help")) { - std::cerr << desc << std::endl; - exit(EXIT_SUCCESS); - } - - try { - po::notify(vm); - } catch (std::exception& e) { - std::cerr << "Error: " << e.what() << std::endl << std::endl; - std::cerr << desc << std::endl; - exit(EXIT_FAILURE); - } - - // Here, all options should be properly set - std::cerr << "Using platform " << vm["platform"].as() << std::endl; - - return vm; -} - - -/** - * @brief fill a Workflow consisting of jobs with job 
specifications, - * which include the inputfile and outputfile dependencies. - * It can be chosen between jobs streaming input data and perform computations simultaneously - * or jobs copying the full input-data and compute afterwards. - * - * @param num_jobs: number of tasks - * @param infiles_per_task: number of input-files each job processes - * @param average_flops: expectation value of the flops (truncated gaussian) distribution - * @param sigma_flops: std. deviation of the flops (truncated gaussian) distribution - * @param average_memory: expectation value of the memory (truncated gaussian) distribution - * @param sigma_memory: std. deviation of the memory (truncated gaussian) distribution - * @param average_infile_size: expectation value of the input-file size (truncated gaussian) distribution - * @param sigma_infile_size: std. deviation of the input-file size (truncated gaussian) distribution - * @param average_outfile_size: expectation value of the output-file size (truncated gaussian) distribution - * @param sigma_outfile_size: std. deviation of the output-file size (truncated gaussian) distribution - * @param duplications: number of duplications of the workflow to feed into the simulation - * - * @throw std::runtime_error - */ -std::map fill_streaming_workflow ( - size_t num_jobs, - size_t infiles_per_task, - double average_flops, double sigma_flops, - double average_memory, double sigma_memory, - double average_infile_size, double sigma_infile_size, - double average_outfile_size, double sigma_outfile_size, - size_t duplications -) { - - // map to store the workload specification - std::map workload; - - // Initialize random number generators - std::normal_distribution<> flops_dist(average_flops, sigma_flops); -// TODO: WHAT TO DO WITH MEMORY? - std::normal_distribution<> mem_dist(average_memory, sigma_memory); - std::normal_distribution<> insize_dist(average_infile_size, sigma_infile_size); - std::normal_distribution<> outsize_dist(average_outfile_size,sigma_outfile_size); - - for (size_t j = 0; j < num_jobs; j++) { - - // Create a job specification - JobSpecification job_specification; - - // Sample strictly positive task flops - double dflops = flops_dist(SimpleSimulator::gen); - while ((average_flops+sigma_flops) < dflops || dflops < 0.) dflops = flops_dist(SimpleSimulator::gen); - job_specification.total_flops = dflops; - - // Sample strictly positive task memory requirements - double dmem = mem_dist(SimpleSimulator::gen); - while ((average_memory+sigma_memory) < dmem || dmem < 0.) dmem = mem_dist(SimpleSimulator::gen); - job_specification.total_mem = dmem; - - for (size_t f = 0; f < infiles_per_task; f++) { - // Sample inputfile sizes - double dinsize = insize_dist(SimpleSimulator::gen); - while ((average_infile_size+3*sigma_infile_size) < dinsize || dinsize < 0.) dinsize = insize_dist(SimpleSimulator::gen); - job_specification.infiles.push_back(wrench::Simulation::addFile("infile_" + std::to_string(j) + "_" + std::to_string(f), dinsize)); - } - - // Sample outfile sizes - double doutsize = outsize_dist(SimpleSimulator::gen); - while ((average_outfile_size+3*sigma_outfile_size) < doutsize || doutsize < 0.) 
doutsize = outsize_dist(SimpleSimulator::gen); - job_specification.outfile = wrench::Simulation::addFile("outfile_" + std::to_string(j), doutsize); - - for (size_t d=0; d < duplications; d++) { - workload["job_" + std::to_string(j+d)] = job_specification; - } - } - return workload; -} - - -int main(int argc, char **argv) { - - // instantiate a simulation - auto simulation = wrench::Simulation::createSimulation(); - - // Initialization of the simulation - simulation->init(&argc, argv); - - /* Parsing of the command-line arguments for this WRENCH simulation */ - auto vm = process_program_options(argc, argv); - - // The first argument is the platform description file, written in XML following the SimGrid-defined DTD - std::string platform_file = vm["platform"].as(); - - // output-file name containing simulation information - std::string filename = vm["output-file"].as(); - - size_t num_jobs = vm["njobs"].as(); - size_t infiles_per_job = vm["ninfiles"].as(); - double hitrate = vm["hitrate"].as(); - - double average_flops = vm["flops"].as(); - double sigma_flops = vm["sigma-flops"].as(); - double average_memory = vm["mem"].as(); - double sigma_memory = vm["sigma-mem"].as(); - double average_infile_size = vm["insize"].as(); - double sigma_infile_size = vm["sigma-insize"].as(); - double average_outfile_size = vm["outsize"].as(); - double sigma_outfile_size = vm["sigma-outsize"].as(); - - size_t duplications = vm["duplications"].as(); - - // Flags to turn on/off blockwise streaming of input-files - SimpleSimulator::use_blockstreaming = !(vm["no-streaming"].as()); - - // Set XRootD block size - SimpleSimulator::xrd_block_size = vm["xrd-blocksize"].as(); - - - /* Create a workload */ - std::cerr << "Constructing workload specification..." << std::endl; - - auto workload_spec = fill_streaming_workflow( - num_jobs, infiles_per_job, - average_flops, sigma_flops, - average_memory,sigma_memory, - average_infile_size, sigma_infile_size, - average_outfile_size, sigma_outfile_size, - duplications - ); - - std::cerr << "The workflow has " << std::to_string(num_jobs) << " jobs" << std::endl; - - - /* Read and parse the platform description file to instantiate a simulation platform */ - std::cerr << "Instantiating SimGrid platform..." << std::endl; - simulation->instantiatePlatform(platform_file); - - - /* Create storage and compute services and add them to the simulation */ - // Loop over vector of all the hosts in the simulated platform - std::vector hostname_list = simulation->getHostnameList(); - // Create a list of storage services that will be used by the WMS - std::set> storage_services; - // Split into cache storages - std::set> cache_storage_services; - // and a remote storage that is able to serve all file requests - std::set> remote_storage_services; - // Create a list of compute services that will be used by the HTCondorService - std::set> condor_compute_resources; - std::string wms_host = "WMSHost"; - for (std::vector::iterator hostname = hostname_list.begin(); hostname != hostname_list.end(); ++hostname) { - std::string hostname_transformed = *hostname; - std::for_each(hostname_transformed.begin(), hostname_transformed.end(), [](char& c){c = std::tolower(c);}); - // Instantiate storage services - // WMSHost doesn't need a StorageService - if (*hostname != wms_host) { - std::string storage_host = *hostname; - std::cerr << "Instantiating a SimpleStorageService on " << storage_host << "..." 
<< std::endl; - auto storage_service = simulation->add(new wrench::SimpleStorageService(storage_host, {"/"})); - if (hostname_transformed.find("remote") != std::string::npos) { - remote_storage_services.insert(storage_service); - } else { - cache_storage_services.insert(storage_service); - } - storage_services.insert(storage_service); - } - // Instantiate bare-metal compute-services - if ( - (*hostname != wms_host) && - (hostname_transformed.find("storage") == std::string::npos) - ) { - condor_compute_resources.insert( - simulation->add( - new wrench::BareMetalComputeService( - *hostname, - {std::make_pair( - *hostname, - std::make_tuple( - wrench::Simulation::getHostNumCores(*hostname), - wrench::Simulation::getHostMemoryCapacity(*hostname) - ) - )}, - "" - ) - ) - ); - } - } - - // Instantiate a HTcondorComputeService and add it to the simulation - std::set> htcondor_compute_services; - htcondor_compute_services.insert(simulation->add( - new wrench::HTCondorComputeService( - wms_host, - condor_compute_resources, - { - {wrench::HTCondorComputeServiceProperty::NEGOTIATOR_OVERHEAD, "1.0"}, - {wrench::HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY, "10.0"}, - {wrench::HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "10.0"}, - {wrench::HTCondorComputeServiceProperty::NON_GRID_PRE_EXECUTION_DELAY, "5.0"}, - {wrench::HTCondorComputeServiceProperty::NON_GRID_POST_EXECUTION_DELAY, "5.0"} - }, - {} - ) - )); - - - - /* Instantiate a file registry service */ - std::cerr << "Instantiating a FileRegistryService on " << wms_host << "..." << std::endl; - auto file_registry_service = simulation->add(new wrench::FileRegistryService({wms_host})); - - - /* Instantiate an Execution Controller */ - auto wms = simulation->add( - new SimpleExecutionController( - workload_spec, - htcondor_compute_services, - //TODO: at this point only remote storage services should be sufficient - storage_services, - wms_host, - //hitrate, - filename - ) - ); - - /* Instantiate inputfiles and set outfile destinations*/ - // Check that the right remote_storage_service is passed for initial inputfile storage - // TODO: generalize to arbitrary numbers of remote storages - if (remote_storage_services.size() != 1) { - throw std::runtime_error("This example Simple Simulator requires a single remote_storage_service"); - } - auto remote_storage_service = *remote_storage_services.begin(); - - std::cerr << "Creating and staging input files plus set destination of output files..." 
<< std::endl; - try { - for (auto &job_name_spec: wms->get_workload_spec()) { - // job specifications - auto &job_spec = job_name_spec.second; - std::shuffle(job_spec.infiles.begin(), job_spec.infiles.end(), SimpleSimulator::gen); // Shuffle the input files - // Compute the task's incremental inputfiles size - double incr_inputfile_size = 0.; - for (auto const &f : job_spec.infiles) { - incr_inputfile_size += f->getSize(); - } - // Distribute the infiles on all caches until desired hitrate is reached - double cached_files_size = 0.; - for (auto const &f : job_spec.infiles) { - simulation->stageFile(f, remote_storage_service); - SimpleSimulator::global_file_map[remote_storage_service].touchFile(f); - if (cached_files_size < hitrate*incr_inputfile_size) { - for (const auto& cache : cache_storage_services) { - simulation->stageFile(f, cache); - SimpleSimulator::global_file_map[cache].touchFile(f); - } - cached_files_size += f->getSize(); - } - } - if (cached_files_size/incr_inputfile_size < hitrate) { - throw std::runtime_error("Desired hitrate was not reached!"); - } - - // Set outfile destinations - // TODO: Think of a way to generalize - job_spec.outfile_destination = wrench::FileLocation::LOCATION(remote_storage_service); - } - } catch (std::runtime_error &e) { - std::cerr << "Exception: " << e.what() << std::endl; - return 0; - } - - - /* Launch the simulation */ - std::cerr << "Launching the Simulation..." << std::endl; - try { - simulation->launch(); - } catch (std::runtime_error &e) { - std::cerr << "Exception: " << e.what() << std::endl; - return 0; - } - std::cerr << "Simulation done!" << std::endl; - - - return 0; -} - diff --git a/sgbatch/src/SimpleSimulator.h b/sgbatch/src/SimpleSimulator.h deleted file mode 100644 index 41dfc55..0000000 --- a/sgbatch/src/SimpleSimulator.h +++ /dev/null @@ -1,36 +0,0 @@ - - -#ifndef S_SIMPLESIMULATOR_H -#define S_SIMPLESIMULATOR_H - -#include "LRU_FileList.h" - -class SimpleSimulator { - -public: - std::set> storage_services; - - static bool use_blockstreaming; - static std::map, LRU_FileList> global_file_map; - static double xrd_block_size; - static std::mt19937 gen; - - // Flops distribution - static double mean_flops; - static double sigma_flops; - static std::normal_distribution* flops_dist; - // Memory distribution - static double mean_mem; - static double sigma_mem; - static std::normal_distribution* mem_dist; - // Input-file distribution - static double mean_insize; - static double sigma_insize; - static std::normal_distribution* insize_dist; - // Output-file distribution - static double mean_outsize; - static double sigma_outsize; - static std::normal_distribution* outsize_dist; -}; - -#endif //S_SIMPLESIMULATOR_H diff --git a/sgbatch/src/computation/CacheComputation.cpp b/sgbatch/src/computation/CacheComputation.cpp deleted file mode 100644 index 5f36ad7..0000000 --- a/sgbatch/src/computation/CacheComputation.cpp +++ /dev/null @@ -1,164 +0,0 @@ -#include - -XBT_LOG_NEW_DEFAULT_CATEGORY(cache_computation, "Log category for CacheComputation"); - -#include "CacheComputation.h" - -/** - * @brief Construct a new CacheComputation::CacheComputation object - * to be used within a compute action, which shall take caching of input-files into account. 
- * - * @param storage_services Storage services reachable to retrieve input files (caches plus remote) - * @param files Input files of the job to process - * @param total_flops Total #FLOPS of the whole compute action of the job - */ -CacheComputation::CacheComputation(std::set> &storage_services, - std::vector> &files, - double total_flops) { - this->storage_services = storage_services; - this->files = files; - this->total_flops = total_flops; - this->total_data_size = determineTotalDataSize(files); -} - -/** - * @brief Cache by the job required files on local host's storage service. - * Free space when needed according to an LRU scheme. - * - * TODO: Find some optimal sources serving and destinations providing files to jobs. - * TODO: Find solutions for possible race conditions, when several jobs require same files. - * - * @param hostname Name of the host, where the job runs - */ -void CacheComputation::determineFileSources(std::string hostname) { - // Identify all storage services that run on this host, which runs the streaming action - // TODO: HENRI QUESTION: IS IT REALLY THE CASE THERE ARE COULD BE MULTIPLE LOCAL STORAGE SERVICES??? - std::vector> matched_storage_services; - for (auto const &ss : this->storage_services) { - if (ss->getHostname() == hostname) { - matched_storage_services.push_back(ss); - } - } - - // TODO: right now, there are loopkupFile() calls, which simulate overhead. Could be replaced - // TODO: by a lookup of the SimpleExecutionController::global_file_map data structure in case - // TODO: simulating that overhead is not desired/necessary.Perhaps an option of the simulator? - - // For each file, identify where to read it from and/or deal with cache updates, etc. - for (auto const &f : this->files) { - // find a source providing the required file - std::shared_ptr source_ss; - // See whether the file is already available in a "local" storage service - for (auto const &ss : matched_storage_services) { - if (ss->lookupFile(f, wrench::FileLocation::LOCATION(ss))) { - source_ss = ss; - break; - } - } - // If yes, we're done - if (source_ss) { - SimpleSimulator::global_file_map[source_ss].touchFile(f); - this->file_sources[f] = wrench::FileLocation::LOCATION(source_ss); - continue; - } - // If not, then we have to copy the file from some source to some local storage service - // TODO: Find the optimal source, whatever that means (right now it's whichever one works first) - for (auto const &ss : this->storage_services) { - if (ss->lookupFile(f, wrench::FileLocation::LOCATION(ss))) { - source_ss = ss; - break; - } - } - if (!source_ss) { - throw std::runtime_error("CacheComputation(): Couldn't find file " + f->getID() + " on any storage service!"); - } else { - SimpleSimulator::global_file_map[source_ss].touchFile(f); - } - - // TODO: Find the optimal destination, whatever that means (right now it's random, with a bad RNG!) - // TODO: But then perhaps matched_storage_services.size() is always 1? (see QUESTION above) - auto destination_ss = matched_storage_services.at(rand() % matched_storage_services.size()); - - // Evict files while to create space, using an LRU scheme! 
- double free_space = destination_ss->getFreeSpace().begin()->second; - while (free_space < f->getSize()) { - auto to_evict = SimpleSimulator::global_file_map[destination_ss].removeLRUFile(); - WRENCH_INFO("Evicting file %s from storage service on host %s", - to_evict->getID().c_str(), destination_ss->getHostname().c_str()); - destination_ss->deleteFile(to_evict, wrench::FileLocation::LOCATION(destination_ss)); - free_space += to_evict->getSize(); - } - - - // Instead of doing this file copy right here, instantly create the file locally for next jobs - //? Alternative: Wait for computation to finish and copy file then - // TODO: Better idea perhaps: have the first job that streams the file update a counter - // TODO: of file blocks available at the storage service, and subsequent jobs - // TODO: can read a block only if it's available (e.g., by waiting on some - // TODO: condition variable, which is signaled by the first job each time it - // TODO: reads a block). - // wrench::StorageService::copyFile(f, wrench::FileLocation::LOCATION(source_ss), wrench::FileLocation::LOCATION(destination_ss)); - wrench::StorageService::createFile(f, wrench::FileLocation::LOCATION(destination_ss)); - - SimpleSimulator::global_file_map[destination_ss].touchFile(f); - - // this->file_sources[f] = wrench::FileLocation::LOCATION(destination_ss); - this->file_sources[f] = wrench::FileLocation::LOCATION(source_ss); - } -} - -//? Question for Henri: put this into determineFileSources function to prevent two times the same loop? -/** - * @brief Determine the incremental size of all input-files of a job - * - * @param files Input files of the job to consider - * @return double - */ -double CacheComputation::determineTotalDataSize(const std::vector> &files) { - double incr_file_size; - for (auto const &f : this->files) { - incr_file_size += f->getSize(); - } - return incr_file_size; -} - -/** - * @brief Functor operator to be usable as lambda in custom action - * - * @param action_executor - */ -void CacheComputation::operator () (std::shared_ptr action_executor) { - std::string hostname = action_executor->getHostname(); - - // Identify all file sources (and deal with caching, evictions, etc. - WRENCH_INFO("Determining file sources for streamed computation"); - this->determineFileSources(hostname); - - this->performComputation(hostname); - -} - -/** - * @brief Determine the share on the total number of FLOPS to be computed - * in the step processing a fraction of the full input data - * - * @param data_size Size of the input-data block considered - * @param total_data_size Total incremental size of all input-files - * @return double - */ -double CacheComputation::determineFlops(double data_size, double total_data_size) { - double flops = this->total_flops * data_size / total_data_size; - return flops; -} - -/** - * @brief Perform the computation within the simulation of the job - * - * @param hostname DEPRECATED: Actually not needed anymore - */ -void CacheComputation::performComputation(std::string &hostname) { - throw std::runtime_error( - "Base class CacheComputation has no performComputation implemented! \ - It is meant only as an placeholder. Use one of the derived classes for the compute action!" 
- ); -} diff --git a/sgbatch/src/computation/CopyComputation.cpp b/sgbatch/src/computation/CopyComputation.cpp deleted file mode 100644 index da9ddbb..0000000 --- a/sgbatch/src/computation/CopyComputation.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include - -XBT_LOG_NEW_DEFAULT_CATEGORY(copy_computation, "Log category for CopyComputation"); - -#include "CopyComputation.h" - -/** - * @brief Construct a new CopyComputation::CopyComputation object - * to be used within a compute action, which shall take caching of input-files into account. - * File read of all input-files and compute steps are performed sequentially. - * - * @param storage_services Storage services reachable to retrieve input files (caches plus remote) - * @param files Input files of the job to process - * @param total_flops Total #FLOPS of the whole compute action of the job - */ -CopyComputation::CopyComputation(std::set> &storage_services, - std::vector> &files, - double total_flops) : CacheComputation::CacheComputation( - storage_services, - files, - total_flops - ) {} - -/** - * @brief Perform the computation within the simulation of the job. - * First read all input-files and then compute the whole number of FLOPS. - * - * @param hostname DEPRECATED: Actually not needed anymore - */ -void CopyComputation::performComputation(std::string &hostname) { - WRENCH_INFO("Performing copy computation!"); - // Incremental size of all input files to process - double total_data_size = this->total_data_size; - // Read all input files before computation - double data_size = 0; - for (auto const &fs : this->file_sources) { - WRENCH_INFO("Reading file %s from storage service on host %s", - fs.first->getID().c_str(), fs.second->getStorageService()->getHostname().c_str()); - fs.second->getStorageService()->readFile(fs.first, fs.second); - data_size += fs.first->getSize(); - } - if (! (std::abs(data_size-total_data_size) < 1.)) { - throw std::runtime_error("Something went wrong in the data size computation!"); - } - // Perform the computation as needed - double flops = determineFlops(data_size, total_data_size); - WRENCH_INFO("Computing %.2lf flops", flops); - wrench::Simulation::compute(flops); -} - diff --git a/sgbatch/src/computation/CopyComputation.h b/sgbatch/src/computation/CopyComputation.h deleted file mode 100644 index f5f0747..0000000 --- a/sgbatch/src/computation/CopyComputation.h +++ /dev/null @@ -1,24 +0,0 @@ - - -#ifndef S_COPYCOMPUTATION_H -#define S_COPYCOMPUTATION_H - -#include - -#include "CacheComputation.h" - -class CopyComputation : public CacheComputation { - -public: - // TODO: REMOVE MOST THINGS IN HERE AND RELY ON THE GLOBALS IN SimpleSimulation::... - CopyComputation(std::set> &storage_services, - std::vector> &files, - double total_flops); - - void performComputation(std::string &hostname) override; - -private: - -}; - -#endif //S_COPYCOMPUTATION_H diff --git a/sgbatch/src/computation/StreamedComputation.cpp b/sgbatch/src/computation/StreamedComputation.cpp deleted file mode 100644 index 9b9cd83..0000000 --- a/sgbatch/src/computation/StreamedComputation.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include - -XBT_LOG_NEW_DEFAULT_CATEGORY(streamed_computation, "Log category for StreamedComputation"); - -#include "StreamedComputation.h" - - -/** - * @brief Construct a new StreamedComputation::StreamedComputation object - * to be used within a compute action, which shall take caching of input-files into account. 
- * File read is performed asynchronously in blocks and the according coompute step is executed - * once the corresponding block is available. - * - * @param storage_services Storage services reachable to retrieve input files (caches plus remote) - * @param files Input files of the job to process - * @param total_flops Total #FLOPS of the whole compute action of the job - */ -StreamedComputation::StreamedComputation(std::set> &storage_services, - std::vector> &files, - double total_flops) : CacheComputation::CacheComputation( - storage_services, - files, - total_flops - ) {} - -/** - * @brief Perform the computation within the simulation of the job. - * Asynchronously read the input files (don't wait for previous computation to finish) in blocks - * and compute the according share of FLOPS once read finished. - * - * @param hostname DEPRECATED: Actually not needed anymore - */ -void StreamedComputation::performComputation(std::string &hostname) { - WRENCH_INFO("Performing streamed computation!"); - // Incremental size of all input files to be processed - auto total_data_size = this->total_data_size; - for (auto const &fs : this->file_sources) { - WRENCH_INFO("Streaming computation for input file %s", fs.first->getID().c_str()); - double data_to_process = fs.first->getSize(); - - // Compute the number of blocks - int num_blocks = int(std::ceil(data_to_process / (double) SimpleSimulator::xrd_block_size)); - - // Read the first block - fs.second->getStorageService()->readFile(fs.first, fs.second, std::min(SimpleSimulator::xrd_block_size, data_to_process)); - - // Process next blocks: compute block i while reading block i+i - for (int i=0; i < num_blocks - 1; i++) { - double num_bytes = std::min(SimpleSimulator::xrd_block_size, data_to_process); - double num_flops = determineFlops(num_bytes, total_data_size); -// WRENCH_INFO("Chunk: %.2lf bytes / %.2lf flops", num_bytes, num_flops); - // Start the computation asynchronously - simgrid::s4u::ExecPtr exec = simgrid::s4u::this_actor::exec_init(num_flops); - exec->start(); - // Read data from the file - fs.second->getStorageService()->readFile(fs.first, fs.second, num_bytes); - // Wait for the computation to be done - exec->wait(); - data_to_process -= num_bytes; - } - - // Process last block - double num_flops = determineFlops(std::min(SimpleSimulator::xrd_block_size, data_to_process), total_data_size); - simgrid::s4u::ExecPtr exec = simgrid::s4u::this_actor::exec_init(num_flops); - exec->start(); - exec->wait(); - - } - -} - diff --git a/sgbatch/src/computation/StreamedComputation.h b/sgbatch/src/computation/StreamedComputation.h deleted file mode 100644 index 5f4e264..0000000 --- a/sgbatch/src/computation/StreamedComputation.h +++ /dev/null @@ -1,24 +0,0 @@ - - -#ifndef S_STREAMEDCOMPUTATION_H -#define S_STREAMEDCOMPUTATION_H - -#include - -#include "CacheComputation.h" - -class StreamedComputation : public CacheComputation { - -public: - // TODO: REMOVE MOST THINGS IN HERE AND RELY ON THE GLOBALS IN SimpleSimulation::... 
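The streamed variant above overlaps reading block i+1 with computing on block i, and gives each block a share of the job's FLOPS proportional to its size (see determineFlops). Below is a minimal stand-alone sketch of that pipelining pattern using std::async instead of the SimGrid actor calls; read() and compute() are hypothetical stand-ins for the simulated operations.

```cpp
// Sketch of the read/compute overlap used by the streamed workflow, expressed with
// plain std::async instead of simgrid::s4u::this_actor::exec_init()/wait().
#include <algorithm>
#include <functional>
#include <future>

void stream_blocks(double file_size, double block_size, double total_flops, double total_data_size,
                   const std::function<void(double)> &read,       // "read this many bytes"
                   const std::function<void(double)> &compute) {  // "compute this many FLOPS"
    double remaining = file_size;
    double current = std::min(block_size, remaining);
    read(current);                                      // first block has nothing to overlap with
    remaining -= current;
    while (remaining > 0.) {
        double flops = total_flops * current / total_data_size;   // per-block share, as in determineFlops()
        auto busy = std::async(std::launch::async, compute, flops);
        double next = std::min(block_size, remaining);  // read block i+1 while block i is processed
        read(next);
        busy.wait();
        remaining -= next;
        current = next;
    }
    compute(total_flops * current / total_data_size);   // last block
}
```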
- StreamedComputation(std::set> &storage_services, - std::vector> &files, - double total_flops); - - void performComputation(std::string &hostname) override; - -private: - -}; - -#endif //S_STREAMEDCOMPUTATION_H diff --git a/sgbatch/src/JobSpecification.h b/src/JobSpecification.h similarity index 88% rename from sgbatch/src/JobSpecification.h rename to src/JobSpecification.h index b0dd245..46a8f83 100644 --- a/sgbatch/src/JobSpecification.h +++ b/src/JobSpecification.h @@ -5,6 +5,8 @@ #include +#include "util/Enums.h" + /** * @brief Container to hold all job specific information * @@ -21,6 +23,8 @@ struct JobSpecification { double total_flops; // Memory consumption of the job double total_mem; + // Usage of block streaming + WorkflowType workflow_type; }; #endif //S_JOB_SPECIFICATION_H diff --git a/sgbatch/src/LRU_FileList.h b/src/LRU_FileList.h similarity index 71% rename from sgbatch/src/LRU_FileList.h rename to src/LRU_FileList.h index 88cf526..9fb56bf 100644 --- a/sgbatch/src/LRU_FileList.h +++ b/src/LRU_FileList.h @@ -13,7 +13,7 @@ class LRU_FileList { * * @param file */ - void touchFile(std::shared_ptr file) { + void touchFile(wrench::DataFile *file) { // If the file is new, then it's easy if (this->indexed_files.find(file) == this->indexed_files.end()) { this->indexed_files[file] = 0; @@ -38,14 +38,24 @@ class LRU_FileList { auto file = this->lru_list.back(); this->lru_list.pop_back(); this->indexed_files.erase(file); - return file; + return wrench::Simulation::getFileByID(file->getID()); } + /** + * @brief Checks whether a file is in the LRU list + * @param file : a data file + * @return true if the file is there, false otherwise + */ + bool hasFile(std::shared_ptr file) { + return (this->indexed_files.find(file.get()) != this->indexed_files.end()); + } + + private: // File collection mapped to index - std::map, ssize_t> indexed_files; + std::map indexed_files; // Ordered list of files in file collection -- front is most recently used. 
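LRU_FileList above keeps the most recently touched file at the front and evicts from the back. The following is a minimal, generic sketch of that bookkeeping idea (std::list plus an index map); it is not the class from this patch, which stores raw wrench::DataFile pointers.

```cpp
// Generic LRU bookkeeping sketch: touch() moves an entry to the front,
// evict() removes and returns the least recently used entry.
#include <list>
#include <string>
#include <unordered_map>

class LruTracker {
public:
    void touch(const std::string &id) {
        auto it = index_.find(id);
        if (it != index_.end()) order_.erase(it->second);  // drop the old position
        order_.push_front(id);                             // now most recently used
        index_[id] = order_.begin();
    }
    std::string evict() {                                  // assumes !empty(); caller deletes the file
        std::string victim = order_.back();
        order_.pop_back();
        index_.erase(victim);
        return victim;
    }
    bool has(const std::string &id) const { return index_.count(id) > 0; }
    bool empty() const { return order_.empty(); }

private:
    std::list<std::string> order_;                         // front = most recently used
    std::unordered_map<std::string, std::list<std::string>::iterator> index_;
};
```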
- std::vector> lru_list; + std::vector lru_list; }; diff --git a/src/MonitorAction.cpp b/src/MonitorAction.cpp new file mode 100644 index 0000000..656d249 --- /dev/null +++ b/src/MonitorAction.cpp @@ -0,0 +1,21 @@ +#include "MonitorAction.h" + +/** + * @brief Constructor that adds some more parameters for monitoring purposes + */ +MonitorAction::MonitorAction( + const std::string &name, + double ram, + unsigned long num_cores, + const std::function action_executor)> &lambda_execute, + const std::function action_executor)> &lambda_terminate +) : CustomAction( + name, ram, num_cores, + std::move(lambda_execute), + std::move(lambda_terminate) +) { + this->calculation_time = DefaultValues::UndefinedDouble; + this->infile_transfer_time = DefaultValues::UndefinedDouble; + // this->outfile_transfer_time = 0.; + this->hitrate = DefaultValues::UndefinedDouble; +} diff --git a/src/MonitorAction.h b/src/MonitorAction.h new file mode 100644 index 0000000..0abad8a --- /dev/null +++ b/src/MonitorAction.h @@ -0,0 +1,66 @@ + + +#ifndef MY_CACHE_COMPUTE_ACTION_H +#define MY_CACHE_COMPUTE_ACTION_H + +#include +#include "util/DefaultValues.h" + +/** + * @brief Extension of CustomAction to monitor job execution + */ +class MonitorAction : public wrench::CustomAction { +public: + /** + * @brief Constructor that adds some more parameters for monitoring purposes + */ + MonitorAction( + const std::string &name, + double ram, + unsigned long num_cores, + const std::function action_executor)> &lambda_execute, + const std::function action_executor)> &lambda_terminate + ); + + double get_infile_transfer_time() { + return infile_transfer_time; + } + double get_calculation_time() { + return calculation_time; + } + // double get_outfile_transfer_time() { + // return outfile_transfer_time; + // } + double get_hitrate() { + return hitrate; + } + + void set_infile_transfer_time(double value) { + this->infile_transfer_time = value; + } + void set_calculation_time(double value) { + this->calculation_time = value; + } + // void set_outfile_transfer_time(double value) { + // this->outfile_transfer_time = value; + // } + void set_hitrate(double value) { + this->hitrate = value; + } + +protected: + /** @brief Attribute monitoring accumulated transfer-time of input-files. + * Non-zero for jobs where infile-read and compute steps are separated. */ + double infile_transfer_time; + /** @brief Attribute monitoring the accumulated computation time (CPU time).*/ + double calculation_time; + // /** @brief Atrribute monitoring accumulated transfer-time of output files. + // * Currently not in use. */ + // double outfile_transfer_time; // transfer time for output files + /** @brief Attribute monitoring fraction of input-files read from cache. + * This might be dependent on the cache definition. */ + double hitrate; + +}; + +#endif //MY_SIMPLE_EXECUTION_CONTROLLER_H \ No newline at end of file diff --git a/src/SimpleExecutionController.cpp b/src/SimpleExecutionController.cpp new file mode 100644 index 0000000..e47aeea --- /dev/null +++ b/src/SimpleExecutionController.cpp @@ -0,0 +1,361 @@ +/** + * Copyright (c) 2020. . + * Generated with the wrench-init.in tool. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ */ +#include +#include "util/DefaultValues.h" + +#include "SimpleExecutionController.h" +#include "JobSpecification.h" +#include "computation/StreamedComputation.h" +#include "computation/CopyComputation.h" +#include "MonitorAction.h" + +XBT_LOG_NEW_DEFAULT_CATEGORY(simple_wms, "Log category for SimpleExecutionController"); + + +/** + * @brief A simple ExecutionController building jobs from job-specifications, + * submitting them and monitoring their execution + * + * @param workload_spec collection of job specifications + * @param htcondor_compute_services collection of HTCondorComputeServices submitting jobs + * @param grid_storage_services GRID storages holding files "for ever" + * @param cache_storage_services local caches evicting files when needed + * @param hostname host running the execution controller + * @param outputdump_name name of the file where the simulation's job information is stored + * + */ +SimpleExecutionController::SimpleExecutionController( + const std::map& workload_spec, + const std::set>& htcondor_compute_services, + const std::set>& grid_storage_services, + const std::set>& cache_storage_services, + const std::string& hostname, + const std::string& outputdump_name) : wrench::ExecutionController( + hostname, + "condor-simple") { + this->workload_spec = workload_spec; + this->htcondor_compute_services = htcondor_compute_services; + this->grid_storage_services = grid_storage_services; + this->cache_storage_services = cache_storage_services; + this->filename = outputdump_name; +} + +/** + * @brief main method of the SimpleExecutionController daemon + * + * @return 0 on completion + * + * @throw std::runtime_error + */ +int SimpleExecutionController::main() { + + wrench::TerminalOutput::setThisProcessLoggingColor(wrench::TerminalOutput::COLOR_GREEN); + + /* initialize output-dump file */ + this->filedump.open(this->filename, ios::out | ios::trunc); + if (this->filedump.is_open()) { + this->filedump << "job.tag" << ", "; // << "job.ncpu" << ", " << "job.memory" << ", " << "job.disk" << ", "; + this->filedump << "machine.name" << ", "; + this->filedump << "hitrate" << ", "; + this->filedump << "job.start" << ", " << "job.end" << ", " << "job.computetime" << ", "; + this->filedump << "infiles.transfertime" << ", " << "infiles.size" << ", " << "outfiles.transfertime" << ", " << "outfiles.size" << std::endl; + this->filedump.close(); + + WRENCH_INFO("Wrote header of the output dump into file %s", this->filename.c_str()); + } + else { + throw std::runtime_error("Couldn't open output-file " + this->filename + " for dump!"); + } + + WRENCH_INFO("Starting on host %s", wrench::Simulation::getHostName().c_str()); + WRENCH_INFO("About to execute a workload of %lu jobs", this->workload_spec.size()); + + + // Create a job manager + this->job_manager = this->createJobManager(); + WRENCH_INFO("Created a job manager"); + + // Create a data movement manager + this->data_movement_manager = this->createDataMovementManager(); + WRENCH_INFO("Created a data manager"); + + + // Get the available compute services + // TODO: generalize to arbitrary numbers of HTCondorComputeServices + if (this->htcondor_compute_services.empty()) { + throw std::runtime_error("Aborting - No compute services available!"); + } + if (this->htcondor_compute_services.size() != 1) { + throw std::runtime_error("This execution controller running on " + this->getHostname() + " requires a single HTCondorCompute service"); + } + WRENCH_INFO("Found %ld HTCondor Service(s) on:", this->htcondor_compute_services.size()); + 
for (auto htcondor_compute_service: this->htcondor_compute_services) { + WRENCH_INFO("\t%s", htcondor_compute_service->getHostname().c_str()); + } + auto htcondor_compute_service = *this->htcondor_compute_services.begin(); + + + // Create and submit all the jobs! + WRENCH_INFO("There are %ld jobs to schedule", this->workload_spec.size()); + for (auto job_name_spec: this->workload_spec) { + std::string job_name = job_name_spec.first; + auto job_spec = &this->workload_spec[job_name]; + + auto job = job_manager->createCompoundJob(job_name); + + // Combined read-input-file-and-run-computation actions + std::shared_ptr run_action; + std::shared_ptr compute_action; + if (job_spec->workflow_type == WorkflowType::Copy) { + auto copy_computation = std::shared_ptr( + new CopyComputation(this->cache_storage_services, this->grid_storage_services, job_spec->infiles, job_spec->total_flops) + ); + + //? Split this into a caching file read and a standard compute action? + run_action = std::make_shared( + "copycompute_" + job_name, + job_spec->total_mem, 1, + *copy_computation, + [](std::shared_ptr action_executor) { + WRENCH_INFO("Copy computation terminating") + } + ); + job->addCustomAction(run_action); + } + else if (job_spec->workflow_type == WorkflowType::Streaming){ + auto streamed_computation = std::shared_ptr( + new StreamedComputation(this->cache_storage_services, this->grid_storage_services, job_spec->infiles, job_spec->total_flops, SimpleSimulator::prefetching_on) + ); + + run_action = std::make_shared( + "streaming_" + job_name, + job_spec->total_mem, 1, + *streamed_computation, + [](std::shared_ptr action_executor) { + WRENCH_INFO("Streaming computation terminating"); + // Do nothing + } + ); + job->addCustomAction(run_action); + } + else if (job_spec->workflow_type == WorkflowType::Calculation) { + // TODO: figure out what is the best value for the ability tp parallelize HEP workflows on a CPU. Setting currently to 1.0. + compute_action = job->addComputeAction("calculation_" + job_name,job_spec->total_flops, job_spec->total_mem, 1, 1, wrench::ParallelModel::CONSTANTEFFICIENCY(1.0)); + } + + // Create the file write action + auto fw_action = job->addFileWriteAction( + "file_write_" + job_name, + job_spec->outfile, + job_spec->outfile_destination + ); + // //TODO: Think of a determination of storage_service to hold output data + // // auto fw_action = job->addCustomAction( + // // "file_write_" + job_name, + // // job_spec->total_mem, 0, + // // [](std::shared_ptr action_executor) { + // // // TODO: Which storage service should we write output on? + // // // TODO: Probably random selection is fine, or just a fixed + // // // TODO: one that's picked by the "user"? + // // // TODO: Write the file at once + // // }, + // // [](std::shared_ptr action_executor) { + // // WRENCH_INFO("Output file was successfully written!") + // // // Do nothing + // // } + // // ); + + // // Add necessary dependencies + if (job_spec->workflow_type == WorkflowType::Streaming || job_spec->workflow_type == WorkflowType::Copy) { + job->addActionDependency(run_action, fw_action); + } + else if (job_spec->workflow_type == WorkflowType::Calculation) { + job->addActionDependency(compute_action, fw_action); + } + + // Submit the job for execution! 
+ //TODO: generalize to arbitrary numbers of htcondor services + job_manager->submitJob(job, htcondor_compute_service); + WRENCH_INFO("Submitted job %s", job->getName().c_str()); + + } + + WRENCH_INFO( + "Job manager %s: Done with creation/submission of all compound jobs on host %s", + job_manager->getName().c_str(), job_manager->getHostname().c_str() + ); + + + this->num_completed_jobs = 0; + while (this->workload_spec.size() > 0) { + // Wait for a workflow execution event, and process it + try { + this->waitForAndProcessNextEvent(); + } catch (wrench::ExecutionException &e) { + WRENCH_INFO("Error while getting next execution event (%s)... ignoring and trying again", (e.getCause()->toString().c_str())); + continue; + } + + if (this->abort || this->workload_spec.size() == 0) { + break; + } + } + + wrench::Simulation::sleep(10); + + WRENCH_INFO("--------------------------------------------------------") + if (this->workload_spec.size() == 0){ + WRENCH_INFO("Workload execution on %s is complete!", this->getHostname().c_str()); + } else{ + WRENCH_INFO("Workload execution on %s is incomplete!", this->getHostname().c_str()); + } + + WRENCH_INFO("SimpleExecutionController daemon started on host %s terminating", wrench::Simulation::getHostName().c_str()); + + this->job_manager.reset(); + + return 0; +} + + +/** + * @brief Process a ExecutionEvent::COMPOUND_JOB_FAILURE + * Abort simulation once there is a failure. + * + * @param event: an execution event + */ +void SimpleExecutionController::processEventCompoundJobFailure(std::shared_ptr event) { + WRENCH_INFO("Notified that compound job %s has failed!", event->job->getName().c_str()); + WRENCH_INFO("Failure cause: %s", event->failure_cause->toString().c_str()); + WRENCH_INFO("As a SimpleExecutionController, I abort as soon as there is a failure"); + this->abort = true; +} + + +/** +* @brief Process a ExecutionEvent::COMPOUND_JOB_COMPLETION. +* This also writes out a dump of job information returned by the simulation. +* +* @param event: an execution event +*/ +void SimpleExecutionController::processEventCompoundJobCompletion(std::shared_ptr event) { + + /* Retrieve the job that this event is for */ + WRENCH_INFO("Notified that job %s with %ld actions has completed", event->job->getName().c_str(), event->job->getActions().size()); + + this->num_completed_jobs++; + + /* Figure out execution host. All actions run on the same host, so let's just pick an arbitrary one */ + std::string execution_host = (*(event->job->getActions().begin()))->getExecutionHistory().top().physical_execution_host; + + /* Remove all actions from memory and compute incremental output values in one loop */ + double incr_compute_time = DefaultValues::UndefinedDouble; + double incr_infile_transfertime = 0.; + double incr_infile_size = 0.; + double incr_outfile_transfertime = 0.; + double incr_outfile_size = 0.; + double global_start_date = DBL_MAX; + double global_end_date = DBL_MIN; + double hitrate = DefaultValues::UndefinedDouble; + + bool found_computation_action = false; + + // Figure out timings + for (auto const &action : event->job->getActions()) { + double start_date = action->getStartDate(); + double end_date = action->getEndDate(); + global_start_date = std::min(global_start_date, start_date); + global_end_date = std::max(global_end_date, end_date); + if (start_date < 0. || end_date < 0.) { + throw std::runtime_error( + "Start date " + std::to_string(start_date) + + " or end date " + std::to_string(end_date) + + " of action " + action->getName() + " out of scope!" 
+ ); + } + double elapsed = end_date - start_date; + WRENCH_DEBUG("Analyzing action: %s, started in s: %.2f, ended in s: %.2f, elapsed in s: %.2f", action->getName().c_str(), start_date, end_date, elapsed); + + if (auto file_read_action = std::dynamic_pointer_cast(action)) { + incr_infile_transfertime += elapsed; + } else if (auto monitor_action = std::dynamic_pointer_cast(action)) { + if (found_computation_action) { + throw std::runtime_error("There was more than one computation action in job " + event->job->getName()); + } + found_computation_action = true; + if (incr_infile_transfertime <= 0. && incr_compute_time < 0. && hitrate < 0.) { + incr_infile_transfertime = monitor_action->get_infile_transfer_time(); + incr_compute_time = monitor_action->get_calculation_time(); + hitrate = monitor_action->get_hitrate(); + } else { + throw std::runtime_error( + "Some of the job information for action " + monitor_action->getName() + + " has already been filled. Abort!" + ); + } + } else if (auto file_write_action = std::dynamic_pointer_cast(action)) { + if (end_date >= start_date) { + incr_outfile_transfertime += end_date - start_date; + } else { + throw std::runtime_error( + "Writing outputfile " + this->workload_spec[event->job->getName()].outfile->getID() + + " for job " + event->job->getName() + " finished before start!" + ); + } + } + else if (auto compute_action = std::dynamic_pointer_cast(action)) { + if (end_date >= start_date) { + if(incr_compute_time == DefaultValues::UndefinedDouble){ + incr_compute_time = end_date - start_date; + } + else { + incr_compute_time += end_date - start_date; + } + } + else { + throw std::runtime_error( + "Computation for job " + event->job->getName() + " finished before start!" + ); + } + } + } + + // Figure out file sizes + for (auto const &f : this->workload_spec[event->job->getName()].infiles) { + incr_infile_size += f->getSize(); + } + incr_outfile_size += this->workload_spec[event->job->getName()].outfile->getSize(); + + //? Remove job from containers like this? 
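For orientation, the quantities collected in this loop are dumped as one CSV row per job just below, using the column names written in the header earlier. A hypothetical post-processing step, with made-up numbers, could derive wall time and a rough CPU efficiency from such a row:

```cpp
// Hypothetical consumer of one output row; the values below are invented for illustration.
#include <iostream>

int main() {
    double job_start = 120.0, job_end = 980.0;   // "job.start", "job.end" (simulated seconds)
    double compute_time = 640.0;                 // "job.computetime"
    double infile_transfer_time = 150.0;         // "infiles.transfertime"

    double wall_time = job_end - job_start;
    std::cout << "wall time: " << wall_time << " s, "
              << "CPU efficiency: " << compute_time / wall_time << ", "
              << "input transfer share: " << infile_transfer_time / wall_time << std::endl;
    return 0;
}
```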
+ this->workload_spec.erase(event->job->getName()); + + /* Dump relevant information to file */ + this->filedump.open(this->filename, ios::out | ios::app); + if (this->filedump.is_open()) { + + this->filedump << event->job->getName() << ", "; + // << std::to_string(job->getMinimumRequiredNumCores()) << ", " + // << std::to_string(job->getMinimumRequiredMemory()) << ", " + // << /*TODO: find a way to get disk usage on scratch space */ << ", "; + this->filedump << execution_host << ", " << hitrate << ", "; + this->filedump << std::to_string(global_start_date) << ", " << std::to_string(global_end_date) << ", "; + this->filedump << std::to_string(incr_compute_time) << ", "; + this->filedump << std::to_string(incr_infile_transfertime) << ", " << std::to_string(incr_infile_size) << ", " ; + this->filedump << std::to_string(incr_outfile_transfertime) << ", " << std::to_string(incr_outfile_size) << std::endl; + + this->filedump.close(); + + WRENCH_INFO("Information for job %s has been dumped into file %s", event->job->getName().c_str(), this->filename.c_str()); + } + else { + throw std::runtime_error("Couldn't open output-file " + this->filename + " for dump!"); + } + +} diff --git a/sgbatch/src/SimpleExecutionController.h b/src/SimpleExecutionController.h similarity index 85% rename from sgbatch/src/SimpleExecutionController.h rename to src/SimpleExecutionController.h index 58f2af7..26c2727 100644 --- a/sgbatch/src/SimpleExecutionController.h +++ b/src/SimpleExecutionController.h @@ -17,18 +17,18 @@ #include "JobSpecification.h" #include "LRU_FileList.h" +#include "util/Enums.h" + class Simulation; -/** - * @brief A simple ExecutionController implementation - */ class SimpleExecutionController : public wrench::ExecutionController { public: // Constructor SimpleExecutionController( const std::map &workload_spec, const std::set>& htcondor_compute_services, - const std::set>& storage_services, + const std::set>& grid_storage_services, + const std::set>& cache_storage_services, //const std::set>& network_proximity_services, //std::shared_ptr file_registry_service, const std::string& hostname, @@ -36,7 +36,11 @@ class SimpleExecutionController : public wrench::ExecutionController { const std::string& outputdump_name); std::map& get_workload_spec() { - return workload_spec; + return this->workload_spec; + } + + void set_workload_spec(std::map w) { + this->workload_spec = w; } @@ -47,7 +51,8 @@ class SimpleExecutionController : public wrench::ExecutionController { private: std::set> htcondor_compute_services; - std::set> storage_services; + std::set> grid_storage_services; + std::set> cache_storage_services; std::map workload_spec; diff --git a/src/SimpleSimulator.cpp b/src/SimpleSimulator.cpp new file mode 100644 index 0000000..932aed8 --- /dev/null +++ b/src/SimpleSimulator.cpp @@ -0,0 +1,819 @@ +/** + * Copyright (c) 2020. . + * Generated with the wrench-init.in tool. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + */ +#include +#include "SimpleSimulator.h" +#include "SimpleExecutionController.h" +#include "JobSpecification.h" + +#include "util/Enums.h" + +#include +#include + +#include +#include +#include +#include + + +namespace po = boost::program_options; + +/** + * + * "Global" static variables. 
Some here are a bit ugly of course, but they should help + * with memory footprint by avoiding passing around / storing items that apply to + * all jobs. + */ +const std::vector workflow_keys = { + "name", + "num_jobs","infiles_per_job", + "average_flops","sigma_flops", + "average_memory", "sigma_memory", + "average_infile_size", "sigma_infile_size", + "average_outfile_size", "sigma_outfile_size", + "workflow_type" + }; +std::map, LRU_FileList> SimpleSimulator::global_file_map; +std::mt19937 SimpleSimulator::gen(42); // random number generator +bool SimpleSimulator::infile_caching_on = true; // flag to turn off/on the caching of job input-files +bool SimpleSimulator::prefetching_on = true; // flag to enable prefetching during streaming +double SimpleSimulator::xrd_block_size = 1.*1000*1000*1000; // maximum size of the streamed file blocks in bytes for the XRootD-ish streaming +// TODO: The initialized below is likely bogus (at compile time?) +std::normal_distribution* SimpleSimulator::flops_dist; +std::normal_distribution* SimpleSimulator::mem_dist; +std::normal_distribution* SimpleSimulator::insize_dist; +std::normal_distribution* SimpleSimulator::outsize_dist; +std::set SimpleSimulator::cache_hosts; +std::set SimpleSimulator::storage_hosts; +std::set SimpleSimulator::worker_hosts; +std::set SimpleSimulator::scheduler_hosts; +std::set SimpleSimulator::executors; +std::set SimpleSimulator::file_registries; +std::set SimpleSimulator::network_monitors; +std::map> SimpleSimulator::hosts_in_zones; +bool SimpleSimulator::local_cache_scope = false; // flag to consider only local caches + + +/** + * @brief Simple Choices class for cache scope program option + * used as Custom Validator: https://www.boost.org/doc/libs/1_48_0/doc/html/program_options/howto.html#id2445062 + */ +struct cacheScope { + cacheScope(std::string const& val): value(val) {} + std::string value; +}; +/** + * @brief Operator<< for the cacheScope class + * + * @param os + * @param val + * @return std::ostream& + */ +std::ostream& operator<<(std::ostream &os, const cacheScope &val) { + os << val.value << " "; + return os; +} + +/** + * @brief Overload of boost::program_options validate method + * to check for custom validator classes + */ +void validate(boost::any& v, std::vector const& values, cacheScope* /* target_type */, int) { + using namespace boost::program_options; + + // Make sure no previous assignment to 'v' was made. + validators::check_first_occurrence(v); + + // Extract the first string from 'values'. If there is more than + // one string, it's an error, and exception will be thrown. 
+ std::string const& s = validators::get_single_string(values); + + if (s == "local" || s == "network" || s == "siblingnetwork") { + v = boost::any(cacheScope(s)); + } else { + throw validation_error(validation_error::invalid_option_value); + } +} + +/** + * @brief Simple Choices class for workflow type program option + * used as Custom Validator: https://www.boost.org/doc/libs/1_48_0/doc/html/program_options/howto.html#id2445062 + */ +struct WorkflowTypeStruct { + WorkflowTypeStruct(std::string const& val): value(boost::to_lower_copy(val)) {} + std::string value; + // getter function + WorkflowType get() const{ + return get_workflow_type(value); + } +}; + +/** + * @brief Operator<< for the WorkflowTypeStruct class + * + * @param os + * @param val + * @return std::ostream& + */ +std::ostream& operator<<(std::ostream &os, const WorkflowTypeStruct &val) { + os << val.value << " "; + return os; +} + +/** + * @brief Overload of boost::program_options validate method + * to check for custom validator classes + */ +void validate(boost::any& v, std::vector const& values, WorkflowTypeStruct* /* target_type */, int) { + using namespace boost::program_options; + + // Make sure no previous assignment to 'v' was made. + validators::check_first_occurrence(v); + + // Extract the first string from 'values'. If there is more than + // one string, it's an error, and exception will be thrown. + std::string const& s = validators::get_single_string(values); + + auto w = WorkflowTypeStruct(s); + try { + w.get(); + v = boost::any(w); + } + catch(std::runtime_error &e) { + throw validation_error(validation_error::invalid_option_value); + } +} + +/** + * @brief Simple Choices class for workflow type program option + * used as Custom Validator: https://www.boost.org/doc/libs/1_48_0/doc/html/program_options/howto.html#id2445062 + */ +struct StorageServiceBufferValue { + StorageServiceBufferValue(std::string const& val): value(boost::to_lower_copy(val)) {} + std::string value; + StorageServiceBufferType type; + // getter function + StorageServiceBufferType getType() const{ + return get_ssbuffer_type(value); + } + std::string get() const{ + return value; + } +}; + +/** + * @brief Operator<< for the StorageServiceBufferValue class + * + * @param os + * @param val + * @return std::ostream& + */ +std::ostream& operator<<(std::ostream &os, const StorageServiceBufferValue &val) { + os << val.value << " "; + return os; +} + +/** + * @brief Overload of boost::program_options validate method + * to check for custom validator classes + */ +void validate(boost::any& v, std::vector const& values, StorageServiceBufferValue* /* target_type */, int) { + using namespace boost::program_options; + + // Make sure no previous assignment to 'v' was made. + validators::check_first_occurrence(v); + + // Extract the first string from 'values'. If there is more than + // one string, it's an error, and exception will be thrown. 
+ std::string const& s = validators::get_single_string(values); + + auto ssp = StorageServiceBufferValue(s); + try { + ssp.getType(); + v = boost::any(ssp); + } + catch(std::runtime_error &e) { + throw validation_error(validation_error::invalid_option_value); + } +} + +/** + * @brief helper function to process simulation options and parameters + * + * @param argc + * @param argv + * + */ +po::variables_map process_program_options(int argc, char** argv) { + + // default values + double hitrate = 0.0; + + double average_flops = 2164.428*1000*1000*1000; + double sigma_flops = 0.1*average_flops; + double average_memory = 2.*1000*1000*1000; + double sigma_memory = 0.1*average_memory; + size_t infiles_per_job = 10; + double average_infile_size = 3600000000.; + double sigma_infile_size = 0.1*average_infile_size; + double average_outfile_size = 0.5*infiles_per_job*average_infile_size; + double sigma_outfile_size = 0.1*average_outfile_size; + + size_t duplications = 1; + + bool no_caching = false; + bool prefetch_off = false; + + double xrd_block_size = 1000.*1000*1000; + std::string storage_service_buffer_size = "1048576"; // 1MiB + + po::options_description desc("Allowed options"); + desc.add_options() + ("help,h", "show brief usage message\n") + + ("platform,p", po::value()->value_name("")->required(), "platform description file, written in XML following the SimGrid-defined DTD") + ("hitrate,H", po::value()->default_value(hitrate), "initial fraction of staged input-files on caches at simulation start") + + ("workflow-configurations", po::value>()->multitoken()->default_value(std::vector{}, ""), "List of paths to .json files with workflow configurations. Note that all job-specific commandline options will be ignored in case at least one configuration is provided.") + + ("njobs,n", po::value()->default_value(60), "number of jobs to simulate") + ("flops", po::value()->default_value(average_flops), "amount of floating point operations jobs need to process") + ("sigma-flops", po::value()->default_value(sigma_flops), "jobs' distribution spread in FLOPS") + ("mem,m", po::value()->default_value(average_memory), "average size of memory needed for jobs to run") + ("sigma-mem", po::value()->default_value(sigma_memory), "jobs' sistribution spread in memory-needs") + ("ninfiles", po::value()->default_value(infiles_per_job), "number of input-files each job has to process") + ("insize", po::value()->default_value(average_infile_size), "average size of input-files jobs read") + ("sigma-insize", po::value()->default_value(sigma_infile_size), "jobs' distribution spread in input-file size") + ("outsize", po::value()->default_value(average_outfile_size), "average size of output-files jobs write") + ("sigma-outsize", po::value()->default_value(sigma_outfile_size), "jobs' distribution spread in output-file size") + + ("duplications,d", po::value()->default_value(duplications), "number of duplications of the workflow to feed into the simulation") + + ("workflow-type", po::value()->default_value(WorkflowTypeStruct("streaming")), "switch to define the type of the workflow. 
Please choose from 'calculation', 'streaming', or 'copy'") + ("no-caching", po::bool_switch()->default_value(no_caching), "switch to turn on/off the caching of jobs' input-files") + ("prefetch-off", po::bool_switch()->default_value(prefetch_off), "switch to turn on/off prefetching for streaming of input-files") + + ("output-file,o", po::value()->value_name("")->required(), "path for the CSV file containing output information about the jobs in the simulation") + + ("xrd-blocksize,x", po::value()->default_value(xrd_block_size), "size of the blocks XRootD uses for data streaming") + ("storage-buffer-size,b", po::value()->default_value(StorageServiceBufferValue(storage_service_buffer_size)), "buffer size used by the storage services when communicating data") + + ("cache-scope", po::value()->default_value(cacheScope("local")), "Set the network scope in which caches can be found:\n local: only caches on same machine\n network: caches in same network zone\n siblingnetwork: also include caches in sibling networks") + ; + + po::variables_map vm; + po::store( + po::parse_command_line(argc, argv, desc), + vm + ); + + if (vm.count("help")) { + std::cerr << desc << std::endl; + exit(EXIT_SUCCESS); + } + + try { + po::notify(vm); + } catch (std::exception& e) { + std::cerr << "Error: " << e.what() << std::endl << std::endl; + std::cerr << desc << std::endl; + exit(EXIT_FAILURE); + } + + // Here, all options should be properly set + std::cerr << "Using platform " << vm["platform"].as() << std::endl; + + return vm; +} + + +/** + * @brief Fill a Workflow consisting of jobs with job specifications, + * which include the inputfile and outputfile dependencies. + * It can be chosen between jobs streaming input data and perform computations simultaneously + * or jobs copying the full input-data and compute afterwards. + * + * @param num_jobs: number of tasks + * @param infiles_per_task: number of input-files each job processes + * @param average_flops: expectation value of the flops (truncated gaussian) distribution + * @param sigma_flops: std. deviation of the flops (truncated gaussian) distribution + * @param average_memory: expectation value of the memory (truncated gaussian) distribution + * @param sigma_memory: std. deviation of the memory (truncated gaussian) distribution + * @param average_infile_size: expectation value of the input-file size (truncated gaussian) distribution + * @param sigma_infile_size: std. deviation of the input-file size (truncated gaussian) distribution + * @param average_outfile_size: expectation value of the output-file size (truncated gaussian) distribution + * @param sigma_outfile_size: std. 
deviation of the output-file size (truncated gaussian) distribution + * @param workflow_type: flag to specifiy, whether the job should run with streaming or not + * @param jobname_suffix: part of job name to distinguish between different workflows + * + * @throw std::runtime_error + */ +std::map fill_workflow ( + size_t num_jobs, + size_t infiles_per_task, + double average_flops, double sigma_flops, + double average_memory, double sigma_memory, + double average_infile_size, double sigma_infile_size, + double average_outfile_size, double sigma_outfile_size, + WorkflowType workflow_type, std::string jobname_suffix +) { + + // Map to store the workload specification + std::map workload; + std::string potential_separator = "_"; + if(jobname_suffix == ""){ + potential_separator = ""; + } + + // Initialize random number generators + std::normal_distribution<> flops_dist(average_flops, sigma_flops); + std::normal_distribution<> mem_dist(average_memory, sigma_memory); + std::normal_distribution<> insize_dist(average_infile_size, sigma_infile_size); + std::normal_distribution<> outsize_dist(average_outfile_size,sigma_outfile_size); + + for (size_t j = 0; j < num_jobs; j++) { + + // Create a job specification + JobSpecification job_specification; + + // Sample strictly positive task flops + double dflops = flops_dist(SimpleSimulator::gen); + while ((average_flops+sigma_flops) < dflops || dflops < 0.) dflops = flops_dist(SimpleSimulator::gen); + job_specification.total_flops = dflops; + + // Sample strictly positive task memory requirements + double dmem = mem_dist(SimpleSimulator::gen); + while ((average_memory+sigma_memory) < dmem || dmem < 0.) dmem = mem_dist(SimpleSimulator::gen); + job_specification.total_mem = dmem; + + for (size_t f = 0; f < infiles_per_task; f++) { + // Sample inputfile sizes + double dinsize = insize_dist(SimpleSimulator::gen); + while ((average_infile_size+3*sigma_infile_size) < dinsize || dinsize < 0.) dinsize = insize_dist(SimpleSimulator::gen); + job_specification.infiles.push_back(wrench::Simulation::addFile("infile_" + jobname_suffix + potential_separator + std::to_string(j) + "_" + std::to_string(f), dinsize)); + } + + // Sample outfile sizes + double doutsize = outsize_dist(SimpleSimulator::gen); + while ((average_outfile_size+3*sigma_outfile_size) < doutsize || doutsize < 0.) 
doutsize = outsize_dist(SimpleSimulator::gen); + job_specification.outfile = wrench::Simulation::addFile("outfile_" + jobname_suffix + potential_separator + std::to_string(j), doutsize); + + job_specification.workflow_type = workflow_type; + + workload["job_" + jobname_suffix + potential_separator + std::to_string(j)] = job_specification; + } + return workload; +} + +/** + * @brief Method to duplicate the jobs of a workload + * + * @param workload Workload containing jobs to duplicate + * @param duplications Number of duplications each job is duplicated + * @return std::map + */ +std::map duplicateJobs(std::map workload, size_t duplications, std::set> grid_storage_services) { + size_t num_jobs = workload.size(); + std::map dupl_workload; + for (auto & job_spec: workload) { + + boost::smatch job_index_matches; + boost::regex job_index_expression{"\\d+"}; + boost::regex_search(job_spec.first, job_index_matches, job_index_expression); + for (size_t d=0; d < duplications; d++) { + size_t dup_index; + std::stringstream job_index_sstream(job_index_matches[job_index_matches.size()-1]); + job_index_sstream >> dup_index; + dup_index += num_jobs * d; + std::string dupl_job_id = boost::replace_last_copy(job_spec.first, job_index_matches[job_index_matches.size()-1], std::to_string(dup_index)); + JobSpecification dupl_job_specs = job_spec.second; + if (d > 0) { + // TODO: Check if this works as intended + dupl_job_specs.outfile = wrench::Simulation::addFile(boost::replace_last_copy(dupl_job_specs.outfile->getID(), job_index_matches[job_index_matches.size()-1], std::to_string(dup_index)), dupl_job_specs.outfile->getSize()); + // TODO: Think of a better way to copy the outfile destination + for (auto ss : grid_storage_services) { + dupl_job_specs.outfile_destination = wrench::FileLocation::LOCATION(ss); + break; + } + } + dupl_workload.insert(std::make_pair(dupl_job_id, dupl_job_specs)); + } + } + return dupl_workload; +} + + +/** + * @brief Identify demanded services on hosts to run based on configured "type" property tag + * + * @param simulation Simulation object with already instantiated hosts + * + * @throw std::runtime_error, std::invalid_argument + */ +void SimpleSimulator::identifyHostTypes(std::shared_ptr simulation){ + std::vector hostname_list = simulation->getHostnameList(); + if (hostname_list.size() == 0) { + throw std::runtime_error("Empty hostname list! 
Have you instantiated the platform already?"); + } + for (const auto& hostname: hostname_list) { + auto hostProperties = wrench::S4U_Simulation::getHostProperty(hostname, "type"); + bool validType = false; + if (hostProperties == ""){ + throw std::runtime_error("Configuration property \"type\" missing for host " + hostname); + } + if (hostProperties.find("executor") != std::string::npos) { + SimpleSimulator::executors.insert(hostname); + } + if (hostProperties.find("fileregistry") != std::string::npos) { + SimpleSimulator::file_registries.insert(hostname); + } + if (hostProperties.find("networkmonitor") != std::string::npos) { + SimpleSimulator::network_monitors.insert(hostname); + } + if (hostProperties.find("storage") != std::string::npos) { + SimpleSimulator::storage_hosts.insert(hostname); + } + if (hostProperties.find("cache") != std::string::npos) { + SimpleSimulator::cache_hosts.insert(hostname); + } + if (hostProperties.find("worker") != std::string::npos) { + SimpleSimulator::worker_hosts.insert(hostname); + } + if (hostProperties.find("scheduler") != std::string::npos) { + SimpleSimulator::scheduler_hosts.insert(hostname); + } + //TODO: Check for invalid types + // if (! validType) { + // throw std::runtime_error("Invalid type " + hostProperties + " configuration in host " + hostname + "!"); + // } + } +} + +/** + * @brief Method to be executed once at simulation start, + * which finds all hosts in zone and all same level accopanying zones (siblings) + * and fills them into static map. + * @param include_subzones Flag to alse include all hosts in sibling subzones. Default: false + */ +void SimpleSimulator::fillHostsInSiblingZonesMap(bool include_subzones = false) { + std::map> zones_in_zones = wrench::S4U_Simulation::getAllSubZoneIDsByZone(); + std::map> hostnames_in_zones = wrench::S4U_Simulation::getAllHostnamesByZone(); + std::map> tmp_hosts_in_zones; + + if (include_subzones) { // include all hosts in child-zones into a hostname set + for (const auto& zones_in_zone: zones_in_zones) { + std::cerr << "Zone: " << zones_in_zone.first << std::endl; + for (const auto& zone: zones_in_zone.second) { + std::cerr << "\tSubzone: " << zone << std::endl; + for (const auto& host: hostnames_in_zones[zone]) { + std::cerr << "\t\tHost: " << host << std::endl; + tmp_hosts_in_zones[zones_in_zone.first].insert(host); + tmp_hosts_in_zones[zone].insert(host); + } + } + } + } else { // just convert the vector of hostnames to set in map + for (const auto& hostnamesByZone: hostnames_in_zones) { + std::vector hostnamesVec = hostnamesByZone.second; + std::set hostnamesSet(hostnamesVec.begin(), hostnamesVec.end()); + tmp_hosts_in_zones[hostnamesByZone.first] = hostnamesSet; + } + } + // identify all sibling zones and append their hosts + for (const auto& hosts_in_zone: tmp_hosts_in_zones) { + std::string zone = hosts_in_zone.first; + auto parent_zone = simgrid::s4u::Engine::get_instance()->netzone_by_name_or_null(zone)->get_parent(); + auto hosts = hosts_in_zone.second; + for (const auto& sibling: parent_zone->get_children()) { + auto hosts = tmp_hosts_in_zones[sibling->get_name()]; + SimpleSimulator::hosts_in_zones[zone].insert(hosts.begin(), hosts.end()); + } + } +} + + +int main(int argc, char **argv) { + + // instantiate a simulation + auto simulation = wrench::Simulation::createSimulation(); + + // Initialization of the simulation + simulation->init(&argc, argv); + + /* Parsing of the command-line arguments for this WRENCH simulation */ + auto vm = process_program_options(argc, argv); + + // The 
first argument is the platform description file, written in XML following the SimGrid-defined DTD + std::string platform_file = vm["platform"].as(); + + // output-file name containing simulation information + std::string filename = vm["output-file"].as(); + + size_t num_jobs = vm["njobs"].as(); + size_t infiles_per_job = vm["ninfiles"].as(); + double hitrate = vm["hitrate"].as(); + + double average_flops = vm["flops"].as(); + double sigma_flops = vm["sigma-flops"].as(); + double average_memory = vm["mem"].as(); + double sigma_memory = vm["sigma-mem"].as(); + double average_infile_size = vm["insize"].as(); + double sigma_infile_size = vm["sigma-insize"].as(); + double average_outfile_size = vm["outsize"].as(); + double sigma_outfile_size = vm["sigma-outsize"].as(); + + size_t duplications = vm["duplications"].as(); + std::vector workflow_configurations = vm["workflow-configurations"].as>(); + + // Flags to turn on/off the caching of jobs' input-files + SimpleSimulator::infile_caching_on = !(vm["no-caching"].as()); + + // Flags to turn prefetching for streaming of input-files + std::cerr << "Prefetching switch off?: " << vm["prefetch-off"].as() << std::endl; + SimpleSimulator::prefetching_on = !(vm["prefetch-off"].as()); + + // Set XRootD block size + SimpleSimulator::xrd_block_size = vm["xrd-blocksize"].as(); + + // Set StorageService buffer size/type + std::string buffer_size = vm["storage-buffer-size"].as().get(); + + // Choice of cache locality scope + std::string scope_caches = vm["cache-scope"].as().value; + bool rec_netzone_caches = false; + if (scope_caches.find("network") == std::string::npos) { + SimpleSimulator::local_cache_scope = true; + } else { + if (scope_caches.find("sibling") != std::string::npos) { + rec_netzone_caches = true; + } + } + + + /* Create a workload */ + std::cerr << "Constructing workload specification..." 
<< std::endl; + + std::map workload_spec = {}; + + + if(workflow_configurations.size() == 0){ + workload_spec = fill_workflow( + num_jobs, infiles_per_job, + average_flops, sigma_flops, + average_memory,sigma_memory, + average_infile_size, sigma_infile_size, + average_outfile_size, sigma_outfile_size, + vm["workflow-type"].as().get(), "" + ); + + std::cerr << "The workflow has " << std::to_string(num_jobs) << " unique jobs" << std::endl; + } + else { + for(auto &wf_confpath : workflow_configurations){ + std::ifstream wf_conf(wf_confpath); + nlohmann::json wf_json = nlohmann::json::parse(wf_conf); + + // Checking json syntax to match workflow spec + for (auto &wf_key : workflow_keys){ + try { + if(!wf_json.contains(wf_key)){ + throw std::invalid_argument("ERROR: the workflow configuration " + wf_confpath + " must contain " + wf_key + " as information."); + } + } + catch(std::invalid_argument& e){ + std::cerr << e.what() << std::endl; + exit(EXIT_FAILURE); + } + } + std::string workflow_type_lower = boost::to_lower_copy(std::string(wf_json["workflow_type"])); + auto workflow_spec = fill_workflow( + wf_json["num_jobs"], wf_json["infiles_per_job"], + wf_json["average_flops"], wf_json["sigma_flops"], + wf_json["average_memory"], wf_json["sigma_memory"], + wf_json["average_infile_size"], wf_json["sigma_infile_size"], + wf_json["average_outfile_size"], wf_json["sigma_outfile_size"], + get_workflow_type(workflow_type_lower), wf_json["name"] + ); + workload_spec.insert(workflow_spec.begin(), workflow_spec.end()); + std::cerr << "The workflow " << std::string(wf_json["name"]) << " has " << wf_json["num_jobs"] << " unique jobs" << std::endl; + } + } + + + /* Read and parse the platform description file to instantiate a simulation platform */ + std::cerr << "Instantiating SimGrid platform..." << std::endl; + simulation->instantiatePlatform(platform_file); + + + /* Identify demanded and create storage and compute services and add them to the simulation */ + SimpleSimulator::identifyHostTypes(simulation); + + // Fill reachable caches map + if (rec_netzone_caches) { + SimpleSimulator::fillHostsInSiblingZonesMap(); + } else { + for (const auto& hostnamesByZone: wrench::S4U_Simulation::getAllHostnamesByZone()) { + std::vector hostnamesVec = hostnamesByZone.second; + std::set hostnamesSet(hostnamesVec.begin(), hostnamesVec.end()); + SimpleSimulator::hosts_in_zones[hostnamesByZone.first] = hostnamesSet; + } + } + + // Create a list of cache storage services + std::set> cache_storage_services; + for (auto host: SimpleSimulator::cache_hosts) { + //TODO: Support more than one type of cache mounted differently? 
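For reference, a file passed via --workflow-configurations has to provide exactly the keys listed in workflow_keys above. A hypothetical configuration is sketched here with nlohmann::json, the parser used by the simulator, so it can be dumped to a .json file; the workflow name and all numbers are invented and loosely follow the command-line defaults.

```cpp
// Hypothetical --workflow-configurations content; keys mirror the workflow_keys list above.
#include <nlohmann/json.hpp>
#include <fstream>

int main() {
    nlohmann::json wf = {
        {"name", "ttbar"},                    // used as the job-name suffix
        {"num_jobs", 100},
        {"infiles_per_job", 10},
        {"average_flops", 2.164e12}, {"sigma_flops", 2.164e11},
        {"average_memory", 2.0e9},   {"sigma_memory", 2.0e8},
        {"average_infile_size", 3.6e9},   {"sigma_infile_size", 3.6e8},
        {"average_outfile_size", 1.8e10}, {"sigma_outfile_size", 1.8e9},
        {"workflow_type", "streaming"}        // 'calculation', 'streaming', or 'copy'
    };
    std::ofstream("ttbar_workflow.json") << wf.dump(4) << std::endl;
    return 0;
}
```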
+ //TODO: This might not be necessary since different cache layers are typically on different hosts + auto storage_service = simulation->add( + new wrench::SimpleStorageService( + host, {"/"}, + {{wrench::SimpleStorageServiceProperty::BUFFER_SIZE, buffer_size}}, + {} + ) + ); + cache_storage_services.insert(storage_service); + } + + // and remote storages that are able to serve all file requests + //TODO: Think of a way to support grid storages serving only some datasets + std::set> grid_storage_services; + for (auto host: SimpleSimulator::storage_hosts) { + auto storage_service = simulation->add( + new wrench::SimpleStorageService( + host, {"/"}, + {{wrench::SimpleStorageServiceProperty::BUFFER_SIZE, buffer_size}}, + {} + ) + ); + grid_storage_services.insert(storage_service); + } + + // Create a list of compute services that will be used by the HTCondorService + std::set> condor_compute_resources; + for (auto host: SimpleSimulator::worker_hosts) { + condor_compute_resources.insert( + simulation->add( + new wrench::BareMetalComputeService( + host, + {std::make_pair( + host, + std::make_tuple( + wrench::Simulation::getHostNumCores(host), + wrench::Simulation::getHostMemoryCapacity(host) + ) + )}, + "" + ) + ) + ); + } + + // Instantiate a HTcondorComputeService and add it to the simulation + std::set> htcondor_compute_services; + //TODO: Think of a way to support more than one HTCondor scheduler + if (SimpleSimulator::scheduler_hosts.size() != 1) { + throw std::runtime_error("Currently this simulator supports only a single HTCondor scheduler!"); + } + for (auto host: SimpleSimulator::scheduler_hosts) { + htcondor_compute_services.insert( + simulation->add( + new wrench::HTCondorComputeService( + host, + condor_compute_resources, + { + {wrench::HTCondorComputeServiceProperty::NEGOTIATOR_OVERHEAD, "1.0"}, + {wrench::HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY, "10.0"}, + {wrench::HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "10.0"}, + {wrench::HTCondorComputeServiceProperty::NON_GRID_PRE_EXECUTION_DELAY, "5.0"}, + {wrench::HTCondorComputeServiceProperty::NON_GRID_POST_EXECUTION_DELAY, "5.0"} + }, + {} + ) + ) + ); + } + + + /* Instantiate file registry services */ + std::set> file_registry_services; + for (auto host: SimpleSimulator::file_registries) { + std::cerr << "Instantiating a FileRegistryService on " << host << "..." << std::endl; + auto file_registry_service = simulation->add(new wrench::FileRegistryService(host)); + file_registry_services.insert(file_registry_service); + } + + + /* Instantiate Execution Controllers */ + std::set> execution_controllers; + //TODO: Think of a way to support more than one execution controller + if (SimpleSimulator::executors.size() != 1) { + throw std::runtime_error("Currently this simulator supports only a single execution controller!"); + } + for (auto host: SimpleSimulator::executors) { + auto wms = simulation->add( + new SimpleExecutionController( + workload_spec, + htcondor_compute_services, + grid_storage_services, + cache_storage_services, + host, + //hitrate, + filename + ) + ); + execution_controllers.insert(wms); + } + + /* Instantiate inputfiles and set outfile destinations*/ + auto wms = *execution_controllers.begin(); + std::cerr << "Creating and staging input files plus set destination of output files..." 
<< std::endl; + try { + for (auto &job_name_spec: wms->get_workload_spec()) { + // job specifications + auto &job_spec = job_name_spec.second; + std::shuffle(job_spec.infiles.begin(), job_spec.infiles.end(), SimpleSimulator::gen); // Shuffle the input files + // Compute the job's incremental inputfiles size + double incr_inputfile_size = 0.; + for (auto const &f : job_spec.infiles) { + incr_inputfile_size += f->getSize(); + } + double cached_files_size = 0.; + for (auto const &f : job_spec.infiles) { + // Distribute the inputfiles on all GRID storages + //TODO: Think of a more realistic distribution pattern and avoid duplications + for (auto storage_service: grid_storage_services) { + // simulation->stageFile(f, storage_service); + simulation->createFile(f, wrench::FileLocation::LOCATION(storage_service)); + SimpleSimulator::global_file_map[storage_service].touchFile(f.get()); + } + // Distribute the infiles on all caches until desired hitrate is reached + //TODO: Rework the initialization of input files on caches + if (cached_files_size < hitrate*incr_inputfile_size) { + for (const auto& cache : cache_storage_services) { + // simulation->stageFile(f, cache); + simulation->createFile(f, wrench::FileLocation::LOCATION(cache)); + SimpleSimulator::global_file_map[cache].touchFile(f.get()); + } + cached_files_size += f->getSize(); + } + } + if (cached_files_size/incr_inputfile_size < hitrate) { + throw std::runtime_error("Desired hitrate was not reached!"); + } + + // Set outfile destinations + // TODO: Think of a way to identify a specific (GRID) storage + for (auto storage_service: grid_storage_services) { + job_spec.outfile_destination = wrench::FileLocation::LOCATION(storage_service); + break; + } + } + } catch (std::runtime_error &e) { + std::cerr << "Exception: " << e.what() << std::endl; + return 0; + } + + /* Duplicate the workload */ + std::cerr << "Duplicating workload..." << std::endl; + auto new_workload_spec = duplicateJobs(wms->get_workload_spec(), duplications, grid_storage_services); + wms->set_workload_spec(new_workload_spec); + std::cerr << "The workload now has " << std::to_string(new_workload_spec.size()) << " jobs in total " << std::endl; + + + /* Launch the simulation */ + std::cerr << "Launching the Simulation..." << std::endl; + try { + simulation->launch(); + } catch (std::runtime_error &e) { + std::cerr << "Exception: " << e.what() << std::endl; + return 0; + } + std::cerr << "Simulation done!" 
<< std::endl; + + // Check routes from workers to remote storages +#if 0 + for (auto worker_host_name: SimpleSimulator::worker_hosts) { + for(auto remote_host_name: SimpleSimulator::storage_hosts) { + std::vector links; + double latency; + auto worker_host = simgrid::s4u::Host::by_name(worker_host_name); + auto remote_host = simgrid::s4u::Host::by_name(remote_host_name); + worker_host->route_to(remote_host, links, &latency); + std::cerr << "ROUTE FROM " << worker_host->get_name() << " TO " << remote_host->get_name() << ":\n"; + for (const auto l: links) { + std::cerr << " - " << l->get_name() << "\n"; + } + } + } +#endif + + + return 0; +} diff --git a/src/SimpleSimulator.h b/src/SimpleSimulator.h new file mode 100644 index 0000000..0ee8829 --- /dev/null +++ b/src/SimpleSimulator.h @@ -0,0 +1,51 @@ + + +#ifndef S_SIMPLESIMULATOR_H +#define S_SIMPLESIMULATOR_H + +#include "LRU_FileList.h" + +class SimpleSimulator { + +public: + + static void identifyHostTypes(std::shared_ptr simulation); + + static std::set cache_hosts; // hosts configured to provide a cache + static std::set storage_hosts; // hosts configured to provide GRID storage + static std::set worker_hosts; // hosts configured to provide worker capacity + static std::set scheduler_hosts; // hosts configured to provide HTCondor scheduler + static std::set executors; // hosts configured to provide manage job activities + static std::set file_registries; // hosts configured to manage a file registry + static std::set network_monitors; // hosts configured to monitor network + + static void fillHostsInSiblingZonesMap(bool include_subzones); + static bool local_cache_scope; + + static std::map> hosts_in_zones; // map holding information of all hosts present in network zones + + static bool infile_caching_on; + static bool prefetching_on; + static std::map, LRU_FileList> global_file_map; + static double xrd_block_size; + static std::mt19937 gen; + + // Flops distribution + static double mean_flops; + static double sigma_flops; + static std::normal_distribution* flops_dist; + // Memory distribution + static double mean_mem; + static double sigma_mem; + static std::normal_distribution* mem_dist; + // Input-file distribution + static double mean_insize; + static double sigma_insize; + static std::normal_distribution* insize_dist; + // Output-file distribution + static double mean_outsize; + static double sigma_outsize; + static std::normal_distribution* outsize_dist; +}; + +#endif //S_SIMPLESIMULATOR_H diff --git a/src/computation/CacheComputation.cpp b/src/computation/CacheComputation.cpp new file mode 100644 index 0000000..bbf241e --- /dev/null +++ b/src/computation/CacheComputation.cpp @@ -0,0 +1,218 @@ +#include + +XBT_LOG_NEW_DEFAULT_CATEGORY(cache_computation, "Log category for CacheComputation"); + +#include "CacheComputation.h" +#include "../MonitorAction.h" + +//#define SIMULATE_FILE_LOOKUP_OPERATION 1 + + +/** + * @brief Construct a new CacheComputation::CacheComputation object + * to be used as a lambda within a compute action, which shall take caching of input-files into account. 
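As a guide to the bookkeeping in this class: the constructor pre-computes the job's total input size, and `determineFileSourcesAndCache()` later reports which fraction of it was served from a reachable cache as the per-job hitrate. In shorthand notation (symbols are not code identifiers):

$$
S \;=\; \sum_{f\,\in\,\mathrm{infiles}} \mathrm{size}(f),
\qquad
\mathrm{hitrate} \;=\; \frac{S_{\mathrm{cached}}}{S}
$$

The sanity check at the end of `determineFileSourcesAndCache()` tolerates a discrepancy of at most one byte between $S_{\mathrm{cached}} + S_{\mathrm{remote}}$ and $S$.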
+ * + * @param cache_storage_services Storage services reachable to retrieve and cache input files + * @param grid_storage_services Storage services reachable to retrieve input files + * @param files Input files of the job to process + * @param total_flops Total #FLOPS of the whole compute action of the job + */ +CacheComputation::CacheComputation(std::set> &cache_storage_services, + std::set> &grid_storage_services, + std::vector> &files, + double total_flops) { + this->cache_storage_services = cache_storage_services; + this->grid_storage_services = grid_storage_services; + this->files = files; + this->total_flops = total_flops; + this->total_data_size = determineTotalDataSize(files); +} + +/** + * @brief Cache by the job required files on one of the local host's + * reachable cache storage services. + * Free space when needed according to an LRU scheme. + * + * TODO: Find some optimal sources serving and destinations providing files to jobs. + * TODO: Find solutions for possible race conditions, when several jobs require same files. + * + * @param hostname Name of the host, where the job runs + */ +void CacheComputation::determineFileSourcesAndCache(std::shared_ptr action_executor, bool cache_files = true) { + + std::string hostname = action_executor->getHostname(); // host where action is executed + auto host = simgrid::s4u::Host::by_name(hostname); + std::string netzone = host->get_englobing_zone()->get_name(); // network zone executing host belongs to + auto the_action = std::dynamic_pointer_cast(action_executor->getAction()); // executed action + + double cached_data_size = 0.; + double remote_data_size = 0.; + + // Identify all cache storage services that can be reached from + // this host, which runs the streaming action + std::vector> matched_storage_services; + + for (auto const &ss : this->cache_storage_services) { + bool host_in_scope = false; + if (SimpleSimulator::local_cache_scope) { + host_in_scope = (ss->getHostname() == hostname); + } else { + host_in_scope = (SimpleSimulator::hosts_in_zones[netzone].find(ss->getHostname()) != SimpleSimulator::hosts_in_zones[netzone].end()); + } + if (host_in_scope) { + matched_storage_services.push_back(ss); + WRENCH_DEBUG("Found a reachable cache on host %s", ss->getHostname().c_str()); + } + } + if (matched_storage_services.empty()) { + WRENCH_DEBUG("Couldn't find a reachable cache"); + } + + + // For each file, identify where to read it from and/or deal with cache updates, etc. 
+ for (auto const &f : this->files) {
+ // find a source providing the required file
+ std::shared_ptr source_ss;
+ // See whether the file is already available in a "reachable" cache storage service
+ for (auto const &ss : matched_storage_services) {
+#ifdef SIMULATE_FILE_LOOKUP_OPERATION
+ bool has_file = ss->lookupFile(f, wrench::FileLocation::LOCATION(ss));
+#else
+ bool has_file = SimpleSimulator::global_file_map[ss].hasFile(f);
+#endif
+ if (has_file) {
+ source_ss = ss;
+ WRENCH_DEBUG("Found file %s with size %.2f in cache %s", f->getID().c_str(), f->getSize(), source_ss->getHostname().c_str());
+ cached_data_size += f->getSize();
+ break;
+ }
+ }
+ // If yes, we're done
+ if (source_ss) {
+ SimpleSimulator::global_file_map[source_ss].touchFile(f.get());
+ this->file_sources[f] = wrench::FileLocation::LOCATION(source_ss);
+ continue;
+ }
+ // If not, then we have to copy the file from some GRID source to some reachable cache storage service
+ // TODO: Find the optimal GRID source, whatever that means (right now it's whichever one works first)
+ for (auto const &ss : this->grid_storage_services) {
+#ifdef SIMULATE_FILE_LOOKUP_OPERATION
+ bool has_file = ss->lookupFile(f, wrench::FileLocation::LOCATION(ss));
+#else
+ bool has_file = SimpleSimulator::global_file_map[ss].hasFile(f);
+#endif
+ if (has_file) {
+ source_ss = ss;
+ remote_data_size += f->getSize();
+ break;
+ }
+ }
+ if (!source_ss) {
+ throw std::runtime_error("CacheComputation(): Couldn't find file " + f->getID() + " on any storage service!");
+ } else {
+ SimpleSimulator::global_file_map[source_ss].touchFile(f.get());
+ }
+
+ // When there is a reachable cache, cache the file and evict others when needed
+ if (!matched_storage_services.empty()) {
+ // Destination storage to cache the file
+ // TODO: Find the optimal reachable cache destination, whatever that means (right now it's random, with a bad RNG!)
+ auto destination_ss = matched_storage_services.at(rand() % matched_storage_services.size());
+
+ // Evict files to create space, using an LRU scheme!
+ double free_space = destination_ss->getFreeSpace().begin()->second;
+ while (free_space < f->getSize()) {
+ auto to_evict = SimpleSimulator::global_file_map[destination_ss].removeLRUFile();
+ WRENCH_INFO("Evicting file %s from storage service on host %s",
+ to_evict->getID().c_str(), destination_ss->getHostname().c_str());
+ destination_ss->deleteFile(to_evict, wrench::FileLocation::LOCATION(destination_ss));
+ free_space += to_evict->getSize();
+ }
+
+ // Instead of performing the file copy right here, instantly create the file locally so that subsequent jobs find it cached
+ if (cache_files) {
+ //? Alternative: Wait for computation to finish and copy file then
+ // TODO: Better idea perhaps: have the first job that streams the file update a counter
+ // TODO: of file blocks available at the storage service, and subsequent jobs
+ // TODO: can read a block only if it's available (e.g., by waiting on some
+ // TODO: condition variable, which is signaled by the first job each time it
+ // TODO: reads a block).
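The eviction loop above relies on `LRU_FileList` (declared in `src/LRU_FileList.h`, which is referenced in the CMake sources but not shown in this diff). The following is a minimal sketch of the interface it is assumed to provide (`touchFile`, `hasFile`, `removeLRUFile`), reconstructed only from the call sites in this file; the actual header may use different file types and signatures:

```cpp
// Minimal illustration of an LRU bookkeeping structure as assumed by the eviction loop:
// touchFile() marks a file as most recently used, hasFile() answers lookups,
// removeLRUFile() pops the least recently used entry as the eviction candidate.
// This is a sketch, not the project's actual src/LRU_FileList.h.
#include <list>
#include <stdexcept>
#include <unordered_map>

template <typename FilePtr>   // e.g. a raw or smart pointer to the simulated file object
class LRUFileListSketch {
public:
    void touchFile(FilePtr file) {
        auto it = positions.find(file);
        if (it != positions.end()) lru_order.erase(it->second);
        lru_order.push_front(file);                 // front = most recently used
        positions[file] = lru_order.begin();
    }
    bool hasFile(FilePtr file) const { return positions.count(file) > 0; }
    FilePtr removeLRUFile() {
        if (lru_order.empty()) throw std::runtime_error("nothing to evict");
        FilePtr victim = lru_order.back();          // back = least recently used
        lru_order.pop_back();
        positions.erase(victim);
        return victim;
    }
private:
    std::list<FilePtr> lru_order;
    std::unordered_map<FilePtr, typename std::list<FilePtr>::iterator> positions;
};
```

The list-plus-hash-map combination keeps both the "touch" on every access and the eviction of the least recently used file at O(1) cost, which matters because these operations run once per input file of every simulated job.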
+ WRENCH_DEBUG("Caching file %s on storage %s", f->getID().c_str(), destination_ss->getHostname().c_str()); + // wrench::StorageService::copyFile(f, wrench::FileLocation::LOCATION(source_ss), wrench::FileLocation::LOCATION(destination_ss)); + wrench::Simulation::createFile(f, wrench::FileLocation::LOCATION(destination_ss)); + + SimpleSimulator::global_file_map[destination_ss].touchFile(f.get()); + + // this->file_sources[f] = wrench::FileLocation::LOCATION(destination_ss); + } + + } + + this->file_sources[f] = wrench::FileLocation::LOCATION(source_ss); + } + + // Fill monitoring information + if (std::abs(cached_data_size + remote_data_size - this->total_data_size) > 1.) { + throw std::runtime_error("There is more or less data read from cache plus remote than the job's input-data size!"); + } + WRENCH_DEBUG("Hitrate: %.2f (Cached data: %.2f, total data: %.2f)", cached_data_size/this->total_data_size, cached_data_size, this->total_data_size); + the_action->set_hitrate(cached_data_size/this->total_data_size); +} + +//? Question for Henri: put this into determineFileSources function to prevent two times the same loop? +/** + * @brief Determine the incremental size of all input-files of a job + * + * @param files Input files of the job to consider + * @return double + */ +double CacheComputation::determineTotalDataSize(const std::vector> &files) { + double incr_file_size = 0.0; + for (auto const &f : this->files) { + incr_file_size += f->getSize(); + } + return incr_file_size; +} + +/** + * @brief Functor operator to be usable as lambda in custom action + * + * @param action_executor + */ +void CacheComputation::operator () (std::shared_ptr action_executor) { + std::string hostname = action_executor->getHostname(); + + // Identify all file sources (and deal with caching, evictions, etc. + WRENCH_INFO("Determining file sources for cache computation"); + this->determineFileSourcesAndCache(action_executor, SimpleSimulator::infile_caching_on); + // Perform computation + WRENCH_INFO("Performing the computation action"); + this->performComputation(action_executor); + +} + +/** + * @brief Determine the share on the total number of FLOPS to be computed + * in the step processing a fraction of the full input data + * + * @param data_size Size of the input-data block considered + * @param total_data_size Total incremental size of all input-files + * @return double + */ +double CacheComputation::determineFlops(double data_size, double total_data_size) { + double flops = this->total_flops * data_size / total_data_size; + return flops; +} + +/** + * @brief Perform the computation within the simulation of the job + * + * @param action_executor Handle to access the action this computation belongs to + */ +void CacheComputation::performComputation(std::shared_ptr action_executor) { + throw std::runtime_error( + "Base class CacheComputation has no performComputation implemented! \ + It is meant only as a purely virtual placeholder. \ + Use one of the derived classes for the compute action!" 
+ ); +} diff --git a/sgbatch/src/computation/CacheComputation.h b/src/computation/CacheComputation.h similarity index 53% rename from sgbatch/src/computation/CacheComputation.h rename to src/computation/CacheComputation.h index d28b11b..bd43c5a 100644 --- a/sgbatch/src/computation/CacheComputation.h +++ b/src/computation/CacheComputation.h @@ -10,23 +10,26 @@ class CacheComputation { public: - CacheComputation(std::set> & storage_services, - std::vector> &files, - double total_flops + CacheComputation( + std::set> & cache_storage_services, + std::set> & grid_storage_services, + std::vector> &files, + double total_flops ); virtual ~CacheComputation() = default; - void determineFileSources(std::string hostname); + void determineFileSourcesAndCache(std::shared_ptr action_executor, bool cache_files); void operator () (std::shared_ptr action_executor); double determineFlops(double data_size, double total_data_size); - virtual void performComputation(std::string &hostname) = 0; + virtual void performComputation(std::shared_ptr action_executor) = 0; protected: - std::set> storage_services; + std::set> cache_storage_services; + std::set> grid_storage_services; std::vector> files; //? does this need to be ordered? double total_flops; diff --git a/src/computation/CopyComputation.cpp b/src/computation/CopyComputation.cpp new file mode 100644 index 0000000..a6d3c92 --- /dev/null +++ b/src/computation/CopyComputation.cpp @@ -0,0 +1,86 @@ +#include + +XBT_LOG_NEW_DEFAULT_CATEGORY(copy_computation, "Log category for CopyComputation"); + +#include "CopyComputation.h" +#include "MonitorAction.h" + +/** + * @brief Construct a new CopyComputation::CopyComputation object + * to be used as a lambda within a compute action, which shall take caching of input-files into account. + * File read of all input-files and compute steps are performed sequentially. + * + * @param storage_services Storage services reachable to retrieve input files (caches plus remote) + * @param files Input files of the job to process + * @param total_flops Total #FLOPS of the whole compute action of the job + */ +CopyComputation::CopyComputation( + std::set> &cache_storage_services, + std::set> &grid_storage_services, + std::vector> &files, + double total_flops) : CacheComputation::CacheComputation( + cache_storage_services, + grid_storage_services, + files, + total_flops + ) {} + +/** + * @brief Perform the computation within the simulation of the job. + * First read all input-files and then compute the whole number of FLOPS. 
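In other words, `CopyComputation` models a job that first pays the full input transfer and only then computes. Under this sequential assumption (a simplification of the function below, ignoring the per-file bookkeeping and sanity checks) the simulated action time is roughly

$$
T_{\mathrm{copy}} \;\approx\; \sum_{f\,\in\,\mathrm{infiles}} t_{\mathrm{read}}(f) \;+\; t_{\mathrm{compute}}(\mathrm{total\_flops})
$$

so the hitrate only changes where each $t_{\mathrm{read}}(f)$ is served from (cache or GRID storage), not how many bytes are read.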
+ * + * @param action_executor Handle to access the action this computation belongs to + */ +void CopyComputation::performComputation(std::shared_ptr action_executor) { + + auto the_action = std::dynamic_pointer_cast(action_executor->getAction()); // executed action + + double infile_transfer_time = 0.; + double compute_time = 0.; + + WRENCH_INFO("Performing copy computation!"); + // Incremental size of all input files to process + double total_data_size = this->total_data_size; + // Read all input files before computation + double data_size = 0; + for (auto const &fs : this->file_sources) { + WRENCH_INFO("Reading file %s from storage service on host %s", + fs.first->getID().c_str(), fs.second->getStorageService()->getHostname().c_str()); + + double read_start_time = wrench::Simulation::getCurrentSimulatedDate(); + fs.second->getStorageService()->readFile(fs.first, fs.second); + double read_end_time = wrench::Simulation::getCurrentSimulatedDate(); + + data_size += fs.first->getSize(); + if (read_end_time >= read_start_time) { + infile_transfer_time += read_end_time - read_start_time; + } else { + throw std::runtime_error( + "Reading file " + fs.first->getID() + " finished before it started!" + ); + } + } + if (! (std::abs(data_size-total_data_size) < 1.)) { + throw std::runtime_error("Something went wrong in the data size computation!"); + } + + // Perform the computation as needed + double flops = determineFlops(data_size, total_data_size); + WRENCH_INFO("Computing %.2lf flops", flops); + double compute_start_time = wrench::Simulation::getCurrentSimulatedDate(); + wrench::Simulation::compute(flops); + double compute_end_time = wrench::Simulation::getCurrentSimulatedDate(); + + if (compute_end_time > compute_start_time) { + compute_time += compute_end_time - compute_start_time; + } else { + throw std::runtime_error( + "Computing job " + the_action->getJob()->getName() + " finished before it started!" + ); + } + + // Fill monitoring information + the_action->set_infile_transfer_time(infile_transfer_time); + the_action->set_calculation_time(compute_time); +} + diff --git a/src/computation/CopyComputation.h b/src/computation/CopyComputation.h new file mode 100644 index 0000000..476d6e6 --- /dev/null +++ b/src/computation/CopyComputation.h @@ -0,0 +1,26 @@ + + +#ifndef S_COPYCOMPUTATION_H +#define S_COPYCOMPUTATION_H + +#include + +#include "CacheComputation.h" + +class CopyComputation : public CacheComputation { + +public: + CopyComputation( + std::set> &cache_storage_services, + std::set> &grid_storage_services, + std::vector> &files, + double total_flops + ); + + void performComputation(std::shared_ptr action_executor) override; + +private: + +}; + +#endif //S_COPYCOMPUTATION_H diff --git a/src/computation/StreamedComputation.cpp b/src/computation/StreamedComputation.cpp new file mode 100644 index 0000000..00f1387 --- /dev/null +++ b/src/computation/StreamedComputation.cpp @@ -0,0 +1,138 @@ +#include + +XBT_LOG_NEW_DEFAULT_CATEGORY(streamed_computation, "Log category for StreamedComputation"); + +#include "StreamedComputation.h" +#include "MonitorAction.h" + + + +/** + * @brief Construct a new StreamedComputation::StreamedComputation object + * to be used as a lambda within a compute action, which shall take caching of input-files into account. + * File read is performed asynchronously in blocks and the according coompute step is executed + * once the corresponding block is available. 
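For orientation, a sketch of the timing model that the block loop below implements, with $b_i$ the size of block $i$ (at most `xrd_block_size`) and $S$ the job's total input size; the exact per-block byte accounting in the code may differ slightly:

$$
\mathrm{flops}_i \;=\; \mathrm{total\_flops}\cdot\frac{b_i}{S},
\qquad
\Delta T_i \;\approx\;
\begin{cases}
\max\!\bigl(t_{\mathrm{read}}(b_{i+1}),\; t_{\mathrm{compute}}(\mathrm{flops}_i)\bigr) & \text{prefetching on}\\[4pt]
t_{\mathrm{compute}}(\mathrm{flops}_i) \;+\; t_{\mathrm{read}}(b_{i+1}) & \text{prefetching off}
\end{cases}
$$

The first block is always read up front, and the final block is computed without a subsequent read; with prefetching enabled the compute of block $i$ overlaps the read of block $i+1$, otherwise the two are serialized.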
+ *
+ * @param cache_storage_services Storage services reachable to retrieve and cache input files
+ * @param grid_storage_services Storage services reachable to retrieve input files
+ * @param files Input files of the job to process
+ * @param total_flops Total #FLOPS of the whole compute action of the job
+ */
+StreamedComputation::StreamedComputation(
+ std::set> &cache_storage_services,
+ std::set> &grid_storage_services,
+ std::vector> &files,
+ double total_flops, bool prefetch_on) : CacheComputation::CacheComputation(
+ cache_storage_services,
+ grid_storage_services,
+ files,
+ total_flops
+ ) {prefetching_on = prefetch_on;}
+
+/**
+ * @brief Perform the computation within the simulation of the job.
+ * Read the input files asynchronously in blocks (without waiting for the previous computation to finish)
+ * and compute the corresponding share of FLOPS once a block has been read.
+ *
+ * @param action_executor Handle to access the action this computation belongs to
+ */
+void StreamedComputation::performComputation(std::shared_ptr action_executor) {
+
+ auto the_action = std::dynamic_pointer_cast(action_executor->getAction()); // executed action
+
+ double infile_transfer_time = 0.;
+ double compute_time = 0.;
+
+ WRENCH_INFO("Performing streamed computation!");
+ // Incremental size of all input files to be processed
+ auto total_data_size = this->total_data_size;
+ for (auto const &fs : this->file_sources) {
+ WRENCH_INFO("Streaming computation for input file %s", fs.first->getID().c_str());
+ double data_to_process = fs.first->getSize();
+
+ // Compute the number of blocks
+ int num_blocks = int(std::ceil(data_to_process / (double) SimpleSimulator::xrd_block_size));
+
+ // Read the first block
+ double read_start_time = wrench::Simulation::getCurrentSimulatedDate();
+ fs.second->getStorageService()->readFile(fs.first, fs.second, std::min(SimpleSimulator::xrd_block_size, data_to_process));
+ double read_end_time = wrench::Simulation::getCurrentSimulatedDate();
+ if (read_end_time > read_start_time) {
+ infile_transfer_time += read_end_time - read_start_time;
+ } else {
+ throw std::runtime_error(
+ "Reading block " + std::to_string(0) +
+ " of file " + fs.first->getID() + " finished before it started!"
+ ); + } + + // Process next blocks: compute block i while reading block i+i + for (int i=0; i < num_blocks - 1; i++) { + double num_bytes = std::min(SimpleSimulator::xrd_block_size, data_to_process); + double num_flops = determineFlops(num_bytes, total_data_size); + // WRENCH_INFO("Chunk: %.2lf bytes / %.2lf flops", num_bytes, num_flops); + // Start the computation asynchronously + simgrid::s4u::ExecPtr exec = simgrid::s4u::this_actor::exec_init(num_flops); + double exec_start_time = 0.0; + double exec_end_time = 0.0; + if(this->prefetching_on){ + exec->start(); + exec_start_time = exec->get_start_time(); + // Read data from the file + read_start_time = wrench::Simulation::getCurrentSimulatedDate(); + fs.second->getStorageService()->readFile(fs.first, fs.second, num_bytes); + read_end_time = wrench::Simulation::getCurrentSimulatedDate(); + // Wait for the computation to be done + exec->wait(); + exec_end_time = exec->get_finish_time(); + } + else { + exec->start(); + exec_start_time = exec->get_start_time(); + exec->wait(); + exec_end_time = exec->get_finish_time(); + read_start_time = wrench::Simulation::getCurrentSimulatedDate(); + fs.second->getStorageService()->readFile(fs.first, fs.second, num_bytes); + read_end_time = wrench::Simulation::getCurrentSimulatedDate(); + } + data_to_process -= num_bytes; + if (exec_end_time >= exec_start_time) { + compute_time += exec_end_time - exec_start_time; + } else { + throw std::runtime_error( + "Executing block " + std::to_string(i) + + " of job " + the_action->getJob()->getName() + " finished before it started!" + ); + } + if (read_end_time > read_start_time) { + infile_transfer_time += read_end_time - read_start_time; + } else { + throw std::runtime_error( + "Reading block " + std::to_string(i) + + " of file " + fs.first->getID() + " finished before it started!" + ); + } + } + + // Process last block + double num_flops = determineFlops(std::min(SimpleSimulator::xrd_block_size, data_to_process), total_data_size); + simgrid::s4u::ExecPtr exec = simgrid::s4u::this_actor::exec_init(num_flops); + exec->start(); + double exec_start_time = exec->get_start_time(); + exec->wait(); + double exec_end_time = exec->get_finish_time(); + if (exec_end_time > exec_start_time) { + compute_time += exec_end_time - exec_start_time; + } else { + throw std::runtime_error( + "Executing block " + std::to_string(num_blocks-1) + + " of job " + the_action->getJob()->getName() + " finished before it started!" + ); + } + } + + // Fill monitoring information + the_action->set_infile_transfer_time(infile_transfer_time); + the_action->set_calculation_time(compute_time); + +} + diff --git a/src/computation/StreamedComputation.h b/src/computation/StreamedComputation.h new file mode 100644 index 0000000..5114776 --- /dev/null +++ b/src/computation/StreamedComputation.h @@ -0,0 +1,30 @@ + + +#ifndef S_STREAMEDCOMPUTATION_H +#define S_STREAMEDCOMPUTATION_H + +#include + +#include "CacheComputation.h" + +class StreamedComputation : public CacheComputation { + +public: + // TODO: REMOVE MOST THINGS IN HERE AND RELY ON THE GLOBALS IN SimpleSimulation::... 
+ StreamedComputation( + std::set> &cache_storage_services, + std::set> &grid_storage_services, + std::vector> &files, + double total_flops, + bool prefetch_on + ); + + void performComputation(std::shared_ptr action_executor) override; + +private: + + bool prefetching_on; + +}; + +#endif //S_STREAMEDCOMPUTATION_H diff --git a/src/util/DefaultValues.h b/src/util/DefaultValues.h new file mode 100644 index 0000000..c8c428b --- /dev/null +++ b/src/util/DefaultValues.h @@ -0,0 +1,19 @@ + + +#ifndef S_DEFAULTVALUES_H +#define S_DEFAULTVALUES_H + +/** + * @brief Set of default values that can be used in the simulation. + * ATTENTION: Comparing two float values is never a good idea, + * rather check for differences + */ +class DefaultValues { +public: + static constexpr const int UndefinedInt = -9999; + static constexpr const float UndefinedFloat = -9999.0f; + static constexpr const double UndefinedDouble = -9999.0; +}; + + +#endif //S_DEFAULTVALUES_H diff --git a/src/util/Enums.h b/src/util/Enums.h new file mode 100644 index 0000000..f6e44a2 --- /dev/null +++ b/src/util/Enums.h @@ -0,0 +1,67 @@ + + +#ifndef S_ENUMS_H +#define S_ENUMS_H + +#include +#include + +/** + * @brief Set of enums that can be used in the simulation and related methods. + */ + +enum WorkflowType {Calculation, Streaming, Copy}; + +inline WorkflowType get_workflow_type(std::string wfname) { + if(wfname == "calculation") { + return WorkflowType::Calculation; + } + else if (wfname == "streaming") { + return WorkflowType::Streaming; + } + else if (wfname == "copy") { + return WorkflowType::Copy; + } + else { + throw std::runtime_error("Workflow type " + wfname + " invalid. Please choose 'calculation', 'streaming', or 'copy'"); + } +} + + +/** + * @enum StorageServiceBufferType + * @brief Types of wrench::StorageServiceProperty BUFFER_SIZE values + */ +enum StorageServiceBufferType { + Infinity, /* full buffering */ + // Zero, /* ideal (continous) flow model, not implemented yet */ + Value /* Any integral value between 0 and infinity corresponding to a real buffer size (small buffer size -> many simulation calls -> slower simulation) */ +}; + +/** + * @brief Get the StorageServiceProperty BUFFER_SIZE value type object + * + * @param ssprop + * @return StorageServiceBufferType + */ +inline StorageServiceBufferType get_ssbuffer_type(std::string ssprop) { + if(ssprop == "infinity") { + return StorageServiceBufferType::Infinity; + } + else if (ssprop == "0") { + throw std::logic_error("Feature 'continous flow model' for storage buffers not implemented yet"); + // return StorageServiceBufferType::Zero; + } + else { + if ((!ssprop.empty()) && (ssprop.find_first_not_of("0123456789")==std::string::npos) && (std::stoll(ssprop) > 0)) { + return StorageServiceBufferType::Value; + } + else { + throw std::runtime_error("StorageService buffer value " + ssprop + "invalid. Please choose 'infinity', 'zero' or a positive long integer value in between"); + } + } +} + + +#endif //S_ENUMS_H + diff --git a/tools/hitratePerformancePlots.py b/tools/hitratePerformancePlots.py new file mode 100755 index 0000000..04bb955 --- /dev/null +++ b/tools/hitratePerformancePlots.py @@ -0,0 +1,298 @@ +#! 
/usr/bin/python3 + +import pandas as pd +import numpy as np +from matplotlib import pyplot as plt +import seaborn as sns +import os.path +import argparse +from collections.abc import Iterable + + +plt.rcParams['figure.autolayout'] = True +pd.set_option('display.max_columns',None) +plt.rcParams['axes.facecolor'] = 'white' +plt.rcParams['axes.spines.left'] = True +plt.rcParams['axes.spines.right'] = False +plt.rcParams['axes.spines.top'] = False +plt.rcParams['axes.spines.bottom'] = True +plt.rcParams['axes.grid'] = False +plt.rcParams['axes.grid.axis'] = 'both' +plt.rcParams['axes.labelcolor'] = '#555555' +plt.rcParams['text.color'] = 'black' +plt.rcParams['figure.figsize'] = 6,4 +plt.rcParams['figure.dpi'] = 100 +plt.rcParams['figure.titleweight'] = 'normal' +plt.rcParams['font.family'] = 'sans-serif' + + +QUANTITIES = { + "Walltime": { + "ylabel": "jobtime / min", + "ylim": None, # [20,65], + }, + "IOtime": { + "ylabel": "transfer time / min", + "ylim": None, # [20,65], + }, + "Efficiency": { + "ylabel": "CPU eff.", + "ylim": [0,1.05], + }, +} + + +scenario_plotlabel_dict = { + "copy": "Input-files copied", + "fullstream": "Block-streaming", + "SGBatch_fullstream_10G": "SG-Batch 10Gb/s gateway", + "SGBatch_fullstream_1G": "SG-Batch 1Gb/s gateway", + "SGBatch_fullstream_10G_70Mcache": "SG-Batch 10Gb/s gateway 70MB/s cache" +} + + +def valid_file(param: str) -> str: + base, ext = os.path.splitext(param) + if ext.lower() not in (".csv"): + raise argparse.ArgumentTypeError("File must have a csv extension") + if not os.path.exists(param): + raise FileNotFoundError('{}: No such file'.format(param)) + return param + +def scale_xticks(ax: plt.Axes, ticks: Iterable): + """Helper function which sets the xticks to the according scaled positions + + Args: + ax (matplotlib.Axes): subplot to scale xticks + ticks (Iterable): list of expected ticks (at least two values, lowest and highest tick) + """ + scale = (ax1.get_xlim()[-1]-ax1.get_xlim()[0]-1)/(ticks[-1]-ticks[0]) + print(f"Scale {(ticks[0],ticks[-1])} with {scale} to end up with correct seaborn axis {ax.get_xlim()}") + ax.set_xticks([scale*x for x in ticks]) + ax.set_xticklabels(["{:.1f}".format(x) for x in ticks]) + + +parser = argparse.ArgumentParser( + description="Produce a plot containing the hitrate dependency of the simulated system. \ + It uses several files, one for each hitrate value to be represented in the scan. \ + The files containing the simulation dump are CSV files produced by the output method of the simulator.", + add_help=True +) +parser.add_argument( + "--scenario", + type=str, + choices=scenario_plotlabel_dict.keys(), + required=True, + help="Choose a scenario, which is used in the according plotting label and file-name of the plot." +) +parser.add_argument( + "--style", + choices=["scatterplot", "pointplot", "boxplot", "boxenplot", "violinplot", "jointplot"], + default="scatterplot", + type=str, + help="Plot style for the visualization." +) +parser.add_argument( + "--suffix", + type=str, + help="Optonal suffix to add to the file-name of the plot." +) +parser.add_argument( + "simoutputs", + nargs='+', + type=valid_file, + help="CSV files containing information about the simulated jobs \ + produced by the simulator." 
+) + + +args = parser.parse_args() + +scenario = args.scenario +if args.suffix: + suffix = "_"+args.suffix +else: + suffix = "" +plotstyle = args.style + + +# create a dict of hitrate and corresponding simulation-trace JSON-output-files +outputfiles = args.simoutputs +for outputfile in outputfiles: + outputfile = os.path.abspath(outputfile) + assert(os.path.exists(outputfile)) + +print("Found {0} output-files! Produce a hitrate scan for {0} hitrate values...".format(len(outputfiles))) + + +# create a dataframe for each CSV file and add hitrate information +dfs = [] +for outputfile in outputfiles: + with open(outputfile) as f: + df_tmp = pd.read_csv(f, sep=",\s") + dfs.append(df_tmp) + + +# concatenate all dataframes +if (all(os.path.exists(f) for f in outputfiles)): + df = pd.concat( + [ + df + for df in dfs + ], + ignore_index=True + ) + print("Simulation task output traces: \n", df) +else: + print("Couldn't find any files") + exit(1) + + +# Derive quantities +df["Walltime"] = (df["job.end"]-df["job.start"])/60 +df["IOtime"] = (df["infiles.transfertime"]+df["outfiles.transfertime"])/60 +df["Efficiency"] = df["job.computetime"]/(df["job.end"]-df["job.start"]) + + +# plot and save +machines = sorted(df["machine.name"].unique()) +print(f"Unique machines for hue: {machines}") +hitrateticks = [x*0.1 for x in range(0,11)] + +for quantity, qstyle in QUANTITIES.items(): + + print(f"Plotting for quantity {quantity}") + + if plotstyle == "scatterplot": + fig = plt.figure(f"hitrate-{quantity}", figsize=(6,4)) + ax1 = sns.scatterplot( + x="hitrate", y=quantity, + hue="machine.name", hue_order=machines, + data=df, + palette=sns.color_palette("colorblind", n_colors=len(machines)), + alpha=0.9 + ) + ax1.set_title(scenario_plotlabel_dict[scenario]) + ax1.set_xlabel("hitrate", loc="right") + ax1.set_ylabel(qstyle["ylabel"], color="black") + ax1.set_xlim([-0.05,1.05]) + ax1.set_xticks(hitrateticks) + if qstyle["ylim"]: + ax1.set_ylim(qstyle["ylim"]) + ax1.legend(loc='best') + fig.savefig(f"hitrate{quantity}_{scenario}jobs{suffix}.pdf") + fig.savefig(f"hitrate{quantity}_{scenario}jobs{suffix}.png") + plt.close() + + elif plotstyle == "pointplot": + fig = plt.figure(f"hitrate-{quantity}", figsize=(6,4)) + ax1 = fig.add_subplot(1,1,1) + sns.pointplot( + x="hitrate", y=quantity, + hue="machine.name", hue_order=machines, + data=df, + estimator="median", errorbar=("pi",95), # ci = Confidence Interval, pi = Percentile Interval, sd = Standard Deviation, se = Standard Error of Mean + dodge=True, join=False, + markers=".", capsize=0.5/len(machines), errwidth=1., + palette=sns.color_palette("colorblind", n_colors=len(machines)), + ax=ax1 + ) + ax1.set_title(scenario_plotlabel_dict[scenario]) + ax1.set_xlabel("hitrate", loc="right", color="black") + scale_xticks(ax1, hitrateticks) + ax1.set_ylabel(qstyle["ylabel"], color="black") + if qstyle["ylim"]: + ax1.set_ylim(qstyle["ylim"]) + ax1.legend(loc='best') + fig.savefig(f"hitrate{quantity}_{scenario}jobs{suffix}.pdf") + fig.savefig(f"hitrate{quantity}_{scenario}jobs{suffix}.png") + plt.close() + + elif plotstyle=="boxplot": + fig = plt.figure(f"hitrate-{quantity}", figsize=(6,4)) + ax1 = sns.boxplot( + x="hitrate", y=quantity, + hue="machine.name", hue_order=machines, + data=df, + orient="v", + # whis=1.5, #[0.5,99.5], + flierprops=dict(marker="x"), + palette=sns.color_palette("colorblind", n_colors=len(machines)), + ) + ax1.set_title(scenario_plotlabel_dict[scenario]) + ax1.set_xlabel("hitrate", loc="right") + scale_xticks(ax1, hitrateticks) + 
ax1.set_ylabel(qstyle["ylabel"], color="black") + if qstyle["ylim"]: + ax1.set_ylim(qstyle["ylim"]) + ax1.legend(loc='best') + fig.savefig(f"hitrate{quantity}_{scenario}jobs{suffix}.pdf") + fig.savefig(f"hitrate{quantity}_{scenario}jobs{suffix}.png") + plt.close() + + elif plotstyle =="boxenplot": + fig = plt.figure("hitrate-walltime", figsize=(6,4)) + ax1 = sns.boxenplot( + x="hitrate", y=quantity, + hue="machine.name", hue_order=machines, + data=df, + orient="v", + k_depth="proportion", + linewidth=0.5, + flier_kws=dict(marker="."), + palette=sns.color_palette("colorblind", n_colors=len(machines)), + ) + ax1.set_title(scenario_plotlabel_dict[scenario]) + ax1.set_xlabel("hitrate", loc="right") + scale_xticks(ax1, hitrateticks) + ax1.set_ylabel(qstyle["ylabel"], color="black") + if qstyle["ylim"]: + ax1.set_ylim(qstyle["ylim"]) + ax1.legend(loc='best') + fig.savefig(f"hitrate{quantity}_{scenario}jobs{suffix}.pdf") + fig.savefig(f"hitrate{quantity}_{scenario}jobs{suffix}.png") + plt.close() + + elif plotstyle =="violinplot": + fig = plt.figure("hitrate-walltime", figsize=(6,4)) + ax1 = sns.violinplot( + x="hitrate", y=quantity, + hue="machine.name", hue_order=machines, + data=df, + orient="v", + bw="scott", scale="count", inner="quartile", + linewidth=0.5, + palette=sns.color_palette("colorblind", n_colors=len(machines)), + ) + ax1.set_title(scenario_plotlabel_dict[scenario]) + ax1.set_xlabel("hitrate", loc="right") + scale_xticks(ax1, hitrateticks) + ax1.set_ylabel(qstyle["ylabel"], color="black") + if qstyle["ylim"]: + ax1.set_ylim(qstyle["ylim"]) + ax1.legend(loc='best') + fig.savefig(f"hitrate{quantity}_{scenario}jobs{suffix}.pdf") + fig.savefig(f"hitrate{quantity}_{scenario}jobs{suffix}.png") + plt.close() + + elif plotstyle == "jointplot": + grid = sns.JointGrid( + x="hitrate", y=quantity, + hue="machine.name", hue_order=machines, + data=df, + xlim=[-0.1,1.1], + ylim=qstyle["ylim"] if qstyle["ylim"] else None, + marginal_ticks=True, + height=7, + palette=sns.color_palette("colorblind", n_colors=len(machines)), + ) + grid.plot_joint(sns.scatterplot) + grid.plot_marginals(sns.histplot, multiple="layer", element="step", fill=True) + + grid.set_axis_labels(xlabel="hitrate", ylabel=qstyle["ylabel"], color="black") + grid.savefig(f"hitrate{quantity}_{scenario}jobs{suffix}.pdf") + grid.savefig(f"hitrate{quantity}_{scenario}jobs{suffix}.png") + plt.close() + + else: + raise NotImplementedError(f"Plotstyle {plotstyle} not implemented") diff --git a/tools/hitrateScan.sh b/tools/hitrateScan.sh new file mode 100755 index 0000000..1f5c31a --- /dev/null +++ b/tools/hitrateScan.sh @@ -0,0 +1,67 @@ +#! /bin/bash + +# Bash script tu run simulations for several hitrate values. +# The simulation monitoring outputs can be plotted using the hitrateperfromance.py plotting script. + +action() { + # determine the directy of this file + if [ ! 
-z "$ZSH_VERSION" ]; then + local this_file="${(%):-%x}" + else + local this_file="${BASH_SOURCE[0]}" + fi + #source /cvmfs/cms.cern.ch/slc6_amd64_gcc480/external/python/2.7.3/etc/profile.d/init.sh + + local base="$( cd "$( dirname "$this_file" )" && pwd )" + local parent="$( dirname "$base" )" + + local PLATFORM_DIR="$parent/data/platform-files" + local PLATFORM="sgbatch_validation.xml" + + local NJOBS=1 + local NINFILES=20 #10 + local AVGINSIZE=$(bc -l <<< "8554379000 / ${NINFILES}") + local AVGOUTSIZE=0 + local FLOPS=$(bc -l <<< "1.95*1480*1000*1000*1000") + local MEM=2400 + local SIGMA_FLOPS=0 + local SIGMA_MEM=0 + local SIGMA_INSIZE=0 + local SIGMA_OUTSIZE=0 + local DUPLICATIONS=48 + + local XRD_BLOCKSIZE=100000000 + + local SCENARIO="fullstream" # further options synchronized with plotting script "copy", "simplifiedstream", "fullstream" + + local OUTDIR="$parent/tmp/outputs" + if [ ! -d $OUTDIR ]; then + mkdir -p $OUTDIR + fi + + for hitrate in $(LANG=en_US seq 0.0 0.1 1.0) + do + dc-sim --platform "$PLATFORM_DIR/$PLATFORM" \ + --njobs $NJOBS \ + --ninfiles $NINFILES --insize $AVGINSIZE \ + --sigma-insize $SIGMA_INSIZE \ + --hitrate ${hitrate} \ + --flops $FLOPS \ + --sigma-flops $SIGMA_FLOPS \ + --mem $MEM \ + --sigma-mem $SIGMA_MEM \ + --outsize $AVGOUTSIZE \ + --sigma-outsize $SIGMA_OUTSIZE \ + --duplications $DUPLICATIONS \ + --xrd-blocksize $XRD_BLOCKSIZE \ + --output-file ${OUTDIR}/hitratescaling_${SCENARIO}_${NJOBS}jobs_hitrate${hitrate}.csv \ + --cfg=network/loopback-bw:100000000000000 \ + --no-caching #\ + # --no-streaming \ + # --wrench-full-log + # --log=simple_wms.threshold=debug \ + # --log=cache_computation.threshold=debug + done +} + +action "$@" diff --git a/tools/hitrateperformance.py b/tools/hitrateperformance.py deleted file mode 100644 index 425d18d..0000000 --- a/tools/hitrateperformance.py +++ /dev/null @@ -1,137 +0,0 @@ -#! /usr/bin/python3 - -import pandas as pd -import numpy as np -from matplotlib import pyplot as plt -import os.path -import argparse - - -plt.rcParams["figure.figsize"] = [4., 3.] -plt.rcParams["figure.autolayout"] = True - - -def valid_file(param): - base, ext = os.path.splitext(param) - if ext.lower() not in ('.csv'): - raise argparse.ArgumentTypeError('File must have a csv extension') - return param - - -parser = argparse.ArgumentParser( - description="Produce a plot containing the hitrate dependency of the simulated system. \ - It uses several files, one for each hitrate value to be represented in the scan. \ - The files containing the simulation dump are CSV files produced by the output method of the simulator.", - add_help=True -) -parser.add_argument( - "--scenario", - type=str, - choices=("copy", "simplifiedstream", "fullstream"), - required=True, - help="Choose a scenario, which is used in the according plotting label and file-name of the plot." -) -parser.add_argument( - "--suffix", - type=str, - help="Optonal suffix to add to the file-name of the plot." -) -parser.add_argument( - "simoutputs", - nargs='+', - type=valid_file, - help="CSV files containing information about the simulated jobs \ - produced by the simulator." 
-) - - -args = parser.parse_args() - -scenario = args.scenario -suffix=args.suffix - - -scenario_plotlabel_dict = { - "copy": "Input-files copied", - "fullstream": "Block-streaming" -} - - -machines = ['sg01', 'sg02', 'sg03'] - - -# create a dict of hitrate and corresponding simulation-trace JSON-output-files -outputfiles = args.simoutputs -for outputfile in outputfiles: - outputfile = os.path.abspath(outputfile) - assert(os.path.exists(outputfile)) - -print("Found {0} output-files! Produce a hitrate scan for {0} hitrate values...".format(len(outputfiles))) -hitrates = [float(outfile.split("_")[-1].strip(".csv").strip("hitrate")) for outfile in outputfiles] - -outputfiles_dict = dict(zip(hitrates,outputfiles)) -print(outputfiles_dict) - - -# create a dataframe for each CSV file and add hitrate information -dfs = [] -for hitrate, outputfile in outputfiles_dict.items(): - with open(outputfile) as f: - df_tmp = pd.read_csv(f, sep=',\t') - df_tmp['hitrate'] = hitrate - dfs.append(df_tmp) - - -# concatenate all dataframes -if (all(os.path.exists(f) for f in outputfiles)): - df = pd.concat( - [ - df - for df in dfs - ], - ignore_index=True - ) - print("Simulation task output traces: \n", df) -else: - print("Couldn't find any files") - exit(1) - -# plot the job-runtime dependence on hitrate -fig, ax1 = plt.subplots() -ax1.set_title(scenario_plotlabel_dict[scenario]) - -ax1.set_xlabel('hitrate', loc='right') -ax1.set_ylabel('jobtime / min', color='black') -ax1.set_xlim([-0.05,1.05]) -# ax1.set_ylim([0,400]) - -# ax1 = df.plot.scatter(x='hitrate', y='walltime', c=) - -scatter = ax1.scatter(df['hitrate'], (df['job.end']-df['job.start'])/60., c=df['machine.name'].astype('category').cat.codes, marker='x') -# ax1.grid(axis="y", linestyle = 'dotted', which='major') - -ax1.legend( - handles=scatter.legend_elements()[0], - labels=machines, - title="machines" -) - -fig.savefig(f"hitratescaling_{scenario}jobs{suffix}.pdf") - - -fig2, ax2 = plt.subplots() -ax2.set_title(scenario_plotlabel_dict[scenario]) - -ax2.set_xlabel('hitrate', loc='right') -ax2.set_ylabel('transfer time / min', color='black') -ax2.set_xlim([-0.05,1.05]) - -scatter = ax2.scatter(df['hitrate'], ((df['infiles.transfertime']+df['outfiles.transfertime']))/60., c=df['machine.name'].astype('category').cat.codes, marker='x') - -ax2.legend( - handles=scatter.legend_elements()[0], - labels=machines, - title="machines" -) - -fig2.savefig(f"hitratetransfer_{scenario}jobs{suffix}.pdf") \ No newline at end of file diff --git a/tools/hitratescan.sh b/tools/hitratescan.sh deleted file mode 100644 index 95b85c6..0000000 --- a/tools/hitratescan.sh +++ /dev/null @@ -1,26 +0,0 @@ -#! /bin/bash - -# Bash script tu run simulations for several hitrate values. -# The simulation monitoring outputs can be plotted using the hitrateferfromance.py plotting script. - -NJOBS=60 -NINFILES=10 -AVGINSIZE=3600000000 -FLOPS=216442800000 - -SCENARIO="fullstream" # further options synchronized with plotting script "copy", "simplifiedstream", "fullstream" - -OUTDIR="tmp/outputs" -if [ ! 
-d $OUTDIR ]; then - mkdir -p $OUTDIR -fi - -for hitrate in $(LANG=en_US seq 0.0 0.1 1.0) -do - sgbatch-sim --platform data/platform-files/hosts.xml \ - --njobs $NJOBS \ - --ninfiles $NINFILES --insize $AVGINSIZE \ - --hitrate ${hitrate} \ - --flops $FLOPS \ - --output-file ${OUTDIR}/hitratescaling_${SCENARIO}_${NJOBS}jobs_hitrate${hitrate}.csv -done \ No newline at end of file diff --git a/tools/htcondor/etpbatch/arguments.txt b/tools/htcondor/etpbatch/arguments.txt new file mode 100644 index 0000000..88b9c7b --- /dev/null +++ b/tools/htcondor/etpbatch/arguments.txt @@ -0,0 +1,25 @@ +50 +100 +150 +200 +250 +300 +350 +400 +450 +500 +550 +600 +650 +700 +750 +800 +850 +900 +950 +1000 +1050 +1100 +1150 +1200 +1250 diff --git a/tools/htcondor/etpbatch/measure_and_simulate.jdl b/tools/htcondor/etpbatch/measure_and_simulate.jdl new file mode 100644 index 0000000..06505cc --- /dev/null +++ b/tools/htcondor/etpbatch/measure_and_simulate.jdl @@ -0,0 +1,33 @@ +# submit multiple simulations and harvest scaling and job information + +executable = ./measure_and_simulate.sh +arguments = $(PLATFORM) $(NJOBS) $(BUFFERSIZE) + +should_transfer_files = YES +transfer_input_files = ../../../data/platform-files/$(PLATFORM).xml +transfer_output_files = scaling_dump_$(PLATFORM)$(NJOBS)jobs.txt, $(PLATFORM)$(NJOBS).csv +when_to_transfer_output = ON_EXIT + +log = logs/log.$(ClusterId).$(ProcId) +output = logs/out.$(ClusterId).$(ProcId) +error = logs/err.$(ClusterId).$(ProcId) + + +accounting_group=cms.production +Requirements = TARGET.ProvidesIO && (TARGET.Machine=?="ms04.etp.kit.edu") ++RemoteJob = True + +universe = docker +docker_image = mschnepf/slc7-condocker + ++RequestWalltime = (15*$(NJOBS)) + 3600 + (24*3600*NumJobStarts) +request_cpus = 1 +RequestMemory = (10*15*$(NJOBS))+(2*4000*NumJobStarts) +periodic_release = (HoldReasonCode == 34) +RequestDisk = 4000000 + +x509userproxy = $ENV(X509_USER_PROXY) + +PLATFORM = ETPbatch_reduced_simple +BUFFERSIZE = 1048576 +queue NJOBS in (850, 900, 950) diff --git a/tools/htcondor/etpbatch/measure_and_simulate.jdl.bak b/tools/htcondor/etpbatch/measure_and_simulate.jdl.bak new file mode 100644 index 0000000..9d22510 --- /dev/null +++ b/tools/htcondor/etpbatch/measure_and_simulate.jdl.bak @@ -0,0 +1,33 @@ +# submit multiple simulations and harvest scaling and job information + +executable = ./measure_and_simulate.sh +arguments = $(PLATFORM) $(NJOBS) + +should_transfer_files = YES +transfer_input_files = ../../../data/platform-files/$(PLATFORM).xml +transfer_output_files = scaling_dump_$(PLATFORM)$(NJOBS)jobs.txt, $(PLATFORM)$(NJOBS).csv, monitor$(PLATFORM)$(NJOBS).txt +when_to_transfer_output = ON_EXIT + +log = logs/log.$(ClusterId).$(ProcId) +output = logs/out.$(ClusterId).$(ProcId) +error = logs/err.$(ClusterId).$(ProcId) + + +accounting_group=cms.production +Requirements = TARGET.ProvidesIO && (TARGET.Machine=?="portal1.etp.kit.edu") ++RemoteJob = True +NiceUser = True + +universe = docker +docker_image = mschnepf/slc7-condocker + ++RequestWalltime = (60*$(NJOBS)) + (24*3600*NumJobStarts) +request_cpus = 1 +RequestMemory = (40*$(NJOBS)) + (10000*NumJobStarts) +periodic_release = (HoldReasonCode == 34) +RequestDisk = 4000000 + +x509userproxy = $ENV(X509_USER_PROXY) + +# PLATFORM = ETPbatch +queue NJOBS, PLATFORM from arguments.txt diff --git a/tools/htcondor/etpbatch/measure_and_simulate.sh b/tools/htcondor/etpbatch/measure_and_simulate.sh new file mode 100644 index 0000000..50aed44 --- /dev/null +++ b/tools/htcondor/etpbatch/measure_and_simulate.sh @@ -0,0 +1,75 @@ +#! 
/bin/bash + +ulimit -s unlimited +set -e + +echo "INITIAL ENVIRONMENT START" +env +echo "INITIAL ENVIRONMENT END" +echo "" + +echo "CURRENT DIRECTORY CONTENT:" +ls $(pwd) + +echo "SETTING GRID ENVIRONMENT" +source /cvmfs/grid.cern.ch/umd-c7ui-latest/etc/profile.d/setup-c7-ui-example.sh + +# echo "SOURCING CONDA ENVIRONMENT FROM CVMFS" +# source /cvmfs/etp.kit.edu/DCSim/pre_0.2/setup.sh + +echo "GETTING CONDA ENVIRONMENT FROM REMOTE STORAGE" +gfal-copy davs://cmswebdav-kit.gridka.de:2880/pnfs/gridka.de/cms/disk-only/store/user/mhorzela/dcsim-env02.tar.gz dcsim-env.tar.gz +# gfal-copy davs://cmswebdav-kit.gridka.de:2880/pnfs/gridka.de/cms/disk-only/store/user/aakhmets/dcsim-env.tar.gz dcsim-env.tar.gz +echo "EXTRACTING AND SETTING CONDA ENVIRONMENT" +mkdir -p dcsim-env +tar -zxvf dcsim-env.tar.gz -C dcsim-env + +source dcsim-env/bin/activate +conda-unpack + +echo "UPDATING LIBRARIES" + +echo "Old LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" +export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${CONDA_PREFIX}/lib64:${CONDA_PREFIX}/lib32:${LD_LIBRARY_PATH} + +ldd ${CONDA_PREFIX}/bin/dc-sim +# ldd ${CONDA_PREFIX}/bin/sgbatch-sim + +NJOBS="${2}" +NINFILES=20 +AVGINSIZE=$(bc -l <<< "8554379000/20") +SIGMAINSIZE=$(bc -l <<< "10000") +FLOPS=$(bc -l <<< "1.95*1480*1000*1000*1000") +SIGMAFLOPS=10000000 +MEM=2400 +OUTSIZE=$(bc -l <<< "50000000") +SIGMAOUTSIZE=$(bc -l <<< "1000000") +DUPLICATIONS=1 +HITRATE=0.05 +XRDBLOCKSIZE=1000000 +BUFFERSIZE="${3}" + +PLATFORM="${1}" + +# sgbatch-sim --platform ${PLATFORM}.xml \ + dc-sim --platform ${PLATFORM}.xml \ + --njobs ${NJOBS} --ninfiles ${NINFILES} --insize ${AVGINSIZE} --sigma-insize ${SIGMAINSIZE} \ + --flops ${FLOPS} --sigma-flops ${SIGMAFLOPS} --mem ${MEM} \ + --outsize ${OUTSIZE} --sigma-outsize ${SIGMAOUTSIZE} \ + --duplications ${DUPLICATIONS} \ + --hitrate 0.0 \ + --xrd-blocksize ${XRDBLOCKSIZE} \ + --storage-buffer-size ${BUFFERSIZE} \ + --output-file ${PLATFORM}${NJOBS}.csv \ + & TEST_PID=$! + + (while [[ True ]]; \ + do ps -aux | grep " ${TEST_PID} " | grep "dc-sim" \ + >> scaling_dump_${PLATFORM}${NJOBS}jobs.txt; \ + sleep 10; done;)\ + & MONITOR_PID=$! 
+ echo "Simulation process to monitor: $TEST_PID" + echo "Monitoring process: $MONITOR_PID" + + wait $TEST_PID + kill -9 ${MONITOR_PID} diff --git a/tools/htcondor/etpbatch_scaling/arguments.txt b/tools/htcondor/etpbatch_scaling/arguments.txt new file mode 100644 index 0000000..2d500e2 --- /dev/null +++ b/tools/htcondor/etpbatch_scaling/arguments.txt @@ -0,0 +1,3 @@ +100 +500 +1000 diff --git a/tools/htcondor/etpbatch_scaling/measure_resource_consumption.jdl b/tools/htcondor/etpbatch_scaling/measure_resource_consumption.jdl new file mode 100644 index 0000000..3a4330a --- /dev/null +++ b/tools/htcondor/etpbatch_scaling/measure_resource_consumption.jdl @@ -0,0 +1,26 @@ +Universe = docker +docker_image = mschnepf/slc7-condocker + +executable = ./measure_resource_consumption.sh + +request_cpus = 1 +request_memory = 1000+(1000*NumJobStarts) +periodic_release = (HoldReasonCode == 34) +request_disk = 4000000 + ++RemoteJob = True ++RequestWalltime = 7200 + (24*3600*NumJobStarts) + +x509userproxy = $ENV(X509_USER_PROXY) + +arguments = $(NJOBS) +transfer_output_files = scaling_dump_$(NJOBS)jobs.txt +transfer_input_files = ../../../data/platform-files/ETPbatch.xml + +log = log_scaling_$(ClusterId).$(ProcId).log +stdout = stdout_scaling_$(ClusterId).$(ProcId).log +stderr = stderr_scaling_$(ClusterId).$(ProcId).log + +accounting_group=cms.production + +queue NJOBS from arguments.txt diff --git a/tools/htcondor/etpbatch_scaling/measure_resource_consumption.sh b/tools/htcondor/etpbatch_scaling/measure_resource_consumption.sh new file mode 100755 index 0000000..0a691d4 --- /dev/null +++ b/tools/htcondor/etpbatch_scaling/measure_resource_consumption.sh @@ -0,0 +1,54 @@ +#! /bin/bash +ulimit -s unlimited +set -e + +echo "INITIAL ENVIRONMENT START" +env +echo "INITIAL ENVIRONMENT END" +echo "" + +echo "CURRENT DIRECTORY CONTENT:" +ls $(pwd) + +echo "SETTING GRID ENVIRONMENT" +source /cvmfs/grid.cern.ch/umd-c7ui-latest/etc/profile.d/setup-c7-ui-example.sh + +echo "GETTING CONDA ENVIRONMENT FROM REMOTE STORAGE" +gfal-copy davs://cmswebdav-kit.gridka.de:2880/pnfs/gridka.de/cms/disk-only/store/user/aakhmets/dcsim-env.tar.gz dcsim-env.tar.gz + +echo "EXTRACTING AND SETTING CONDA ENVIRONMENT" +mkdir -p dcsim-env +tar -zxvf dcsim-env.tar.gz -C dcsim-env + +source dcsim-env/bin/activate +conda-unpack + +echo "UPDATING LIBRARIES" + +echo "Old LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" +export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${CONDA_PREFIX}/lib64:${CONDA_PREFIX}/lib32:${LD_LIBRARY_PATH} + +ldd ${CONDA_PREFIX}/bin/dc-sim + +NJOBS=${1} +NINFILES=10 +AVGINSIZE=3600000000 + +SCENARIO="ETPbatch" + + dc-sim --platform ETPbatch.xml \ + --njobs ${NJOBS} --ninfiles ${NINFILES} --insize ${AVGINSIZE} \ + --hitrate 0.0 \ + --output-file /dev/null \ + & TEST_PID=$! + + (while [[ True ]]; \ + do ps -aux | grep " ${TEST_PID} " | grep "dc-sim" \ + >> scaling_dump_${NJOBS}jobs.txt; \ + sleep 10; done;)\ + & MONITOR_PID=$! 
+ echo "Simulation process to monitor: $TEST_PID" + echo "Monitoring process: $MONITOR_PID" + + wait $TEST_PID + kill -9 ${MONITOR_PID} diff --git a/tools/htcondor/hitratescan/arguments.txt b/tools/htcondor/hitratescan/arguments.txt new file mode 100644 index 0000000..6504838 --- /dev/null +++ b/tools/htcondor/hitratescan/arguments.txt @@ -0,0 +1,11 @@ +0.0 +0.1 +0.2 +0.3 +0.4 +0.5 +0.6 +0.7 +0.8 +0.9 +1.0 \ No newline at end of file diff --git a/tools/htcondor/hitratescan/hitratescan.jdl b/tools/htcondor/hitratescan/hitratescan.jdl new file mode 100644 index 0000000..4e33e26 --- /dev/null +++ b/tools/htcondor/hitratescan/hitratescan.jdl @@ -0,0 +1,32 @@ +# submit multiple simulations for different initial hitrates and harvest scaling and job information + +executable = ./simulate.sh +arguments = $(PLATFORM) $(HITRATE) + +should_transfer_files = YES +transfer_input_files = ../../../data/platform-files/$(PLATFORM).xml +transfer_output_files = scaling_dump_$(PLATFORM)_h$(HITRATE).txt, $(PLATFORM)_h$(HITRATE).csv +when_to_transfer_output = ON_EXIT + +log = logs/log.$(ClusterId).$(ProcId) +output = logs/out.$(ClusterId).$(ProcId) +error = logs/err.$(ClusterId).$(ProcId) + + +accounting_group=cms.jet +Requirements = TARGET.ProvidesIO && (TARGET.Machine=?="mdm1.ekp.kit.edu") ++RemoteJob = True + +universe = docker +docker_image = mschnepf/slc7-condocker + ++RequestWalltime = 7200 + (24*3600*NumJobStarts) +request_cpus = 1 +RequestMemory = 20000+(5000*NumJobStarts) +periodic_release = (HoldReasonCode == 34) +RequestDisk = 4000000 + +x509userproxy = $ENV(X509_USER_PROXY) + +PLATFORM = ETPbatch_reduced +queue HITRATE from arguments.txt diff --git a/tools/htcondor/hitratescan/simulate.sh b/tools/htcondor/hitratescan/simulate.sh new file mode 100644 index 0000000..a890a86 --- /dev/null +++ b/tools/htcondor/hitratescan/simulate.sh @@ -0,0 +1,72 @@ +#! 
/bin/bash + +ulimit -s unlimited +set -e + +echo "INITIAL ENVIRONMENT START" +env +echo "INITIAL ENVIRONMENT END" +echo "" + +echo "CURRENT DIRECTORY CONTENT:" +ls $(pwd) + +echo "SETTING GRID ENVIRONMENT" +source /cvmfs/grid.cern.ch/umd-c7ui-latest/etc/profile.d/setup-c7-ui-example.sh + +# echo "SOURCING CONDA ENVIRONMENT FROM CVMFS" +# source /cvmfs/etp.kit.edu/DCSim/pre_0.2/setup.sh + +echo "GETTING CONDA ENVIRONMENT FROM REMOTE STORAGE" +gfal-copy davs://cmswebdav-kit.gridka.de:2880/pnfs/gridka.de/cms/disk-only/store/user/mhorzela/dcsim-env.tar.gz dcsim-env.tar.gz + +echo "EXTRACTING AND SETTING CONDA ENVIRONMENT" +mkdir -p dcsim-env +tar -zxvf dcsim-env.tar.gz -C dcsim-env + +source dcsim-env/bin/activate +conda-unpack + +echo "UPDATING LIBRARIES" + +echo "Old LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" +export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${CONDA_PREFIX}/lib64:${CONDA_PREFIX}/lib32:${LD_LIBRARY_PATH} + +ldd ${CONDA_PREFIX}/bin/dc-sim + +echo "START SIMULATION" +NJOBS=315 +NINFILES=20 +AVGINSIZE=$(bc -l <<< "8554379000/20") +SIGMAINSIZE=$(bc -l <<< "10000") +FLOPS=$(bc -l <<< "1.95*1480*1000*1000*1000") +SIGMAFLOPS=10000000 +MEM=2400 +OUTSIZE=$(bc -l <<< "50000000") +SIGMAOUTSIZE=$(bc -l <<< "1000000") +DUPLICATIONS=1 +HITRATE="${2}" +XRDBLOCKSIZE=1000000 + +PLATFORM="${1}" + + dc-sim --platform ${PLATFORM}.xml \ + --njobs ${NJOBS} --ninfiles ${NINFILES} --insize ${AVGINSIZE} --sigma-insize ${SIGMAINSIZE} \ + --flops ${FLOPS} --sigma-flops ${SIGMAFLOPS} --mem ${MEM} \ + --outsize ${OUTSIZE} --sigma-outsize ${SIGMAOUTSIZE} \ + --duplications ${DUPLICATIONS} \ + --hitrate ${HITRATE} \ + --xrd-blocksize ${XRDBLOCKSIZE} \ + --output-file ${PLATFORM}_h${HITRATE}.csv \ + & TEST_PID=$! + + (while [[ True ]]; \ + do ps -aux | grep " ${TEST_PID} " | grep "dc-sim" \ + >> scaling_dump_${PLATFORM}_h${HITRATE}.txt; \ + sleep 10; done;)\ + & MONITOR_PID=$! + echo "Simulation process to monitor: $TEST_PID" + echo "Monitoring process: $MONITOR_PID" + + wait $TEST_PID + kill -9 ${MONITOR_PID} diff --git a/tools/htcondor/hitratescan/simulate_scaled.sh b/tools/htcondor/hitratescan/simulate_scaled.sh new file mode 100644 index 0000000..b79ca50 --- /dev/null +++ b/tools/htcondor/hitratescan/simulate_scaled.sh @@ -0,0 +1,72 @@ +#! 
/bin/bash + +ulimit -s unlimited +set -e + +echo "INITIAL ENVIRONMENT START" +env +echo "INITIAL ENVIRONMENT END" +echo "" + +echo "CURRENT DIRECTORY CONTENT:" +ls $(pwd) + +echo "SETTING GRID ENVIRONMENT" +source /cvmfs/grid.cern.ch/umd-c7ui-latest/etc/profile.d/setup-c7-ui-example.sh + +# echo "SOURCING CONDA ENVIRONMENT FROM CVMFS" +# source /cvmfs/etp.kit.edu/DCSim/pre_0.2/setup.sh + +echo "GETTING CONDA ENVIRONMENT FROM REMOTE STORAGE" +gfal-copy davs://cmswebdav-kit.gridka.de:2880/pnfs/gridka.de/cms/disk-only/store/user/mhorzela/dcsim-env.tar.gz dcsim-env.tar.gz + +echo "EXTRACTING AND SETTING CONDA ENVIRONMENT" +mkdir -p dcsim-env +tar -zxvf dcsim-env.tar.gz -C dcsim-env + +source dcsim-env/bin/activate +conda-unpack + +echo "UPDATING LIBRARIES" + +echo "Old LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" +export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${CONDA_PREFIX}/lib64:${CONDA_PREFIX}/lib32:${LD_LIBRARY_PATH} + +ldd ${CONDA_PREFIX}/bin/dc-sim + +echo "START SIMULATION" +NJOBS=315 +NINFILES=20 +AVGINSIZE=$(bc -l <<< "8554379000/20/1000") +SIGMAINSIZE=$(bc -l <<< "10000/1000") +FLOPS=$(bc -l <<< "1.95*1480*1000*1000*1000") +SIGMAFLOPS=10000000 +MEM=2400 +OUTSIZE=$(bc -l <<< "50000000/1000") +SIGMAOUTSIZE=$(bc -l <<< "1000000/1000") +DUPLICATIONS=1 +HITRATE="${2}" +XRDBLOCKSIZE=1000000 + +PLATFORM="${1}" + + dc-sim --platform ${PLATFORM}.xml \ + --njobs ${NJOBS} --ninfiles ${NINFILES} --insize ${AVGINSIZE} --sigma-insize ${SIGMAINSIZE} \ + --flops ${FLOPS} --sigma-flops ${SIGMAFLOPS} --mem ${MEM} \ + --outsize ${OUTSIZE} --sigma-outsize ${SIGMAOUTSIZE} \ + --duplications ${DUPLICATIONS} \ + --hitrate ${HITRATE} \ + --xrd-blocksize ${XRDBLOCKSIZE} \ + --output-file ${PLATFORM}_h${HITRATE}.csv \ + & TEST_PID=$! + + (while [[ True ]]; \ + do ps -aux | grep " ${TEST_PID} " | grep "dc-sim" \ + >> scaling_dump_${PLATFORM}_h${HITRATE}.txt; \ + sleep 10; done;)\ + & MONITOR_PID=$! + echo "Simulation process to monitor: $TEST_PID" + echo "Monitoring process: $MONITOR_PID" + + wait $TEST_PID + kill -9 ${MONITOR_PID} diff --git a/tools/htcondor/test/test_dcsim-env_remote.jdl b/tools/htcondor/test/test_dcsim-env_remote.jdl new file mode 100644 index 0000000..ac2bb62 --- /dev/null +++ b/tools/htcondor/test/test_dcsim-env_remote.jdl @@ -0,0 +1,25 @@ +Universe = docker +docker_image = mschnepf/slc7-condocker + +executable = ./test_dcsim-env_remote.sh + +request_cpus = 1 +request_memory = 1000+(1000*NumJobStarts) +periodic_release = (HoldReasonCode == 34) +request_disk = 4000000 + ++RemoteJob = True ++RequestWalltime = 3600 + +x509userproxy = $ENV(X509_USER_PROXY) + +transfer_output_files = test.csv +transfer_input_files = ../../../data/platform-files/sgbatch_scaletest.xml + +log = log_test_$(ClusterId).$(ProcId).log +stdout = stdout_test_$(ClusterId).$(ProcId).log +stderr = stderr_test_$(ClusterId).$(ProcId).log + +accounting_group=cms.production + +queue diff --git a/tools/htcondor/test/test_dcsim-env_remote.sh b/tools/htcondor/test/test_dcsim-env_remote.sh new file mode 100755 index 0000000..005cee9 --- /dev/null +++ b/tools/htcondor/test/test_dcsim-env_remote.sh @@ -0,0 +1,35 @@ +#! 
/usr/bin/bash
+ulimit -s unlimited
+set -e
+
+echo "INITIAL ENVIRONMENT START"
+env
+echo "INITIAL ENVIRONMENT END"
+echo ""
+
+echo "CURRENT DIRECTORY CONTENT:"
+ls $(pwd)
+
+echo "SETTING GRID ENVIRONMENT"
+source /cvmfs/grid.cern.ch/umd-c7ui-latest/etc/profile.d/setup-c7-ui-example.sh
+
+echo "GETTING CONDA ENVIRONMENT FROM REMOTE STORAGE"
+gfal-copy davs://cmswebdav-kit.gridka.de:2880/pnfs/gridka.de/cms/disk-only/store/user/aakhmets/dcsim-env.tar.gz dcsim-env.tar.gz
+
+echo "EXTRACTING AND SETTING CONDA ENVIRONMENT"
+mkdir -p dcsim-env
+tar -zxvf dcsim-env.tar.gz -C dcsim-env
+
+source dcsim-env/bin/activate
+conda-unpack
+
+echo "UPDATING LIBRARIES"
+
+echo "Old LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}"
+export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${CONDA_PREFIX}/lib64:${CONDA_PREFIX}/lib32:${LD_LIBRARY_PATH}
+
+ldd ${CONDA_PREFIX}/bin/dc-sim
+
+echo "RUNNING TEST COMMAND:"
+
+/usr/bin/time -v dc-sim -p dc.xml -o test.csv -n 60
diff --git a/tools/platformPerformancePlots.py b/tools/platformPerformancePlots.py
new file mode 100644
index 0000000..2c25692
--- /dev/null
+++ b/tools/platformPerformancePlots.py
@@ -0,0 +1,192 @@
+#! /usr/bin/python3
+
+from platform import platform
+import pandas as pd
+import numpy as np
+from matplotlib import pyplot as plt
+from matplotlib.font_manager import FontProperties
+import os.path
+import argparse
+
+# plt.rcParams["figure.figsize"] = [4., 3.]
+plt.rcParams["figure.autolayout"] = True
+
+def validFileLabelTuple(param: str):
+    """
+    Helper function which transforms the given argument and checks that it has the right structure
+
+    :param param: the argument to check
+
+    :return: tuple[str, str]
+    """
+    # check for valid structure of the argument
+    try:
+        filepath, label = map(str, param.split(','))
+    except ValueError:
+        filepath = param.split(',')
+        if len(filepath) > 1:
+            exit(f"Argument {param} has too many values")
+        elif len(filepath) < 1:
+            exit(f"Argument {param} has too few values")
+        elif len(filepath) == 1:
+            filepath = param
+            label = ""
+    except Exception as e:
+        print(e)
+        raise argparse.ArgumentTypeError(f"Unsupported argument {param}")
+
+    # ensure the right file extension
+    ext = os.path.splitext(filepath)[1]
+    if ext.lower() not in ('.csv',):
+        raise argparse.ArgumentTypeError(f"Invalid file type: {param} must be a csv file")
+
+    # ensure that the file exists
+    if not os.path.exists(filepath):
+        raise argparse.ArgumentTypeError(f"Invalid file: {filepath} does not exist")
+
+    return (str(filepath), label)
+
+
+
+parser = argparse.ArgumentParser(
+    description="Produce plots showing the performance of the simulated system. \
+        It can use several files, or tuples of file and label, one for each simulation run to be compared. \
+        The files containing the simulation dump are CSV files produced by the output method of the simulator.",
+    add_help=True
+)
+parser.add_argument(
+    "--logscale",
+    action="store_true",
+    help="Plot with logarithmic x-axis"
+)
+parser.add_argument(
+    "--title",
+    type=str,
+    default="DCSim",
+    help="Plot title hinting on the simulated scenario"
+)
+parser.add_argument(
+    "--suffix",
+    type=str,
+    help="Optional suffix to add to the file-name of the plot."
+)
+parser.add_argument(
+    "simoutputs",
+    nargs='+',
+    type=validFileLabelTuple,
+    help="CSV files, or tuples of file and label, containing information \
+        about the simulated jobs produced by the simulator."
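+        # Usage sketch (the CSV names and labels below are placeholders for files produced via dc-sim --output-file):
+        #   ./tools/platformPerformancePlots.py run_cache.csv,cached run_remote.csv,remote --title "DCSim" --suffix test
+        # Each positional argument is either "<file>.csv" or "<file>.csv,<label>"; the label is used as the
+        # line offset in the job-event plot and in the file names of the per-run efficiency and walltime plots.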
+)
+
+
+args = parser.parse_args()
+
+title = args.title
+suffix = args.suffix
+file_label_pairs = args.simoutputs
+
+file_label_pairs = dict(
+    file_label_pairs
+)
+
+
+event_fig, event_ax = plt.subplots()
+event_ax.set_xlabel('time / s', loc='right')
+if args.logscale:
+    event_ax.set_xscale('log')
+
+for file, label in file_label_pairs.items():
+    with open(file) as f:
+        df = pd.read_csv(f, sep=r",\s", engine='python')
+
+    starts = event_ax.eventplot(
+        positions=df['job.start'].to_numpy(),
+        orientation='horizontal',
+        lineoffsets=label,
+        linewidths=0.1,
+        linelengths=0.75,
+        colors='black',
+        label="start"
+    )
+    ends = event_ax.eventplot(
+        positions=df['job.end'].to_numpy(),
+        orientation='horizontal',
+        lineoffsets=label,
+        linewidths=0.1,
+        linelengths=0.75,
+        colors='black',
+        label="end"
+    )
+
+
+    machines = df["machine.name"].unique()
+
+
+    efficiency_fig, efficiency_ax = plt.subplots()
+    efficiency_ax.set_xlabel("eff. / %", loc='right')
+    efficiency_ax.set_ylabel("jobs", loc='top')
+    efficiency_ax.set_yscale('log')
+
+    walltime_fig, walltime_ax = plt.subplots()
+    walltime_ax.set_xlabel("walltime / s", loc='right')
+    walltime_ax.set_ylabel("jobs", loc='top')
+    walltime_ax.set_yscale('log')
+
+    machine_efficiencies = {}
+    machine_walltimes = {}
+    for i,machine in enumerate(machines):
+        df_masked = df[df["machine.name"]==machine]
+
+        machine_efficiency = df_masked["job.computetime"]/(df_masked["job.end"]-df_masked["job.start"])*100
+        machine_efficiencies[machine]=machine_efficiency
+
+        machine_walltime = (df_masked["job.end"]-df_masked["job.start"])
+        machine_walltimes[machine]=machine_walltime
+
+    machine_efficiencies_list = sorted(machine_efficiencies.items(), key=lambda x: x[1].size)
+    machine_efficiencies = dict(machine_efficiencies_list)
+    machine_walltimes_list = sorted(machine_walltimes.items(), key=lambda x: x[1].size)
+    machine_walltimes = dict(machine_walltimes_list)
+
+    efficiency_ax.hist(
+        list(machine_efficiencies.values()),
+        bins=100, range=(0.,100.),
+        stacked=True,
+        label=list(machine_efficiencies.keys())
+    )
+
+    walltime_ax.hist(
+        list(machine_walltimes.values()),
+        bins=100,
+        stacked=True,
+        label=list(machine_walltimes.keys())
+    )
+
+
+    efficiency_ax.legend()
+    efficiency_ax.set_title(title+" "+label, loc='left', fontsize=14, fontweight='bold')
+
+    efficiency_fig.savefig(f"efficiency_{label}_{suffix}.png")
+    efficiency_fig.savefig(f"efficiency_{label}_{suffix}.pdf")
+
+
+    walltime_ax.legend()
+    walltime_ax.set_title(title+" "+label, loc='left', fontsize=14, fontweight='bold')
+
+    walltime_fig.savefig(f"walltime_{label}_{suffix}.png")
+    walltime_fig.savefig(f"walltime_{label}_{suffix}.pdf")
+
+
+event_ax.legend(
+    handles = (starts[0], ends[0]),
+    labels = ("start", "end")
+)
+event_ax.set_title(title, loc='left', fontsize=14, fontweight='bold')
+
+event_fig.savefig(f"jobevents_{suffix}.png")
+event_fig.savefig(f"jobevents_{suffix}.pdf")
+
+
+
+plt.show()
+
diff --git a/tools/simScaling.sh b/tools/simScaling.sh
new file mode 100755
index 0000000..f27b12a
--- /dev/null
+++ b/tools/simScaling.sh
@@ -0,0 +1,49 @@
+#! /bin/bash
+
+# script for execution of simulation scenarios to test the runtime and memory scaling of the simulator
+#
+#
+
+NJOBS=60
+NINFILES=10
+AVGINSIZE=3600000000
+SIGMAINSIZE=360000000
+FLOPS=2164428000000
+SIGMAFLOPS=216442800000
+MEM=2000000000
+OUTSIZE=18000000000
+SIGMAOUTSIZE=1800000000
+DUPLICATIONS=1
+HITRATE=0.0
+XRDBLOCKSIZE=1000000000
+
+SCENARIO="ETPbatch"
+
+if [ !
-d "tmp/monitor/$SCENARIO" ]; then + mkdir -p tmp/monitor/$SCENARIO +fi + +for NJOBS in 10 20 50 100 200 500 700 1000 1100 1200 1300 1500 1700 2000 2500 3000 +do + dc-sim --platform data/platform-files/ETPbatch.xml \ + --njobs ${NJOBS} --ninfiles ${NINFILES} --insize ${AVGINSIZE} --sigma-insize ${SIGMAINSIZE} \ + --flops ${FLOPS} --sigma-flops ${SIGMAFLOPS} --mem ${MEM} \ + --outsize ${OUTSIZE} --sigma-outsize ${SIGMAOUTSIZE} \ + --duplications ${DUPLICATIONS} \ + --hitrate 0.0 \ + --xrd-blocksize ${XRDBLOCKSIZE} \ + --output-file /dev/null \ + & TEST_PID=$! + + (while [[ True ]]; \ + do ps -aux | grep " ${TEST_PID} " | grep "dc-sim" \ + >> tmp/monitor/$SCENARIO/test_privatedump_${NJOBS}jobs.txt; \ + sleep 10; done;)\ + & MONITOR_PID=$! + echo "Simulation process to monitor: $TEST_PID" + echo "Monitoring process: $MONITOR_PID" + + wait $TEST_PID + kill -9 ${MONITOR_PID} + +done diff --git a/tools/simScalingPlots.py b/tools/simScalingPlots.py new file mode 100755 index 0000000..02b3740 --- /dev/null +++ b/tools/simScalingPlots.py @@ -0,0 +1,223 @@ +#! /usr/bin/python3 + +import pandas as pd +import numpy as np +import scipy.optimize +from matplotlib import pyplot as plt +import os.path +import glob +import argparse + + +plt.rcParams["figure.figsize"] = [4., 3.] +plt.rcParams["figure.autolayout"] = True + + +def valid_file(param): + base, ext = os.path.splitext(param) + if ext.lower() not in ('.txt', '.dat'): + raise argparse.ArgumentTypeError('File must have a txt or dat extension') + return param + + +scenario_plotlabel_dict = { + 'withdump': "with JSON dump", + 'nodump': "without JSON dump", + 'private': "private improvements", + "hacky": "final hacky-WRENCH", + "wrench2": "WRENCH 2.0", + "SGbatch": "SG-batch", + "ETPbatch": "ETP-batch", + "ETPbatchreduced": "reduced ETP-batch" +} + + +def converttime(df: pd.DataFrame, a: str, b: str): + return df[a].astype(int)*60 + df[b].astype(int) + +def timeexp(x, m, t, b, c): + return m * np.exp(t * x + c) + b + +def timep1(x, m, b): + return m * x + b + +def timep2(x, m, m2, b): + return m * x + b + m2 * x**2 + + +parser = argparse.ArgumentParser( + description="Produce a plot showing the runtime and memory scaling of the simulation. \ + If you intend to use this script, make sure that the monitoring files containing the \ + information about the simulation are in the right format. If you produced these by the \ + `simscaling.sh` script, it should work natively.", + add_help=True +) +parser.add_argument( + "--scenario", + type=str, + choices=scenario_plotlabel_dict.keys(), + required=True, + help="Choose a scenario, which sets the according plotting label and filename of the plot." +) +parser.add_argument( + "monitorfiles", + nargs='+', + help="Files containing monitoring information about the simulation run, produced by `ps -aux`.\ + Each file produces a single point in the plot for memory and runtime respectively." +) +parser.add_argument( + "--extrapolate", + action='store_true', + help="Flag to enable exponential, linear, and quadratic extapolations for runtime & memory." 
+) + + +args = parser.parse_args() + +scenario = args.scenario + + +# Create a data-frame holding all monitoring information +monitorfiles = args.monitorfiles +for mfile in monitorfiles: + mfile = os.path.abspath(mfile) + assert(os.path.exists(mfile)) + +print("Found {} monitorfiles".format(str(len(monitorfiles)))) + +if (all(os.path.exists(f) for f in monitorfiles) and monitorfiles): + df = pd.concat( + [ + pd.read_table( + f, + delimiter="\s+", + usecols=[0,1,2,3,4,5,6,7,8,9,10,14], + names=[ + "USER", "PID", "%CPU", "%MEM", "VSZ", "RSS", "TTY", "STAT", "START", "TIME", "COMMAND", + #"platform option", "Platform file", "njobs option", + "NJobs", #"ninfiles option", "NFilesPerJob", + #"insize option", "InFileSize", "siginfiles option", "SigInFileSize", + #"flops option", "Flops", "sigflops option", "SigFlops", "mem option", "Memory", + #"outsize option", "OutFileSize", "sigoutsize option", "SigOutFileSize", + #"duplications option", "Duplications", "hitrate option", "Hitrate", + #"buffersize option", "BufferSize", "xrdblocksize option", "XrdBlockSize", "output option", "OutputName", + ], + ) + for f in monitorfiles + ], + ignore_index=True + ) + print("Simulation monitoring information: \n", df) +else: + print("Couldn't find any files") + exit(1) + + +# postprocess data frame +df['TIME'] = df['TIME'].str.split(":",expand=True).pipe(converttime, 0, 1) +df['RSS'] = df['RSS']/(1024*1024) +runtimesdf = df.loc[df.groupby("NJobs")["TIME"].idxmax()] +memorydf = df.loc[df.groupby("NJobs")["RSS"].idxmax()] +print("Filtered data:\n", runtimesdf) + + +# Visualize the monitoring information +fig, ax = plt.subplots() +ax.set_title("Simulation scaling " + scenario_plotlabel_dict[scenario]) + +# ax.set_xscale('log') +ax.set_xlabel('$N_{jobs}$', loc='right') +ax.set_ylabel('time / min', color='cornflowerblue') +ax.set_xlim([0,runtimesdf['NJobs'].iloc[-1]*1.05]) +# ax.set_ylim([0,400]) + +ax.plot(runtimesdf['NJobs'], runtimesdf['TIME']/60, linestyle='dotted', color='cornflowerblue') +ax.scatter(runtimesdf['NJobs'], runtimesdf['TIME']/60, color='cornflowerblue', marker='x', label='runtime') +ax.grid(axis="y", linestyle = 'dotted', which='major') + +secax = ax.twinx() +secax.plot(memorydf['NJobs'], memorydf['RSS'],linestyle='dotted', color='orange') +secax.scatter(memorydf['NJobs'], memorydf['RSS'], color='orange', marker='^', label='memory') +# secax.xaxis.set_minor_locator(AutoMinorLocator()) +secax.set_ylabel('memory / GiB', color='orange') +# secax.set_ylim([0,12]) + +h1, l1 = ax.get_legend_handles_labels() +h2, l2 = secax.get_legend_handles_labels() +ax.legend(h1+h2, l1+l2, loc=2) + +njobs = np.linspace(runtimesdf['NJobs'].iloc[0], runtimesdf['NJobs'].iloc[-1], 1000) + + +if args.extrapolate: + print("RUNTIME EXTRAPOLATIONS for 100k jobs:") + start_params = (0.0, 0.0, 0.0, 0.0) + params, cv = scipy.optimize.curve_fit(timeexp, runtimesdf['NJobs'], runtimesdf['TIME']/60, start_params) + m, t, b, c = params + if np.all(np.isfinite(cv)): + ax.plot(njobs, timeexp(njobs, m, t, b, c), linestyle='-', color='blue', label='runtime extrapolation') + print(f"\tExponential: {timeexp(100000, m, t, b, c)} min = {timeexp(100000, m, t, b, c)/60.} h = {timeexp(100000, m, t, b, c)/60./24.} d") + print(f"\tfunction: m * exp(t * x + c) + b; m, t, b, c = {params}") + else: + print(f"\tExponential fit failed. 
Please check initial parameters.") + print("") + + start_params = (0.01, 0.01) + params, cv = scipy.optimize.curve_fit(timep1, runtimesdf['NJobs'], runtimesdf['TIME']/60, start_params) + m, b = params + if np.all(np.isfinite(cv)): + ax.plot(njobs, timep1(njobs, m, b), linestyle='-', color='green', label='runtime extrapolation') + print(f"\tLinear: {timep1(100000, m, b)} min = {timep1(100000, m, b)/60.} h = {timep1(100000, m, b)/60./24.} d") + print(f"\tfunction: m * x + b; m, b = {params}") + else: + print(f"\tLinear fit failed. Please check initial parameters.") + print("") + + start_params = (0.01, 0.01, 0.01) + params, cv = scipy.optimize.curve_fit(timep2, runtimesdf['NJobs'], runtimesdf['TIME']/60, start_params) + m, m2, b = params + if np.all(np.isfinite(cv)): + ax.plot(njobs, timep2(njobs, m, m2, b), linestyle='-', color='red', label='runtime extrapolation') + print(f"\tQuadratic: {timep2(100000, m, m2, b)} min = {timep2(100000, m, m2, b)/60.} h = {timep2(100000, m, m2, b)/60./24.} d") + print(f"\tfunction: m * x + m2 * x**2 + b; m, m2, b = {params}") + else: + print(f"\tQuadratic fit failed. Please check initial parameters.") + print("") + + print("MEMORY EXTRAPOLATION for 100k jobs:") + start_params = (0.0, 0.0, 0.0, 0.0) + params, cv = scipy.optimize.curve_fit(timeexp, memorydf['NJobs'], memorydf['RSS'], start_params) + m, t, b, c = params + if np.all(np.isfinite(cv)): + ax.plot(njobs, timeexp(njobs, m, t, b, c), linestyle='--', color='blue', label='runtime extrapolation') + print(f"\tExponential: {timeexp(100000, m, t, b, c)} GB") + print(f"\tfunction: m * exp(t * x + c) + b; m, t, b, c = {params}") + else: + print(f"\tExponential fit failed. Please check initial parameters.") + print("") + start_params = (0.0, 0.0) + params, cv = scipy.optimize.curve_fit(timep1, memorydf['NJobs'], memorydf['RSS'], start_params) + m, b = params + if np.all(np.isfinite(cv)): + ax.plot(njobs, timep1(njobs, m, b), linestyle='--', color='green', label='runtime extrapolation') + print(f"\tLinear: {timep1(100000, m, b)} GB") + print(f"\tfunction: m * x + b; m, b = {params}") + else: + print(f"\tLinear fit failed. Please check initial parameters.") + print("") + + start_params = (0.0, 0.0, 0.0) + params, cv = scipy.optimize.curve_fit(timep2, memorydf['NJobs'], memorydf['RSS'], start_params) + m, m2, b = params + if np.all(np.isfinite(cv)): + ax.plot(njobs, timep2(njobs, m, m2, b), linestyle='--', color='red', label='runtime extrapolation') + print(f"\tQuadratic: {timep2(100000, m, m2, b)} GB") + print(f"\tfunction: m * x + m2 * x**2 + b; m, m2, b = {params}") + else: + print(f"\tQuadratic fit failed. Please check initial parameters.") + print("") + +fig.savefig("scalingtest_"+ scenario +".pdf") +fig.savefig("scalingtest_"+ scenario +".png") + +# plt.show() diff --git a/tools/simscaling.py b/tools/simscaling.py deleted file mode 100644 index 1a4e29a..0000000 --- a/tools/simscaling.py +++ /dev/null @@ -1,162 +0,0 @@ -#! /usr/bin/python3 - -import pandas as pd -import numpy as np -import scipy.optimize -from matplotlib import pyplot as plt -import os.path -import glob -import argparse - - -plt.rcParams["figure.figsize"] = [4., 3.] 
-plt.rcParams["figure.autolayout"] = True - - -def valid_file(param): - base, ext = os.path.splitext(param) - if ext.lower() not in ('.txt', '.dat'): - raise argparse.ArgumentTypeError('File must have a txt or dat extension') - return param - - -scenario_plotlabel_dict = { - 'withdump': "with JSON dump", - 'nodump': "without JSON dump", - 'private': "private improvements", - "hacky": "final hacky-WRENCH", - "wrench2": "WRENCH 2.0" -} - - -def converttime(df: pd.DataFrame, a: str, b: str): - return df[a].astype(int)*60 + df[b].astype(int) - -def timeexp(x, m, t, b, c): - return m * np.exp(- t * x + c) + b - -def timep1(x, m, b): - return m * x + b - -def timep2(x, m, m2, b): - return m * x + b + m2 * x**2 - - -parser = argparse.ArgumentParser( - description="Produce a plot showing the runtime and memory scaling of the simulation. \ - If you intend to use this script, make sure that the monitoring files containing the \ - information about the simulation are in the right format. If you produced these by the \ - `simscaling.sh` script, it should work natively.", - add_help=True -) -parser.add_argument( - "--scenario", - type=str, - choices=("withdump", "nodump", "private", "hacky", "wrench2"), - required=True, - help="Choose a scenario, which sets the according plotting label and filename of the plot." -) -parser.add_argument( - "monitorfiles", - nargs='+', - help="Files containing monitoring information about the simulation run, produced by `ps -aux`.\ - Each file produces a single point in the plot for memory and runtime respectively." -) - - -args = parser.parse_args() - -scenario = args.scenario - - -# Create a data-frame holding all monitoring information -monitorfiles = args.monitorfiles -for mfile in monitorfiles: - mfile = os.path.abspath(mfile) - assert(os.path.exists(mfile)) - -print("Found {} monitorfiles".format(str(len(monitorfiles)))) - -if (all(os.path.exists(f) for f in monitorfiles) and monitorfiles): - df = pd.concat( - [ - pd.read_table( - f, - delimiter="\s+", - names=[ - "USER", "PID", "%CPU", "%MEM", "VSZ", "RSS", "TTY", "STAT", "START", "TIME", "COMMAND", - "platform option", "Platform file", "njobs option", "NJobs", "ninfiles option", "NFilesPerJob", - "insize option", "FileSize", "hitrate option", "Hitrate", "output option", "OutputName", - "blockstreaming option" - ], - ) - for f in monitorfiles - ], - ignore_index=True - ) - print("Simulation monitoring information: \n", df) -else: - print("Couldn't find any files") - exit(1) - - -# postprocess data frame -df['TIME'] = df['TIME'].str.split(":",expand=True).pipe(converttime, 0, 1) -df['RSS'] = df['RSS']/(1024*1024) -runtimesdf = df.loc[df.groupby("NJobs")["TIME"].idxmax()] -memorydf = df.loc[df.groupby("NJobs")["RSS"].idxmax()] -print("Filtered data:\n", runtimesdf) - - -# Visualize the monitoring information -fig, ax = plt.subplots() -ax.set_title("Simulation scaling " + scenario_plotlabel_dict[scenario]) - -# ax.set_xscale('log') -ax.set_xlabel('$N_{jobs}$', loc='right') -ax.set_ylabel('time / min', color='cornflowerblue') -ax.set_xlim([0,runtimesdf['NJobs'].iloc[-1]*1.05]) -# ax.set_ylim([0,400]) - -ax.plot(runtimesdf['NJobs'], runtimesdf['TIME']/60, linestyle='dotted', color='cornflowerblue') -ax.scatter(runtimesdf['NJobs'], runtimesdf['TIME']/60, color='cornflowerblue', marker='x', label='runtime') -ax.grid(axis="y", linestyle = 'dotted', which='major') - -secax = ax.twinx() -secax.plot(memorydf['NJobs'], memorydf['RSS'],linestyle='dotted', color='orange') -secax.scatter(memorydf['NJobs'], memorydf['RSS'], 
color='orange', marker='^', label='memory') -# secax.xaxis.set_minor_locator(AutoMinorLocator()) -secax.set_ylabel('memory / GiB', color='orange') -# secax.set_ylim([0,12]) - -h1, l1 = ax.get_legend_handles_labels() -h2, l2 = secax.get_legend_handles_labels() -ax.legend(h1+h2, l1+l2, loc=2) - -njobs = np.linspace(runtimesdf['NJobs'].iloc[0], runtimesdf['NJobs'].iloc[-1], 1000) - -#start_params = (1., 1., 1., 1.) -#params, cv = scipy.optimize.curve_fit(timeexp, runtimesdf['NJobs'], runtimesdf['TIME']/60, start_params) -#m, t, b, c = params -#print(params) -#ax.plot(njobs, timeexp(njobs, m, t, b, c), linestyle='-', color='black', label='runtime extrapolation') -#print(f"100k extrapolation exp: {timeexp(100000, m, t, b, c)} min") - -#start_params = (0.01, 0.01) -#params, cv = scipy.optimize.curve_fit(timep1, runtimesdf['NJobs'], runtimesdf['TIME']/60, start_params) -#m, b = params -#print(params) -#ax.plot(njobs, timep1(njobs, m, b), linestyle='-', color='black', label='runtime extrapolation') -#print(f"100k extrapolation linear: {timep1(100000, m, b)} min") - -start_params = (0.01, 0.01, 0.01) -params, cv = scipy.optimize.curve_fit(timep2, runtimesdf['NJobs'], runtimesdf['TIME']/60, start_params) -m, m2, b = params -print(params) -#ax.plot(njobs, timep2(njobs, m, m2, b), linestyle='-', color='black', label='runtime extrapolation') -print(f"100k extrapolation a_1 * x + a_2 * x**2 + b: {timep2(100000, m, m2, b)} min = {timep2(100000, m, m2, b)/60.} h = {timep2(100000, m, m2, b)/60./24.} d") - - -fig.savefig("scalingtest_"+ scenario +".pdf") - -# plt.show() diff --git a/tools/simscaling.sh b/tools/simscaling.sh deleted file mode 100644 index b7fa72a..0000000 --- a/tools/simscaling.sh +++ /dev/null @@ -1,36 +0,0 @@ -#! /bin/bash - -# script for execution of simulation scenarios to test the runtime and memory scaling of the simulator -# -# - -NJOBS=60 -NINFILES=10 -AVGINSIZE=3600000000 - -SCENARIO="test" - -if [ ! -d "tmp/monitor/$SCENARIO" ]; then - mkdir -p tmp/monitor/$SCENARIO -fi - -for NJOBS in 10 20 50 100 200 500 1000 1200 1500 2000 2500 3000 -do - sgbatch-sim --platform data/platform-files/host_scaletest.xml \ - --njobs ${NJOBS} --ninfiles ${NINFILES} --insize ${AVGINSIZE} \ - --hitrate 0.0 \ - --output-file /dev/null \ - --no-streaming \ - & TEST_PID=$! - echo $TEST_PID - - (while [[ True ]]; \ - do ps -aux | grep " ${TEST_PID} " | grep "sgbatch-sim" \ - >> tmp/monitor/$SCENARIO/test_privatedump_${NJOBS}jobs.txt; \ - sleep 10; done;)\ - & MONITOR_PID=$! - echo $TEST_PID $MONITOR_PID - - wait $TEST_PID - kill -9 ${MONITOR_PID} -done diff --git a/tools/trapezoidplot.py b/tools/trapezoidPlot.py similarity index 100% rename from tools/trapezoidplot.py rename to tools/trapezoidPlot.py