
Commit

Implemented pre- and post-overhead for StandardJobs at the WorkUnit level (#220)
Minor fixes/cleanup
HTCondor Example finalization
henricasanova committed Aug 16, 2021
1 parent 7eeee54 commit c5b5706
Showing 17 changed files with 231 additions and 117 deletions.
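
The user-visible effect of this commit is that pre- and post-execution overheads are now configured on the HTCondor compute service (separately for grid-universe and non-grid-universe jobs) rather than on the batch compute service. A minimal sketch of what this looks like in simulator code, with property names taken from the headers changed below and purely illustrative values and host names:

    // Sketch: enabling the new overheads when creating the HTCondor service
    auto htcondor_cs = simulation->add(
        new wrench::HTCondorComputeService(
            "HTCondorHost",   // placeholder host name
            {batch_cs},       // compute service(s) HTCondor can dispatch grid-universe jobs to
            {{wrench::HTCondorComputeServiceProperty::NEGOTIATOR_OVERHEAD,           "1.0"},
             {wrench::HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY,      "10.0"},
             {wrench::HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY,     "10.0"},
             {wrench::HTCondorComputeServiceProperty::NON_GRID_PRE_EXECUTION_DELAY,  "5.0"},
             {wrench::HTCondorComputeServiceProperty::NON_GRID_POST_EXECUTION_DELAY, "5.0"}},
            {}));             // message payloads left at their defaults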
68 changes: 31 additions & 37 deletions examples/condor-grid-example/CondorGridSimulator.cpp
@@ -9,107 +9,101 @@

#include <iostream>
#include <wrench.h>
#include <pugixml.hpp>

#include "CondorWMS.h" // WMS implementation
#include "CondorTimestamp.h"
#include "wrench/tools/pegasus/PegasusWorkflowParser.h"


/**
* ./wrench-example-condor-grid-universe [disk-speed in MBps] [bandwidth in MBps, storage service to batch service] ...
* [Override Pre_execution overhead time in seconds] ...
* [Override Post_execution overhead time in seconds]
* @return
*/
** This simulator simulates the execution of 10 independent tasks on HTCondor,
** where HTCondor has access to a batch compute service and a bare-metal compute
** service that runs on a VM.
**
** Example invocation of the simulator with useful WMS logging:
** ./wrench-example-condor-grid-universe condor-grid-hosts.xml --log=custom_wms.threshold=info
**/

int main(int argc, char **argv) {

// Create and initialize a simulation
auto *simulation = new wrench::Simulation();

/* Parse WRENCH-specific and SimGrid-specific command-line arguments */
// Parse WRENCH-specific and SimGrid-specific command-line arguments
simulation->init(&argc, argv);

/* Parse simulator-specific command-line arguments */
// Parse simulator-specific command-line arguments
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <XML platform file>\n";
exit(1);
}

/* Initialize the platform with the XML file */
// Initialize the platform with the XML file
simulation->instantiatePlatform(argv[1]);

/* Create a "workflow" of 10 independent 5-core tasks, each with some input file and an output file */
// Create a "workflow" of independent 5-core tasks, each with some input file and an output file
long num_tasks = 10;
auto workflow = new wrench::Workflow();
for (int i=0; i < num_tasks; i++) {
auto task = workflow->addTask("task_" + std::to_string(i), 1000.0 * 1000.0 * 1000.0 * 1000.0, 5, 5, 0);
auto input = workflow->addFile("task_" + std::to_string(i) + ".in", 0.0*1000*1000);
auto output = workflow->addFile("task_" + std::to_string(i) + ".out", 0.0*1000*1000);
auto input = workflow->addFile("task_" + std::to_string(i) + ".in", 100.0*1000*1000);
auto output = workflow->addFile("task_" + std::to_string(i) + ".out", 100.0*1000*1000);
task->addInputFile(input);
task->addOutputFile(output);
}

/* Create a storage service on the WMS host, that will host all data */
// Create a storage service on the WMS host, that will host all data
auto local_ss = simulation->add(new wrench::SimpleStorageService("WMSHost", {"/"}));

/* Create a batch service */
// Create a 4-node batch service
auto batch_cs = simulation->add(new wrench::BatchComputeService(
"BatchHeadNode",
{"BatchNode1", "BatchNode2", "BatchNode3", "BatchNode4"},
"/scratch_batch",
{
{wrench::BatchComputeServiceProperty::GRID_PRE_EXECUTION_DELAY, "10.0"},
{wrench::BatchComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "5.0"},
},
{},
{}));

/* Create a cloud service */
// Create a 2-node cloud service
auto cloud_cs = simulation->add(new wrench::CloudComputeService(
"CloudHeadNode",
{"CloudNode1", "CloudNode2"},
"/scratch_cloud"));

/* Create a HTCondor service that has access to the BatchComputeService (the WMS
* will create VMs on the CloudCompute Service, which will expose BareMetalComputeService instances
* that will be added to the HTCondor service. Set the HTCondor overhead to 1 second*/
// Create a HTCondor service that has access to the BatchComputeService (the WMS
// will create VMs on the cloud compute service, which will expose BareMetalComputeService instances
// that will be added to the HTCondor service). Below we set all possible overhead values (see documentation).
auto htcondor_cs = simulation->add(
new wrench::HTCondorComputeService(
"BatchHeadNode",
{batch_cs},
{
{wrench::HTCondorComputeServiceProperty::NEGOTIATOR_OVERHEAD, "0.0"}
{wrench::HTCondorComputeServiceProperty::NEGOTIATOR_OVERHEAD, "1.0"},
{wrench::HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY, "10.0"},
{wrench::HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "10.0"},
{wrench::HTCondorComputeServiceProperty::NON_GRID_PRE_EXECUTION_DELAY, "5.0"},
{wrench::HTCondorComputeServiceProperty::NON_GRID_POST_EXECUTION_DELAY, "5.0"}
},
{}));

/* Set the default local storage service */
// Set the default local storage service
std::dynamic_pointer_cast<wrench::HTCondorComputeService>(htcondor_cs)->setLocalStorageService(local_ss);

// Create a WMS
auto wms = simulation->add(
new wrench::CondorWMS({htcondor_cs, batch_cs, cloud_cs}, {local_ss}, "HTCondorHost"));

// Add the workflow to the WMS
wms->addWorkflow(workflow);

// Create a file registry
simulation->add(new wrench::FileRegistryService("WMSHost"));

// Staging the input_file on the storage service
// Stage the input files on the storage service
for (auto const &f : workflow->getInputFiles()) {
simulation->stageFile(f, local_ss);
}

// Running the simulation
// Run the simulation
simulation->launch();

/* Printing task execution information directly from WorkflowTask objects -- other
* examples showcase how to use simulation->getOutput().getTrace<T>() */

for (const auto &t : workflow->getTasks()) {
std::cout << "Task " + t->getID() << " ";
std::cout << "started at time " << t->getStartDate() << " on ";
std::cout << "host " << t->getPhysicalExecutionHost() << " and finished at time ";
std::cout << t->getEndDate() << "\n";
}

return 0;
}
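
Note that the removed usage comment above mentioned command-line overrides for the pre- and post-execution overhead times; the finalized example takes only the platform file. If one wanted such overrides back, a rough sketch is below (the argument positions and names are hypothetical and not part of the shipped example, and the argc check earlier in main() would need to be relaxed accordingly):

    // Hypothetical: optional command-line overrides for the grid-universe overheads
    std::string grid_pre_delay  = "10.0";
    std::string grid_post_delay = "10.0";
    if (argc == 4) {   // e.g., ./wrench-example-condor-grid-universe platform.xml 20 10
        grid_pre_delay  = argv[2];
        grid_post_delay = argv[3];
    }
    // ... then pass these strings as the values of
    // wrench::HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY and
    // wrench::HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY when creating htcondor_cs.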
73 changes: 42 additions & 31 deletions examples/condor-grid-example/CondorWMS.cpp
@@ -14,6 +14,12 @@

WRENCH_LOG_CATEGORY(custom_wms, "Log category for Custom WMS");

/**
** This WMS submits n tasks for execution to HTCondor. The first ~n/2 are submitted together
** as a single grid-universe job. The remaining tasks are submitted as individual
** non-grid-universe jobs.
**/


namespace wrench {

@@ -29,43 +35,40 @@ namespace wrench {
"condor-grid"){}

/**
* @brief main method of the CondorWMS daemon
*
* @return 0 on completion
*
* @throw std::runtime_error
* Main method of the WMS
*/
int CondorWMS::main() {

/* Set the logging output to GREEN */
// Set the logging output to GREEN
TerminalOutput::setThisProcessLoggingColor(TerminalOutput::COLOR_GREEN);

/* Create a data movement manager */
// Create a data movement manager
auto data_movement_manager = this->createDataMovementManager();

/* Create a job manager */
// Create a job manager
auto job_manager = this->createJobManager();

/* Get reference to the storage service */
// Get reference to the storage service
auto ss = *(this->getAvailableStorageServices().begin());

/* Get references to all compute services */
// Get references to all compute services (note that all jobs will be submitted to the htcondor_cs)
auto htcondor_cs = *(this->getAvailableComputeServices<wrench::HTCondorComputeService>().begin());
auto batch_cs = *(this->getAvailableComputeServices<wrench::BatchComputeService>().begin());
auto cloud_cs = *(this->getAvailableComputeServices<wrench::CloudComputeService>().begin());

/* Create and start a 5-core VM with 32GB of RAM on the Cloud compute service */
// Create and start a 5-core VM with 32GB of RAM on the Cloud compute service
WRENCH_INFO("Creating a 5-core VM instance on the cloud service");
cloud_cs->createVM(5, 32.0*1000*1000*1000, "my_vm", {}, {});
WRENCH_INFO("Starting the VM instance, which exposes a usable bare-metal compute service");
auto vm_cs = cloud_cs->startVM("my_vm");

/* Add the VM's BareMetalComputeService to the HTCondor compute service */
WRENCH_INFO("Adding the VM instance to HTCondor");
WRENCH_INFO("Adding the VM's bare-metal compute service to HTCondor");
htcondor_cs->addComputeService(vm_cs);

/* At this point, HTCondor has access to: .... */
WRENCH_INFO("At this point, HTCondor has access to one batch compute service and one bare-metal service (which runs on a VM)");

/* Create a map of files, which are all supposed to be on the local SS */
// Create a map of files, which are all supposed to be on the local SS
std::map<WorkflowFile *, std::shared_ptr<FileLocation>> file_locations;
for (auto const &t : this->getWorkflow()->getTasks()) {
for (auto const &f : t->getInputFiles()) {
@@ -76,37 +79,43 @@
}
}

/* Split the 10 tasks into two groups of 5 tasks */
std::vector<wrench::WorkflowTask *> first_five_tasks;
std::vector<wrench::WorkflowTask *> last_five_tasks;
// Split the tasks into two groups
std::vector<wrench::WorkflowTask *> first_tasks;
std::vector<wrench::WorkflowTask *> last_tasks;
int task_count = 0;
unsigned long num_tasks = this->getWorkflow()->getTasks().size();
for (auto const &t : this->getWorkflow()->getTasks()) {
if (task_count < 5) {
first_five_tasks.push_back(t);
if (task_count < num_tasks / 2) {
first_tasks.push_back(t);
} else {
last_five_tasks.push_back(t);
last_tasks.push_back(t);
}
task_count++;
}

/* Submit the first 5 tasks as part of a single "grid universe" job to HTCondor */
auto grid_universe_job = job_manager->createStandardJob(first_five_tasks, file_locations);
// Submit the first tasks as part of a single "grid universe" job to HTCondor
WRENCH_INFO("Creating a standard job with the first %ld tasks", first_tasks.size());
auto grid_universe_job = job_manager->createStandardJob(first_tasks, file_locations);
WRENCH_INFO("Submitting the job as a grid-universe job to HTCondor, asking for 3 compute nodes");
std::map<std::string, std::string> htcondor_service_specific_arguments;
htcondor_service_specific_arguments["universe"] = "grid";
htcondor_service_specific_arguments["-N"] = "3";
htcondor_service_specific_arguments["-c"] = "5";
htcondor_service_specific_arguments["-t"] = "3600";
// The argument below is not required, as there is a single batch service in this example
htcondor_service_specific_arguments["-service"] = batch_cs->getName();
WRENCH_INFO("Submitting the first 5 tasks as a single grid-universe job to HTCondor (will run on the batch service)");
job_manager->submitJob(grid_universe_job, htcondor_cs, htcondor_service_specific_arguments);
WRENCH_INFO("Job submitted!");

/* Submit the next 5 tasks as individual non "grid universe" jobs to HTCondor */
for (auto const &t : last_five_tasks) {
WRENCH_INFO("Submitting a task as a single non-grid-universe job to HTCondor (will run on the VM)");
auto job = job_manager->createStandardJob(t, file_locations);
/* Submit the last tasks as individual non "grid universe" jobs to HTCondor */
for (auto const &task : last_tasks) {
WRENCH_INFO("Creating and submitting a single-task job (for task %s) as a non-grid-universe job to HTCondor (will run on the VM)",
task->getID().c_str());
auto job = job_manager->createStandardJob(task, file_locations);
job_manager->submitJob(job, htcondor_cs);
}

WRENCH_INFO("Waiting for Workflow Execution Events until the workflow execution is finished...");
/* Wait for all execution events */
while (not this->getWorkflow()->isDone()) {
this->waitForAndProcessNextEvent();
@@ -124,12 +133,14 @@
void CondorWMS::processEventStandardJobCompletion(std::shared_ptr<StandardJobCompletedEvent> event) {
/* Retrieve the job that this event is for */
auto job = event->standard_job;
/* Retrieve the job's tasks */
WRENCH_INFO("Notified that a standard job has completed: ");
for (auto const &task : job->getTasks()) {
WRENCH_INFO("Notified that a standard job has completed task %s",
task->getID().c_str())
WRENCH_INFO(" - Task %s ran on host %s (started at time %.2lf and finished at time %.2lf)",
task->getID().c_str(),
task->getPhysicalExecutionHost().c_str(),
task->getStartDate(),
task->getEndDate());
}
simulation->getOutput().addTimestamp<CondorGridEndTimestamp>(new CondorGridEndTimestamp);
}


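
The WMS above only overrides the handler for standard-job completion events. A WMS that also wants to react to failed jobs could override the corresponding failure handler; the sketch below follows the same pattern, with handler and event names mirroring other WRENCH examples, so treat the exact signatures as an assumption. (The override would also need to be declared in CondorWMS.h.)

    // Sketch: a matching failure handler for the same WMS
    void CondorWMS::processEventStandardJobFailure(std::shared_ptr<StandardJobFailedEvent> event) {
        auto job = event->standard_job;
        WRENCH_INFO("Notified that a standard job has failed (%s)",
                    event->failure_cause->toString().c_str());
        for (auto const &task : job->getTasks()) {
            WRENCH_INFO(" - Task %s did not complete", task->getID().c_str());
        }
    }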
2 changes: 0 additions & 2 deletions include/wrench/services/compute/batch/BatchComputeService.h
@@ -59,8 +59,6 @@ namespace wrench {
{BatchComputeServiceProperty::TASK_STARTUP_OVERHEAD, "0"},
{BatchComputeServiceProperty::HOST_SELECTION_ALGORITHM, "FIRSTFIT"},
{BatchComputeServiceProperty::TASK_SELECTION_ALGORITHM, "maximum_flops"},
{BatchComputeServiceProperty::GRID_PRE_EXECUTION_DELAY, "78.0"},
{BatchComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "16.0"},
#ifdef ENABLE_BATSCHED
{BatchComputeServiceProperty::BATCH_SCHEDULING_ALGORITHM, "conservative_bf"},
// {BatchComputeServiceProperty::BATCH_SCHEDULING_ALGORITHM, "easy_bf"},
22 changes: 11 additions & 11 deletions include/wrench/services/compute/batch/BatchComputeServiceProperty.h
@@ -159,17 +159,17 @@
*/
DECLARE_PROPERTY_NAME(BATSCHED_CONTIGUOUS_ALLOCATION);

/** @brief Overhead delay in seconds between condor and slurm for the start of execution
* - defaults to calibrated figure
* - property is set on first receiving grid universe job.
*/
DECLARE_PROPERTY_NAME(GRID_PRE_EXECUTION_DELAY);

/** @brief Overhead delay in seconds between condor and slurm for the completion of execution
* - defaults to calibrated figure
* - property is set on first receiving grid universe job.
*/
DECLARE_PROPERTY_NAME(GRID_POST_EXECUTION_DELAY);
// /** @brief Overhead delay in seconds between condor and slurm for the start of execution
// * - defaults to calibrated figure
// * - property is set on first receiving grid universe job.
// */
// DECLARE_PROPERTY_NAME(GRID_PRE_EXECUTION_DELAY);
//
// /** @brief Overhead delay in seconds between condor and slurm for the completion of execution
// * - defaults to calibrated figure
// * - property is set on first receiving grid universe job.
// */
// DECLARE_PROPERTY_NAME(GRID_POST_EXECUTION_DELAY);



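
In other words, the grid pre/post execution delays no longer belong to the batch compute service; the equivalent knobs are now the HTCondorComputeServiceProperty values introduced in this commit. A sketch of the migration for a simulator that used the old properties (delay values are illustrative):

    // Old (removed in this commit): delays configured on the batch service
    //   {wrench::BatchComputeServiceProperty::GRID_PRE_EXECUTION_DELAY,  "10.0"},
    //   {wrench::BatchComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "5.0"},
    // New: configure the delays on the HTCondor service instead
    {wrench::HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY,  "10.0"},
    {wrench::HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "5.0"},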
@@ -27,13 +27,13 @@ namespace wrench {
std::map<std::string, double> default_messagepayload_values = {
{HTCondorCentralManagerServiceMessagePayload::STOP_DAEMON_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::DAEMON_STOPPED_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD, 256000000},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD, 256000000},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD, 256000000},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD, 256000000},
{HTCondorCentralManagerServiceMessagePayload::RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD, 196000000},
{HTCondorCentralManagerServiceMessagePayload::RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD, 196000000},
{HTCondorCentralManagerServiceMessagePayload::STANDARD_JOB_DONE_MESSAGE_PAYLOAD, 512000000},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::STANDARD_JOB_DONE_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::PILOT_JOB_STARTED_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD, 1024}
};
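
These control-message payloads were previously simulated as hundreds of megabytes, which adds noticeable simulated network time to every submission: for instance, a 256000000-byte message over a 100 MBps link takes roughly 2.5 simulated seconds, whereas the new 1024-byte default costs only on the order of 10 microseconds plus latency. (The 100 MBps link speed here is only an illustrative figure, not something defined by this commit.)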
@@ -28,6 +28,10 @@ namespace wrench {
private:
std::map<std::string, std::string> default_property_values = {
{HTCondorComputeServiceProperty::NEGOTIATOR_OVERHEAD, "0.0"},
{HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY, "0.0"},
{HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "0.0"},
{HTCondorComputeServiceProperty::NON_GRID_PRE_EXECUTION_DELAY, "0.0"},
{HTCondorComputeServiceProperty::NON_GRID_POST_EXECUTION_DELAY, "0.0"},
};

std::map<std::string, double> default_messagepayload_values = {
@@ -21,9 +21,26 @@

public:

/** @brief Overhead, in seconds, of the HTCondor Negotiator **/
/** @brief Overhead, in seconds, of the HTCondor Negotiator, which is invoked each time a new job is submitted or
* a running job completes and there are still pending jobs **/
DECLARE_PROPERTY_NAME(NEGOTIATOR_OVERHEAD);

/** @brief Overhead (in seconds) between HTCondor and a batch compute service for the start of execution of grid-universe jobs
*/
DECLARE_PROPERTY_NAME(GRID_PRE_EXECUTION_DELAY);

/** @brief Overhead (in seconds) between HTCondor and a batch compute service for the completion of execution of grid-universe jobs
*/
DECLARE_PROPERTY_NAME(GRID_POST_EXECUTION_DELAY);

/** @brief Overhead (in seconds) between HTCondor and a bare-metal compute service for the start of execution of non-grid-universe jobs
*/
DECLARE_PROPERTY_NAME(NON_GRID_PRE_EXECUTION_DELAY);

/** @brief Overhead (in seconds) between HTCondor and a bare-metal compute service for the completion of execution of non-grid-universe jobs
*/
DECLARE_PROPERTY_NAME(NON_GRID_POST_EXECUTION_DELAY);

};
}

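
For completeness, a WRENCH service implementation would typically read such properties with Service::getPropertyValueAsDouble() and then sleep for the corresponding amount of simulated time around job execution. The sketch below shows that general pattern; it is an assumption about how the HTCondor service consumes these values, not code from this commit:

    // Sketch (assumption): reading the new properties inside a service implementation
    double grid_pre_delay = this->getPropertyValueAsDouble(
            HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY);
    double grid_post_delay = this->getPropertyValueAsDouble(
            HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY);
    wrench::Simulation::sleep(grid_pre_delay);   // before dispatching a grid-universe job
    // ... dispatch the job and wait for it to complete ...
    wrench::Simulation::sleep(grid_post_delay);  // after the job completes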
