
Commit

Implemented pre- and post-overhead for StandardJobs at the WorkUnit level (#220)
Minor fixes/cleanup
HTCondor Example finalization
henricasanova committed Aug 16, 2021
1 parent 7eeee54 commit c5b5706
Showing 17 changed files with 231 additions and 117 deletions.
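
The user-visible effect of this commit is that pre- and post-execution overheads are now configured on the HTCondor compute service (separately for grid-universe and non-grid-universe jobs) rather than on the batch compute service. A minimal sketch of what this looks like in simulator code, with property names taken from the headers changed below and purely illustrative values and host names:

    // Sketch: enabling the new overheads when creating the HTCondor service
    auto htcondor_cs = simulation->add(
        new wrench::HTCondorComputeService(
            "HTCondorHost",   // placeholder host name
            {batch_cs},       // compute service(s) HTCondor can dispatch grid-universe jobs to
            {{wrench::HTCondorComputeServiceProperty::NEGOTIATOR_OVERHEAD,           "1.0"},
             {wrench::HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY,      "10.0"},
             {wrench::HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY,     "10.0"},
             {wrench::HTCondorComputeServiceProperty::NON_GRID_PRE_EXECUTION_DELAY,  "5.0"},
             {wrench::HTCondorComputeServiceProperty::NON_GRID_POST_EXECUTION_DELAY, "5.0"}},
            {}));             // message payloads left at their defaults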
68 changes: 31 additions & 37 deletions examples/condor-grid-example/CondorGridSimulator.cpp
@@ -9,107 +9,101 @@

#include <iostream>
#include <wrench.h>
#include <pugixml.hpp>

#include "CondorWMS.h" // WMS implementation
#include "CondorTimestamp.h"
#include "wrench/tools/pegasus/PegasusWorkflowParser.h"


/**
* ./wrench-example-condor-grid-universe [disk-speed in MBps] [bandwidth in MBps, storage service to batch service] ...
* [Override Pre_execution overhead time in seconds] ...
* [Override Post_execution overhead time in seconds]
* @return
*/
** This simulator simulates the execution of 10 independent tasks on HTCondor,
** where HTCondor has access to a batch compute service and a bare-metal compute
** service that runs on a VM.
**
** Example invocation of the simulator with useful WMS logging:
** ./wrench-example-condor-grid-universe condor-grid-hosts.xml --log=custom_wms.threshold=info
**/

int main(int argc, char **argv) {

// Create and initialize a simulation
auto *simulation = new wrench::Simulation();

/* Parse WRENCH-specific and SimGrid-specific command-line arguments */
// Parse WRENCH-specific and SimGrid-specific command-line arguments
simulation->init(&argc, argv);

/* Parse simulator-specific command-line arguments */
// Parse simulator-specific command-line arguments
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <XML platform file>\n";
exit(1);
}

/* Initialize the platform with the XML file */
// Initialize the platform with the XML file
simulation->instantiatePlatform(argv[1]);

/* Create a "workflow" of 10 independent 5-core tasks, each with some input file and an output file */
// Create a "workflow" of independent 5-core tasks, each with some input file and an output file
long num_tasks = 10;
auto workflow = new wrench::Workflow();
for (int i=0; i < num_tasks; i++) {
auto task = workflow->addTask("task_" + std::to_string(i), 1000.0 * 1000.0 * 1000.0 * 1000.0, 5, 5, 0);
auto input = workflow->addFile("task_" + std::to_string(i) + ".in", 0.0*1000*1000);
auto output = workflow->addFile("task_" + std::to_string(i) + ".out", 0.0*1000*1000);
auto input = workflow->addFile("task_" + std::to_string(i) + ".in", 100.0*1000*1000);
auto output = workflow->addFile("task_" + std::to_string(i) + ".out", 100.0*1000*1000);
task->addInputFile(input);
task->addOutputFile(output);
}

/* Create a storage service on the WMS host, that will host all data */
// Create a storage service on the WMS host, that will host all data
auto local_ss = simulation->add(new wrench::SimpleStorageService("WMSHost", {"/"}));

/* Create a batch service */
// Create a 4-node batch service
auto batch_cs = simulation->add(new wrench::BatchComputeService(
"BatchHeadNode",
{"BatchNode1", "BatchNode2", "BatchNode3", "BatchNode4"},
"/scratch_batch",
{
{wrench::BatchComputeServiceProperty::GRID_PRE_EXECUTION_DELAY, "10.0"},
{wrench::BatchComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "5.0"},
},
{},
{}));

/* Create a cloud service */
// Create a 2-node cloud service
auto cloud_cs = simulation->add(new wrench::CloudComputeService(
"CloudHeadNode",
{"CloudNode1", "CloudNode2"},
"/scratch_cloud"));

/* Create a HTCondor service that has access to the BatchComputeService (the WMS
* will create VMs on the CloudCompute Service, which will expose BareMetalComputeService instances
* that will be added to the HTCondor service. Set the HTCondor overhead to 1 second*/
// Create a HTCondor service that has access to the BatchComputeService (the WMS
// will create VMs on the cloud compute service, which will expose BareMetalComputeService instances
// that will be added to the HTCondor service). Below we set all possible overhead values (see documentation).
auto htcondor_cs = simulation->add(
new wrench::HTCondorComputeService(
"BatchHeadNode",
{batch_cs},
{
{wrench::HTCondorComputeServiceProperty::NEGOTIATOR_OVERHEAD, "0.0"}
{wrench::HTCondorComputeServiceProperty::NEGOTIATOR_OVERHEAD, "1.0"},
{wrench::HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY, "10.0"},
{wrench::HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "10.0"},
{wrench::HTCondorComputeServiceProperty::NON_GRID_PRE_EXECUTION_DELAY, "5.0"},
{wrench::HTCondorComputeServiceProperty::NON_GRID_POST_EXECUTION_DELAY, "5.0"}
},
{}));

/* Set the default local storage service */
// Set the default local storage service
std::dynamic_pointer_cast<wrench::HTCondorComputeService>(htcondor_cs)->setLocalStorageService(local_ss);

// Create a WMS
auto wms = simulation->add(
new wrench::CondorWMS({htcondor_cs, batch_cs, cloud_cs}, {local_ss}, "HTCondorHost"));

// Add the workflow to the WMS
wms->addWorkflow(workflow);

// Create a file registry
simulation->add(new wrench::FileRegistryService("WMSHost"));

// Staging the input_file on the storage service
// Stage the input files on the storage service
for (auto const &f : workflow->getInputFiles()) {
simulation->stageFile(f, local_ss);
}

// Running the simulation
// Run the simulation
simulation->launch();

/* Printing task execution information directly from WorkflowTask objects -- other
* examples showcase how to use simulation->getOutput().getTrace<T>() */

for (const auto &t : workflow->getTasks()) {
std::cout << "Task " + t->getID() << " ";
std::cout << "started at time " << t->getStartDate() << " on ";
std::cout << "host " << t->getPhysicalExecutionHost() << " and finished at time ";
std::cout << t->getEndDate() << "\n";
}

return 0;
}
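
Note that the removed usage comment above mentioned command-line overrides for the pre- and post-execution overhead times; the finalized example takes only the platform file. If one wanted such overrides back, a rough sketch is below (the argument positions and names are hypothetical and not part of the shipped example, and the argc check earlier in main() would need to be relaxed accordingly):

    // Hypothetical: optional command-line overrides for the grid-universe overheads
    std::string grid_pre_delay  = "10.0";
    std::string grid_post_delay = "10.0";
    if (argc == 4) {   // e.g., ./wrench-example-condor-grid-universe platform.xml 20 10
        grid_pre_delay  = argv[2];
        grid_post_delay = argv[3];
    }
    // ... then pass these strings as the values of
    // wrench::HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY and
    // wrench::HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY when creating htcondor_cs.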
73 changes: 42 additions & 31 deletions examples/condor-grid-example/CondorWMS.cpp
@@ -14,6 +14,12 @@

WRENCH_LOG_CATEGORY(custom_wms, "Log category for Custom WMS");

/**
** This WMS submits n tasks for execution to HTCondor. The first ~n/2 are submitted together
** as a single grid-universe job. The remaining tasks are submitted as individual
** non-grid-universe jobs.
**/


namespace wrench {

@@ -29,43 +35,40 @@ namespace wrench {
"condor-grid"){}

/**
* @brief main method of the CondorWMS daemon
*
* @return 0 on completion
*
* @throw std::runtime_error
* Main method of the WMS
*/
int CondorWMS::main() {

/* Set the logging output to GREEN */
// Set the logging output to GREEN
TerminalOutput::setThisProcessLoggingColor(TerminalOutput::COLOR_GREEN);

/* Create a data movement manager */
// Create a data movement manager
auto data_movement_manager = this->createDataMovementManager();

/* Create a job manager */
// Create a job manager
auto job_manager = this->createJobManager();

/* Get reference to the storage service */
// Get reference to the storage service
auto ss = *(this->getAvailableStorageServices().begin());

/* Get references to all compute services */
// Get references to all compute services (note that all jobs will be submitted to the htcondor_cs)
auto htcondor_cs = *(this->getAvailableComputeServices<wrench::HTCondorComputeService>().begin());
auto batch_cs = *(this->getAvailableComputeServices<wrench::BatchComputeService>().begin());
auto cloud_cs = *(this->getAvailableComputeServices<wrench::CloudComputeService>().begin());

/* Create and start a 5-core VM with 32GB of RAM on the Cloud compute service */
// Create and start a 5-core VM with 32GB of RAM on the Cloud compute service
WRENCH_INFO("Creating a 5-core VM instance on the cloud service");
cloud_cs->createVM(5, 32.0*1000*1000*1000, "my_vm", {}, {});
WRENCH_INFO("Starting the VM instance, which exposes a usable bare-metal compute service");
auto vm_cs = cloud_cs->startVM("my_vm");

/* Add the VM's BareMetalComputeService to the HTCondor compute service */
WRENCH_INFO("Adding the VM instance to HTCondor");
WRENCH_INFO("Adding the VM's bare-metal compute service to HTCondor");
htcondor_cs->addComputeService(vm_cs);

/* At this point, HTCondor has access to: .... */
WRENCH_INFO("At this point, HTCondor has access to one batch compute service and one bare-metal service (which runs on a VM)");

/* Create a map of files, which are all supposed to be on the local SS */
// Create a map of files, which are all supposed to be on the local SS
std::map<WorkflowFile *, std::shared_ptr<FileLocation>> file_locations;
for (auto const &t : this->getWorkflow()->getTasks()) {
for (auto const &f : t->getInputFiles()) {
@@ -76,37 +79,43 @@
}
}

/* Split the 10 tasks into two groups of 5 tasks */
std::vector<wrench::WorkflowTask *> first_five_tasks;
std::vector<wrench::WorkflowTask *> last_five_tasks;
// Split the tasks into two groups
std::vector<wrench::WorkflowTask *> first_tasks;
std::vector<wrench::WorkflowTask *> last_tasks;
int task_count = 0;
unsigned long num_tasks = this->getWorkflow()->getTasks().size();
for (auto const &t : this->getWorkflow()->getTasks()) {
if (task_count < 5) {
first_five_tasks.push_back(t);
if (task_count < num_tasks / 2) {
first_tasks.push_back(t);
} else {
last_five_tasks.push_back(t);
last_tasks.push_back(t);
}
task_count++;
}

/* Submit the first 5 tasks as part of a single "grid universe" job to HTCondor */
auto grid_universe_job = job_manager->createStandardJob(first_five_tasks, file_locations);
// Submit the first tasks as part of a single "grid universe" job to HTCondor
WRENCH_INFO("Creating a standard job with the first %ld tasks", first_tasks.size());
auto grid_universe_job = job_manager->createStandardJob(first_tasks, file_locations);
WRENCH_INFO("Submitting the job as a grid-universe job to HTCondor, asking for 3 compute nodes");
std::map<std::string, std::string> htcondor_service_specific_arguments;
htcondor_service_specific_arguments["universe"] = "grid";
htcondor_service_specific_arguments["-N"] = "3";
htcondor_service_specific_arguments["-c"] = "5";
htcondor_service_specific_arguments["-t"] = "3600";
// The argument below is not required, as there is a single batch service in this example
htcondor_service_specific_arguments["-service"] = batch_cs->getName();
WRENCH_INFO("Submitting the first 5 tasks as a single grid-universe job to HTCondor (will run on the batch service)");
job_manager->submitJob(grid_universe_job, htcondor_cs, htcondor_service_specific_arguments);
WRENCH_INFO("Job submitted!");

/* Submit the next 5 tasks as individual non "grid universe" jobs to HTCondor */
for (auto const &t : last_five_tasks) {
WRENCH_INFO("Submitting a task as a single non-grid-universe job to HTCondor (will run on the VM)");
auto job = job_manager->createStandardJob(t, file_locations);
/* Submit the last tasks as individual non "grid universe" jobs to HTCondor */
for (auto const &task : last_tasks) {
WRENCH_INFO("Creating and submitting a single-task job (for task %s) as a non-grid-universe job to HTCondor (will run on the VM)",
task->getID().c_str());
auto job = job_manager->createStandardJob(task, file_locations);
job_manager->submitJob(job, htcondor_cs);
}

WRENCH_INFO("Waiting for Workflow Execution Events until the workflow execution is finished...");
/* Wait for all execution events */
while (not this->getWorkflow()->isDone()) {
this->waitForAndProcessNextEvent();
@@ -124,12 +133,14 @@
void CondorWMS::processEventStandardJobCompletion(std::shared_ptr<StandardJobCompletedEvent> event) {
/* Retrieve the job that this event is for */
auto job = event->standard_job;
/* Retrieve the job's tasks */
WRENCH_INFO("Notified that a standard job has completed: ");
for (auto const &task : job->getTasks()) {
WRENCH_INFO("Notified that a standard job has completed task %s",
task->getID().c_str())
WRENCH_INFO(" - Task %s ran on host %s (started at time %.2lf and finished at time %.2lf)",
task->getID().c_str(),
task->getPhysicalExecutionHost().c_str(),
task->getStartDate(),
task->getEndDate());
}
simulation->getOutput().addTimestamp<CondorGridEndTimestamp>(new CondorGridEndTimestamp);
}


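
The WMS above only overrides the handler for standard-job completion events. A WMS that also wants to react to failed jobs could override the corresponding failure handler; the sketch below follows the same pattern, with handler and event names mirroring other WRENCH examples, so treat the exact signatures as an assumption. (The override would also need to be declared in CondorWMS.h.)

    // Sketch: a matching failure handler for the same WMS
    void CondorWMS::processEventStandardJobFailure(std::shared_ptr<StandardJobFailedEvent> event) {
        auto job = event->standard_job;
        WRENCH_INFO("Notified that a standard job has failed (%s)",
                    event->failure_cause->toString().c_str());
        for (auto const &task : job->getTasks()) {
            WRENCH_INFO(" - Task %s did not complete", task->getID().c_str());
        }
    }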
2 changes: 0 additions & 2 deletions include/wrench/services/compute/batch/BatchComputeService.h
@@ -59,8 +59,6 @@ namespace wrench {
{BatchComputeServiceProperty::TASK_STARTUP_OVERHEAD, "0"},
{BatchComputeServiceProperty::HOST_SELECTION_ALGORITHM, "FIRSTFIT"},
{BatchComputeServiceProperty::TASK_SELECTION_ALGORITHM, "maximum_flops"},
{BatchComputeServiceProperty::GRID_PRE_EXECUTION_DELAY, "78.0"},
{BatchComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "16.0"},
#ifdef ENABLE_BATSCHED
{BatchComputeServiceProperty::BATCH_SCHEDULING_ALGORITHM, "conservative_bf"},
// {BatchComputeServiceProperty::BATCH_SCHEDULING_ALGORITHM, "easy_bf"},
22 changes: 11 additions & 11 deletions include/wrench/services/compute/batch/BatchComputeServiceProperty.h
@@ -159,17 +159,17 @@
*/
DECLARE_PROPERTY_NAME(BATSCHED_CONTIGUOUS_ALLOCATION);

/** @brief Overhead delay in seconds between condor and slurm for the start of execution
* - defaults to calibrated figure
* - property is set on first receiving grid universe job.
*/
DECLARE_PROPERTY_NAME(GRID_PRE_EXECUTION_DELAY);

/** @brief Overhead delay in seconds between condor and slurm for the completion of execution
* - defaults to calibrated figure
* - property is set on first receiving grid universe job.
*/
DECLARE_PROPERTY_NAME(GRID_POST_EXECUTION_DELAY);
// /** @brief Overhead delay in seconds between condor and slurm for the start of execution
// * - defaults to calibrated figure
// * - property is set on first receiving grid universe job.
// */
// DECLARE_PROPERTY_NAME(GRID_PRE_EXECUTION_DELAY);
//
// /** @brief Overhead delay in seconds between condor and slurm for the completion of execution
// * - defaults to calibrated figure
// * - property is set on first receiving grid universe job.
// */
// DECLARE_PROPERTY_NAME(GRID_POST_EXECUTION_DELAY);



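
In other words, the grid pre/post execution delays no longer belong to the batch compute service; the equivalent knobs are now the HTCondorComputeServiceProperty values introduced in this commit. A sketch of the migration for a simulator that used the old properties (delay values are illustrative):

    // Old (removed in this commit): delays configured on the batch service
    //   {wrench::BatchComputeServiceProperty::GRID_PRE_EXECUTION_DELAY,  "10.0"},
    //   {wrench::BatchComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "5.0"},
    // New: configure the delays on the HTCondor service instead
    {wrench::HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY,  "10.0"},
    {wrench::HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "5.0"},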
@@ -27,13 +27,13 @@ namespace wrench {
std::map<std::string, double> default_messagepayload_values = {
{HTCondorCentralManagerServiceMessagePayload::STOP_DAEMON_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::DAEMON_STOPPED_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD, 256000000},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD, 256000000},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD, 256000000},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD, 256000000},
{HTCondorCentralManagerServiceMessagePayload::RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD, 196000000},
{HTCondorCentralManagerServiceMessagePayload::RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD, 196000000},
{HTCondorCentralManagerServiceMessagePayload::STANDARD_JOB_DONE_MESSAGE_PAYLOAD, 512000000},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::STANDARD_JOB_DONE_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::PILOT_JOB_STARTED_MESSAGE_PAYLOAD, 1024},
{HTCondorCentralManagerServiceMessagePayload::PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD, 1024}
};
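
These control-message payloads were previously simulated as hundreds of megabytes, which adds noticeable simulated network time to every submission: for instance, a 256000000-byte message over a 100 MBps link takes roughly 2.5 simulated seconds, whereas the new 1024-byte default costs only on the order of 10 microseconds plus latency. (The 100 MBps link speed here is only an illustrative figure, not something defined by this commit.)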
@@ -28,6 +28,10 @@ namespace wrench {
private:
std::map<std::string, std::string> default_property_values = {
{HTCondorComputeServiceProperty::NEGOTIATOR_OVERHEAD, "0.0"},
{HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY, "0.0"},
{HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY, "0.0"},
{HTCondorComputeServiceProperty::NON_GRID_PRE_EXECUTION_DELAY, "0.0"},
{HTCondorComputeServiceProperty::NON_GRID_POST_EXECUTION_DELAY, "0.0"},
};

std::map<std::string, double> default_messagepayload_values = {
@@ -21,9 +21,26 @@

public:

/** @brief Overhead, in seconds, of the HTCondor Negotiator **/
/** @brief Overhead, in seconds, of the HTCondor Negotiator, which is invoked each time a new job is submitted or
* a running job completes and there are still pending jobs **/
DECLARE_PROPERTY_NAME(NEGOTIATOR_OVERHEAD);

/** @brief Overhead (in seconds) between HTCondor and a batch compute service for the start of execution of grid-universe jobs
*/
DECLARE_PROPERTY_NAME(GRID_PRE_EXECUTION_DELAY);

/** @brief Overhead (in seconds) between HTCondor and a batch compute service for the completion of execution of grid-universe jobs
*/
DECLARE_PROPERTY_NAME(GRID_POST_EXECUTION_DELAY);

/** @brief Overhead (in seconds) between HTCondor and a bare-metal compute service for the start of execution of non-grid-universe jobs
*/
DECLARE_PROPERTY_NAME(NON_GRID_PRE_EXECUTION_DELAY);

/** @brief Overhead (in seconds) between HTCondor and a bare-metal compute service for the completion of execution of non-grid-universe jobs
*/
DECLARE_PROPERTY_NAME(NON_GRID_POST_EXECUTION_DELAY);

};
}

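
For completeness, a WRENCH service implementation would typically read such properties with Service::getPropertyValueAsDouble() and then sleep for the corresponding amount of simulated time around job execution. The sketch below shows that general pattern; it is an assumption about how the HTCondor service consumes these values, not code from this commit:

    // Sketch (assumption): reading the new properties inside a service implementation
    double grid_pre_delay = this->getPropertyValueAsDouble(
            HTCondorComputeServiceProperty::GRID_PRE_EXECUTION_DELAY);
    double grid_post_delay = this->getPropertyValueAsDouble(
            HTCondorComputeServiceProperty::GRID_POST_EXECUTION_DELAY);
    wrench::Simulation::sleep(grid_pre_delay);   // before dispatching a grid-universe job
    // ... dispatch the job and wait for it to complete ...
    wrench::Simulation::sleep(grid_post_delay);  // after the job completes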
