Limit number of threads spawned for tstOMP and tstatomics #1131

Merged · 2 commits · Sep 9, 2021
55 changes: 32 additions & 23 deletions src/c4/test/tstOMP.cc
@@ -4,8 +4,7 @@
  * \author Kelly Thompson
  * \date Tue Jun 6 15:03:08 2006
  * \brief Demonstrate basic OMP threads under MPI.
- * \note Copyright (C) 2016-2020 Triad National Security, LLC.
- *       All rights reserved. */
+ * \note Copyright (C) 2011-2021 Triad National Security, LLC., All rights reserved. */
 //------------------------------------------------------------------------------------------------//

 #include "c4/ParallelUnitTest.hh"
@@ -79,17 +78,14 @@ bool topology_report() {
 void topo_report(rtt_dsxx::UnitTest &ut, bool &one_mpi_rank_per_node) {
   // Determine if MPI ranks are on unique machine nodes:
   //
-  // If there are multiple MPI ranks per machine node, then don't use OMP
-  // because OMP can't restrict its threads to running only on an MPI rank's
-  // cores. The OMP threads will be distributed over the whole machine node.
-  // For example, we might choose to use 4 MPI ranks on a machine node with 16
-  // cores. Ideally, we could allow each MPI rank to use 4 OMP threads for a
-  // maximum of 4x4=16 OMP threads on the 16 core node. However, because OMP
-  // doesn't know about the MPI ranks sharing the 16 cores, the even
-  // distribution of OMP threads is not guaranteed.
+  // If there are multiple MPI ranks per machine node, then don't use OMP because OMP can't restrict
+  // its threads to running only on an MPI rank's cores. The OMP threads will be distributed over
+  // the whole machine node. For example, we might choose to use 4 MPI ranks on a machine node with
+  // 16 cores. Ideally, we could allow each MPI rank to use 4 OMP threads for a maximum of 4x4=16
+  // OMP threads on the 16 core node. However, because OMP doesn't know about the MPI ranks sharing
+  // the 16 cores, the even distribution of OMP threads is not guaranteed.
   //
-  // So - if we have more than one MPI rank per machine node, then turn off OMP
-  // threads.
+  // So - if we have more than one MPI rank per machine node, then turn off OMP threads.
   one_mpi_rank_per_node = topology_report();

   std::string procname = rtt_c4::get_processor_name();
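topology_report() itself is outside this diff. A minimal sketch of the idea, using raw MPI calls rather than the rtt_c4 wrappers (the function name here is hypothetical):

```cpp
#include <cstring>
#include <vector>
#include <mpi.h>

// Hypothetical stand-in for topology_report(): true when every MPI rank
// reports a distinct host name, i.e. exactly one rank per machine node.
bool one_rank_per_node() {
  int nranks = 0;
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  char name[MPI_MAX_PROCESSOR_NAME] = {0};
  int len = 0;
  MPI_Get_processor_name(name, &len);

  // Gather every rank's host name to every rank and scan for duplicates.
  std::vector<char> all(static_cast<size_t>(nranks) * MPI_MAX_PROCESSOR_NAME);
  MPI_Allgather(name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, all.data(),
                MPI_MAX_PROCESSOR_NAME, MPI_CHAR, MPI_COMM_WORLD);

  for (int i = 0; i < nranks; ++i)
    for (int j = i + 1; j < nranks; ++j)
      if (std::strcmp(&all[i * MPI_MAX_PROCESSOR_NAME],
                      &all[j * MPI_MAX_PROCESSOR_NAME]) == 0)
        return false;
  return true;
}
```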
@@ -101,9 +97,13 @@ void topo_report(rtt_dsxx::UnitTest &ut, bool &one_mpi_rank_per_node) {
 int num_dynamic_threads = omp_get_dynamic();

 int tid(-1);
-int nthreads(-1), maxthreads(-1);
+int nthreads(-1);
+int maxthreads(-1);

 maxthreads = omp_get_max_threads();
+// This is just a unit test. Limit the parallelism.
+if (maxthreads > 16)
+  omp_set_num_threads(16);

 #pragma omp parallel private(tid)
 {
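This clamp pattern now appears in three places in tstOMP.cc (topo_report, sample_sum, MandelbrotDriver). A hypothetical helper, sketched here only to show the shape, would keep the call sites in sync; the PR inlines the logic instead:

```cpp
#include <omp.h>

// Cap the OpenMP thread pool for unit tests. Hypothetical helper; not part
// of this PR, which repeats the two-line clamp at each call site.
inline void clamp_omp_threads(int cap = 16) {
  if (omp_get_max_threads() > cap)
    omp_set_num_threads(cap);
}
```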
@@ -172,6 +172,12 @@ void sample_sum(rtt_dsxx::UnitTest &ut, bool const omrpn) {

 #ifdef OPENMP_FOUND
 {
+// This is just a unit test. Limit the parallelism.
+int maxthreads(-1);
+maxthreads = omp_get_max_threads();
+if (maxthreads > 16)
+  omp_set_num_threads(16);
+
 // More than 1 MPI rank per node --> turn off OMP.
 if (!omrpn)
   omp_set_num_threads(1);
@@ -231,9 +237,8 @@ void sample_sum(rtt_dsxx::UnitTest &ut, bool const omrpn) {
           << std::endl;
 }

-// [2015-11-17 KT] The accumulate test no longer provides enough work
-// to offset the overhead of OpenMP, especially for the optimized
-// build. Turn this test off...
+// [2015-11-17 KT] The accumulate test no longer provides enough work to offset the overhead of
+// OpenMP, especially for the optimized build. Turn this test off...

 // if( omrpn && nthreads > 4 )
 // {
@@ -251,12 +256,9 @@
 }

 //------------------------------------------------------------------------------------------------//
-// This is a simple demonstration problem for OMP. Nothing really to check
-// for PASS/FAIL.
+// This is a simple demonstration problem for OMP. Nothing really to check for PASS/FAIL.
 int MandelbrotCalculate(std::complex<double> c, int maxiter) {
-  // iterates z = z*z + c until |z| >= 2 or maxiter is reached, returns the
-  // number of iterations
-
+  // iterates z = z*z + c until |z| >= 2 or maxiter is reached, returns the number of iterations
   std::complex<double> z = c;
   int n = 0;
   for (; n < maxiter; ++n) {
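The loop body is cut off by the diff; reconstructed from the comment above it, the escape-time iteration has this shape (a sketch, not the verbatim file contents):

```cpp
#include <complex>

// Iterate z = z*z + c until |z| >= 2 or maxiter is reached; return the
// iteration count. Comparing |z|^2 against 4 avoids the sqrt inside std::abs.
int mandelbrot_iterations(std::complex<double> c, int maxiter) {
  std::complex<double> z = c;
  int n = 0;
  for (; n < maxiter; ++n) {
    if (std::norm(z) >= 4.0)
      break;
    z = z * z + c;
  }
  return n;
}
```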
@@ -277,16 +279,23 @@ void MandelbrotDriver(rtt_dsxx::UnitTest &ut) {
 const complex<double> center(-0.7, 0.0);
 const complex<double> span(2.7, -(4 / 3.0) * 2.7 * height / width);
 const complex<double> begin = center - span / 2.0;
-// const complex<double> end = center+span/2.0;
 const int maxiter = 100000;

 // Use OMP threads
 Timer t;
-ostringstream image1, image2;
+ostringstream image1;
+ostringstream image2;
 t.start();

 int nthreads(-1);
 #ifdef OPENMP_FOUND
+
+// This is just a unit test. Limit the parallelism.
+int maxthreads(-1);
+maxthreads = omp_get_max_threads();
+if (maxthreads > 16)
+  omp_set_num_threads(16);
+
 #pragma omp parallel
 {
   if (node() == 0 && omp_get_thread_num() == 0) {
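The parallel region this hunk leads into (truncated above) renders the image. A typical OpenMP shape for that kind of pixel loop, sketched under the assumption of a row-major buffer (pixels is hypothetical; begin, span, width, height, maxiter, and MandelbrotCalculate come from the driver):

```cpp
#include <complex>
#include <vector>

std::vector<int> pixels(width * height);
// Rows are handed out dynamically since escape times vary wildly per row;
// MandelbrotCalculate is pure, so no further synchronization is needed.
#pragma omp parallel for schedule(dynamic)
for (int y = 0; y < height; ++y) {
  for (int x = 0; x < width; ++x) {
    std::complex<double> c = begin + std::complex<double>(
                                         x * span.real() / width,
                                         y * span.imag() / height);
    pixels[y * width + x] = MandelbrotCalculate(c, maxiter);
  }
}
```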
28 changes: 12 additions & 16 deletions src/ds++/test/tstatomics.cc
@@ -3,8 +3,7 @@
  * \file ds++/test/tstatomics.cc
  * \author Tim Kelley
  * \date Thursday, Sept. 6, 2018, 10:51 am
- * \note Copyright (C) 2018-2020 Triad National Security, LLC.
- *       All rights reserved. */
+ * \note Copyright (C) 2018-2021 Triad National Security, LLC., All rights reserved. */
 //------------------------------------------------------------------------------------------------//

 #include "ds++/Release.hh"
@@ -17,10 +16,9 @@
 using rtt_dsxx::UnitTest;

 //------------------------------------------------------------------------------------------------//
-/* Hammer an atomic from each thread. Each iteration, the thread adds
- * (tid * iteration) to the counter. The atomic ensures that everyone sees
- * a consistent view of the counter: no thread overwrites the contribution
- * from any other thread.
+/* Hammer an atomic from each thread. Each iteration, the thread adds (tid * iteration) to the
+ * counter. The atomic ensures that everyone sees a consistent view of the counter: no thread
+ * overwrites the contribution from any other thread.
  */
 void thread_action(std::atomic<double> &d, size_t N, size_t tid) {
   auto const did = static_cast<double>(tid);
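Worth noting: std::atomic&lt;double&gt;::fetch_add only exists natively from C++20 onward, so the ds++ atomics under test presumably emulate it. A minimal sketch of the standard compare-exchange loop, assuming a pre-C++20 toolchain (free-function name hypothetical):

```cpp
#include <atomic>

// Atomically add delta to a, returning the value observed before the add.
// compare_exchange_weak refreshes 'expected' on failure, so the loop retries
// against the latest value until no other thread intervenes.
double fetch_add(std::atomic<double> &a, double delta) {
  double expected = a.load();
  while (!a.compare_exchange_weak(expected, expected + delta)) {
    // 'expected' now holds the freshly observed value; retry.
  }
  return expected;
}
```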
@@ -87,14 +85,12 @@ void test_fetch_add_atomic_1e6(UnitTest &ut) {
 } // test_fetch_add_atomic

 // --------------------- non-atomic version --------------------------
-// This should give the wrong answer nearly every time on any respectable
-// thread implementation.
+// This should give the wrong answer nearly every time on any respectable thread implementation.

 //------------------------------------------------------------------------------------------------//
-/* Similarly, hammer a POD from each thread. Each iteration, the thread adds
- * (tid * iteration) to the counter. Since the threads are contending, we expect
- * to have a race condition where two threads read the same value from d and
- * one of the thread's write (+=) overwrites the other's.
+/* Similarly, hammer a POD from each thread. Each iteration, the thread adds (tid * iteration) to
+ * the counter. Since the threads are contending, we expect to have a race condition where two
+ * threads read the same value from d and one of the thread's write (+=) overwrites the other's.
  */
 void thread_action_pod(double &d, size_t N, size_t tid) {
   auto const did = static_cast<double>(tid);
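For both the atomic and POD variants the target total has a closed form: with T threads each running iterations i = 0..N-1 (assumed from the comment), the sum of all tid * i contributions is (T(T-1)/2) * (N(N-1)/2). A usage sketch of the racy version, with a hypothetical driver rather than the file's fetch_add_not_atomic_core:

```cpp
#include <functional>
#include <iostream>
#include <thread>
#include <vector>

int main() {
  size_t const T = 8, N = 10001;
  double racy = 0.0;

  std::vector<std::thread> pool;
  for (size_t tid = 0; tid < T; ++tid)
    pool.emplace_back(thread_action_pod, std::ref(racy), N, tid);
  for (auto &t : pool)
    t.join();

  // Closed form: (sum of thread ids) * (sum of iteration indices).
  double const expected = (T * (T - 1) / 2.0) * (N * (N - 1) / 2.0);
  std::cout << "got " << racy << ", expected " << expected
            << " (they will almost always differ)\n";
  return 0;
}
```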
@@ -155,8 +151,8 @@ void test_fetch_add_not_atomic(UnitTest & /*ut*/) {

 // fetch_sub tests

-/* Same as thread_action above, except uses fetch_sub. Total sum is just the
- * negative of the preceding test.
+/* Same as thread_action above, except uses fetch_sub. Total sum is just the negative of the
+ * preceding test.
  */
 void thread_action_sub(std::atomic<double> &d, size_t N, size_t tid) {
   auto const did = static_cast<double>(tid);
@@ -210,14 +206,14 @@ void fetch_sub_atomic_core(UnitTest &ut, size_t const n_threads, size_t const n_
 } // fetch_add_atomic_core

 void test_fetch_sub_atomic(UnitTest &ut) {
-  size_t const n_threads(19);
+  size_t const n_threads(8);
   size_t const n_iterations(10001);
   fetch_sub_atomic_core(ut, n_threads, n_iterations);
   return;
 } // test_fetch_add_atomic

 void test_fetch_sub_atomic_1e6(UnitTest &ut) {
-  size_t const n_threads(19);
+  size_t const n_threads(8);
   size_t const n_iterations(1000001);
   fetch_sub_atomic_core(ut, n_threads, n_iterations);
   return;
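Dropping n_threads from 19 to 8 keeps these tests from oversubscribing small CI nodes. An alternative (not what this PR does) would derive the count from the hardware:

```cpp
#include <algorithm>
#include <thread>

// Pick a thread count for the test: what the machine offers, clamped to
// [2, 8]. hardware_concurrency() may legally return 0, hence the fallback.
size_t test_thread_count() {
  unsigned const hw = std::thread::hardware_concurrency();
  return std::max(2u, std::min(8u, hw == 0 ? 2u : hw));
}
```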