From b275d47582eb9e2307e30fa406a8259ac7f5d9cc Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 18 Jan 2016 19:26:13 -0600 Subject: [PATCH 1/4] first working implementation of partition_1d. unoptimized --- CHANGES.md | 4 + CMakeLists.txt | 2 +- src/ccp/ccp.c | 147 ++++++++++++++++++++++++++++ src/ccp/ccp.h | 58 +++++++++++ tests/ccp_test.c | 249 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 459 insertions(+), 1 deletion(-) create mode 100644 src/ccp/ccp.c create mode 100644 src/ccp/ccp.h create mode 100644 tests/ccp_test.c diff --git a/CHANGES.md b/CHANGES.md index 5761883..b77ffad 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,4 +1,8 @@ +1.1.2 +===== + + 1.1.1 ===== * Updated README.md to include MPI instructions. diff --git a/CMakeLists.txt b/CMakeLists.txt index 7943221..f58b626 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,7 +102,7 @@ endif() set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -file(GLOB SPLATT_SOURCES src/*.c ${MPI_SOURCES}) +file(GLOB SPLATT_SOURCES src/*.c src/ccp/*.c ${MPI_SOURCES}) # Generate splatt library add_subdirectory(lib) diff --git a/src/ccp/ccp.c b/src/ccp/ccp.c new file mode 100644 index 0000000..c176472 --- /dev/null +++ b/src/ccp/ccp.c @@ -0,0 +1,147 @@ + + +/****************************************************************************** + * INCLUDES + *****************************************************************************/ + +#include "ccp.h" + +/****************************************************************************** + * PRIVATE FUNCTIONS + *****************************************************************************/ + + +static idx_t p_linear_search( + idx_t const * const weights, + idx_t const left, + idx_t const right, + idx_t const target) +{ + for(idx_t x=left; x < right-1; ++x) { + if(weights[x+1] > target) { + return x; + } + } + + return right; +} + +static idx_t p_binary_search( + idx_t const * const weights, + idx_t left, + idx_t right, + idx_t const target) +{ + while((right - left) > 8) { + idx_t mid = left + ((right - left) / 2); + + if(weights[mid] <= target && weights[mid+1] > target) { + return mid; + } + + if(weights[mid] < target) { + left = mid + 1; + } else { + right = mid; + } + } + + return p_linear_search(weights, left, right, target); +} + + + + +/****************************************************************************** + * PUBLIC FUNCTIONS + *****************************************************************************/ + +idx_t partition_1d( + idx_t * const weights, + idx_t const nitems, + idx_t * const parts, + idx_t const nparts) +{ + prefix_sum_inc(weights, nitems); + + idx_t const total_weight = weights[nitems-1]; + + idx_t nprobes = 0; + + /* naive attempts */ + bool success; + idx_t bottleneck = (total_weight / nparts) - 1; /* -1 because we inc first */ + do { + ++nprobes; + ++bottleneck; + success = lprobe(weights, nitems, parts, nparts, bottleneck); + } while(!success); + + printf("nprobes: %lu\n", nprobes); + + return bottleneck; +} + + + +bool lprobe( + idx_t const * const weights, + idx_t const nitems, + idx_t * const parts, + idx_t const nparts, + idx_t const bottleneck) +{ + idx_t p=0; + parts[p++] = 0; + idx_t bsum = bottleneck; + + idx_t const wtotal = weights[nitems-1]; + +#if 0 + while(p < nparts && bsum < wtotal) { + parts[p] = p_linear_search(weights, parts[p-1], nitems, bsum); + bsum = weights[parts[p]] + bottleneck; + ++p; + } +#else + idx_t step = nitems / nparts; + while(p < nparts && bsum < wtotal) { + while(step < nitems && weights[step] < bsum) { + step += nitems / nparts; + } + parts[p] = p_binary_search(weights, step - (nitems/nparts), SS_MIN(step, nitems), + bsum); + bsum = weights[parts[p]] + bottleneck; + ++p; + } +#endif + + parts[p] = nitems; + + return bsum >= wtotal; +} + + +void prefix_sum_inc( + idx_t * const weights, + idx_t const nitems) +{ + for(idx_t x=1; x < nitems; ++x) { + weights[x] += weights[x-1]; + } +} + + + +void prefix_sum_exc( + idx_t * const weights, + idx_t const nitems) +{ + idx_t saved = weights[0]; + weights[0] = 0; + for(idx_t x=1; x < nitems; ++x) { + idx_t const tmp = weights[x]; + weights[x] = weights[x-1] + saved; + saved = tmp; + } +} diff --git a/src/ccp/ccp.h b/src/ccp/ccp.h new file mode 100644 index 0000000..7fd4bf3 --- /dev/null +++ b/src/ccp/ccp.h @@ -0,0 +1,58 @@ +#ifndef SPLATT_CCP_CCP_H +#define SPLATT_CCP_CCP_H + +#include "../base.h" + + +/****************************************************************************** + * INCLUDES + *****************************************************************************/ +#include + + + + +/****************************************************************************** + * PUBLIC FUNCTIONS + *****************************************************************************/ + +#define partition_1d splatt_partition_1d +idx_t partition_1d( + idx_t * const weights, + idx_t const nitems, + idx_t * const parts, + idx_t const nparts); + + +bool lprobe( + idx_t const * const weights, + idx_t const nitems, + idx_t * const parts, + idx_t const nparts, + idx_t const bottleneck); + + +#define prefix_sum_inc splatt_prefix_sum_inc +/** +* @brief Compute an inclusive prefix sum: [3, 4, 5] -> [3, 7, 12]. +* +* @param weights The numbers to sum. +* @param nitems The number of items in 'weights'. +*/ +void prefix_sum_inc( + idx_t * const weights, + idx_t const nitems); + + +#define prefix_sum_exc splatt_prefix_sum_exc +/** +* @brief Compute an exclusive prefix sum: [3, 4, 5] -> [0, 3, 7]. +* +* @param weights The numbers to sum. +* @param nitems The number of items in 'weights'. +*/ +void prefix_sum_exc( + idx_t * const weights, + idx_t const nitems); + +#endif diff --git a/tests/ccp_test.c b/tests/ccp_test.c new file mode 100644 index 0000000..72f5a62 --- /dev/null +++ b/tests/ccp_test.c @@ -0,0 +1,249 @@ + +#include "../src/ccp/ccp.h" +#include "../src/util.h" +#include "../src/sort.h" + +#include "ctest/ctest.h" + +#include "splatt_test.h" + +#define NUM_CCP_TESTS 6 + + +CTEST_DATA(ccp) +{ + idx_t P; + idx_t * parts; + idx_t N; + idx_t * unit_data; + idx_t * rand_data; + idx_t * sorted_data; + idx_t * fororder_data; + idx_t * revorder_data; + idx_t * bigend_data; + + idx_t * ptrs[NUM_CCP_TESTS]; +}; + + +CTEST_SETUP(ccp) +{ + data->P = 31; + data->parts = calloc(data->P + 1, sizeof(*(data->parts))); + + data->N = 500; + data->rand_data = malloc(data->N * sizeof(*(data->rand_data))); + data->sorted_data = malloc(data->N * sizeof(*(data->sorted_data))); + data->fororder_data = malloc(data->N * sizeof(*(data->fororder_data))); + data->revorder_data = malloc(data->N * sizeof(*(data->revorder_data))); + data->bigend_data = malloc(data->N * sizeof(*(data->bigend_data))); + data->unit_data = malloc(data->N * sizeof(*(data->unit_data))); + + for(idx_t x=0; x < data->N; ++x) { + data->unit_data[x] = 1; + data->rand_data[x] = rand_idx() % 131; + data->sorted_data[x] = rand_idx() % 131; + data->bigend_data[x] = rand_idx() % 131; + + data->fororder_data[x] = x; + data->revorder_data[x] = data->N - x; + } + + + splatt_quicksort(data->sorted_data, data->N); + data->bigend_data[data->N - 1] = 999; + + data->ptrs[0] = data->rand_data; + data->ptrs[1] = data->sorted_data; + data->ptrs[2] = data->fororder_data; + data->ptrs[3] = data->revorder_data; + data->ptrs[4] = data->bigend_data; + data->ptrs[5] = data->unit_data; +} + +CTEST_TEARDOWN(ccp) +{ + free(data->parts); + for(idx_t t=0; t < NUM_CCP_TESTS; ++t) { + free(data->ptrs[t]); + } +} + + +CTEST2(ccp, prefix_sum_inc) +{ + idx_t * pref = malloc(data->N * sizeof(*pref)); + + for(idx_t t=0; t < NUM_CCP_TESTS; ++t) { + idx_t * const restrict weights = data->ptrs[t]; + + /* make a copy */ + memcpy(pref, weights, data->N * sizeof(*pref)); + + prefix_sum_inc(pref, data->N); + + idx_t running = 0; + for(idx_t x=0; x < data->N; ++x) { + running += weights[x]; + ASSERT_EQUAL(running, pref[x]); + } + } + free(pref); +} + + +CTEST2(ccp, prefix_sum_exc) +{ + /* make a copy */ + idx_t * pref = malloc(data->N * sizeof(*pref)); + + /* foreach test */ + for(idx_t t=0; t < NUM_CCP_TESTS; ++t) { + idx_t * const restrict weights = data->ptrs[t]; + memcpy(pref, weights, data->N * sizeof(*pref)); + + prefix_sum_exc(pref, data->N); + + idx_t running = 0; + for(idx_t x=0; x < data->N; ++x) { + ASSERT_EQUAL(running, pref[x]); + running += weights[x]; + } + } + + free(pref); +} + + +CTEST2(ccp, partition_1d) +{ + /* foreach test */ + for(idx_t t=0; t < NUM_CCP_TESTS; ++t) { + idx_t * const restrict weights = data->ptrs[t]; + idx_t bneck = partition_1d(weights, data->N, data->parts, data->P); + + /* check bounds */ + ASSERT_EQUAL(0, data->parts[0]); + ASSERT_EQUAL(data->N, data->parts[data->P]); + + /* check non-overlapping partitions */ + for(idx_t p=1; p < data->P; ++p) { + /* if N < P, someone will have no work */ + if(data->parts[p] <= data->parts[p-1]) { + ASSERT_FAIL(); + } + } + + /* check that bneck is not surpassed */ + for(idx_t p=0; p < data->P; ++p) { + if(weights[p+1] - weights[p] > bneck) { + ASSERT_FAIL(); + } + } + + /* check actual optimality */ + bool success; + success = lprobe(weights, data->N, data->parts, data->P, bneck); + ASSERT_EQUAL(true, success); + success = lprobe(weights, data->N, data->parts, data->P, bneck-1); + ASSERT_EQUAL(false, success); + + } /* end foreach test */ +} + + +CTEST2(ccp, probe) +{ + idx_t total = 0; + for(idx_t x=0; x < data->N; ++x) { + total += data->rand_data[x]; + } + + prefix_sum_exc(data->rand_data, data->N); + bool result = lprobe(data->rand_data, data->N, data->parts, data->P, + (total / data->P) - 1); + ASSERT_EQUAL(false, result); + + idx_t bottleneck = total / data->P; + while(!result) { + result = lprobe(data->rand_data, data->N, data->parts, data->P, bottleneck); + ++bottleneck; + } + --bottleneck; + + /* check bounds */ + ASSERT_EQUAL(0, data->parts[0]); + ASSERT_EQUAL(data->N, data->parts[data->P]); + + /* check non-overlapping partitions */ + for(idx_t p=1; p < data->P; ++p) { + /* if N < P, someone will have no work */ + if(data->parts[p] <= data->parts[p-1]) { + ASSERT_FAIL(); + } + } + + /* check actual bneck */ + for(idx_t p=1; p < data->P; ++p) { + /* if N < P, someone will have no work */ + if(data->parts[p] - data->parts[p-1] > bottleneck) { + ASSERT_FAIL(); + } + } + +} + + +CTEST2(ccp, bigpart) +{ + idx_t const N = 2500000; + idx_t const P = 24; + + idx_t * weights = malloc(N * sizeof(*weights)); + idx_t * parts = malloc((P+1) * sizeof(*weights)); + + for(idx_t x=0; x < N; ++x) { + weights[x] = rand_idx() % N; + } + + sp_timer_t part; + timer_fstart(&part); + idx_t const bneck = partition_1d(weights, N, parts, P); + timer_stop(&part); + printf("partition: %0.3fs\n", part.seconds); + + /* correctness */ + bool success; + success = lprobe(weights, N, parts, P, bneck); + ASSERT_EQUAL(true, success); + success = lprobe(weights, N, parts, P, bneck-1); + ASSERT_EQUAL(false, success); + + free(weights); + free(parts); +} + + +CTEST2(ccp, part_equalsize) +{ + idx_t const P = 24; + idx_t const CHUNK = 10000; + idx_t const N = P * CHUNK; + + idx_t * weights = malloc(N * sizeof(*weights)); + idx_t * parts = malloc((P+1) * sizeof(*weights)); + + for(idx_t x=0; x < N; ++x) { + weights[x] = 1; + } + + idx_t const bneck = partition_1d(weights, N, parts, P); + ASSERT_EQUAL(CHUNK, bneck); + + for(idx_t p=0; p < P; ++p) { + ASSERT_EQUAL(CHUNK * p, parts[p]); + } + + free(weights); + free(parts); +} From 21d27009dcec7d9c4a68714d19caeb2770697fac Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 18 Jan 2016 19:57:19 -0600 Subject: [PATCH 2/4] added eRB as a method, e=0 makes optimal --- src/ccp/ccp.c | 66 ++++++++++++++++++++++++++++++++++++++++-------- tests/ccp_test.c | 4 +-- 2 files changed, 58 insertions(+), 12 deletions(-) diff --git a/src/ccp/ccp.c b/src/ccp/ccp.c index c176472..8009f9f 100644 --- a/src/ccp/ccp.c +++ b/src/ccp/ccp.c @@ -6,11 +6,24 @@ #include "ccp.h" + /****************************************************************************** * PRIVATE FUNCTIONS *****************************************************************************/ +static idx_t nprobes = 0; + +/** +* @brief Perform a linear search on an array for a value. +* +* @param weights The array to search. +* @param left The lower bound to begin at. +* @param right The upper (exclusive) bound of items. +* @param target The target value. +* +* @return The index j, where weights[j] <= target && weights[j+1] > target. +*/ static idx_t p_linear_search( idx_t const * const weights, idx_t const left, @@ -26,6 +39,17 @@ static idx_t p_linear_search( return right; } + +/** +* @brief Perform a binary search on an array for a value. +* +* @param weights The array to search. +* @param left The lower bound to begin at. +* @param right The upper (exclusive) bound of items. +* @param target The target value. +* +* @return The index j, where weights[j] <= target && weights[j+1] > target. +*/ static idx_t p_binary_search( idx_t const * const weights, idx_t left, @@ -51,6 +75,30 @@ static idx_t p_binary_search( +static idx_t p_eps_rb_partition_1d( + idx_t * const weights, + idx_t const nitems, + idx_t * const parts, + idx_t const nparts, + idx_t const eps) +{ + idx_t lower = weights[nitems-1] / nparts; + idx_t upper = weights[nitems-1]; + + do { + idx_t mid = lower + ((upper - lower) / 2); + if(lprobe(weights, nitems, parts, nparts, mid)) { + upper = mid; + } else { + lower = mid+1; + } + } while(upper > lower + eps); + + return upper; +} + + + /****************************************************************************** * PUBLIC FUNCTIONS @@ -66,16 +114,20 @@ idx_t partition_1d( idx_t const total_weight = weights[nitems-1]; - idx_t nprobes = 0; + nprobes = 0; /* naive attempts */ bool success; + idx_t bottleneck; +#if 0 idx_t bottleneck = (total_weight / nparts) - 1; /* -1 because we inc first */ do { - ++nprobes; ++bottleneck; success = lprobe(weights, nitems, parts, nparts, bottleneck); } while(!success); +#else + bottleneck = p_eps_rb_partition_1d(weights, nitems, parts, nparts, 0); +#endif printf("nprobes: %lu\n", nprobes); @@ -97,13 +149,6 @@ bool lprobe( idx_t const wtotal = weights[nitems-1]; -#if 0 - while(p < nparts && bsum < wtotal) { - parts[p] = p_linear_search(weights, parts[p-1], nitems, bsum); - bsum = weights[parts[p]] + bottleneck; - ++p; - } -#else idx_t step = nitems / nparts; while(p < nparts && bsum < wtotal) { while(step < nitems && weights[step] < bsum) { @@ -114,14 +159,15 @@ bool lprobe( bsum = weights[parts[p]] + bottleneck; ++p; } -#endif parts[p] = nitems; + ++nprobes; return bsum >= wtotal; } + void prefix_sum_inc( idx_t * const weights, idx_t const nitems) diff --git a/tests/ccp_test.c b/tests/ccp_test.c index 72f5a62..d757c1a 100644 --- a/tests/ccp_test.c +++ b/tests/ccp_test.c @@ -196,14 +196,14 @@ CTEST2(ccp, probe) CTEST2(ccp, bigpart) { - idx_t const N = 2500000; + idx_t const N = 25000000; idx_t const P = 24; idx_t * weights = malloc(N * sizeof(*weights)); idx_t * parts = malloc((P+1) * sizeof(*weights)); for(idx_t x=0; x < N; ++x) { - weights[x] = rand_idx() % N; + weights[x] = rand_idx() % 100; } sp_timer_t part; From 890ddb6f2acb9a49997223f08e189c22ecd238a9 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 18 Jan 2016 20:09:44 -0600 Subject: [PATCH 3/4] removed old naive p1D code --- src/ccp/ccp.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/ccp/ccp.c b/src/ccp/ccp.c index 8009f9f..24dd543 100644 --- a/src/ccp/ccp.c +++ b/src/ccp/ccp.c @@ -112,22 +112,10 @@ idx_t partition_1d( { prefix_sum_inc(weights, nitems); - idx_t const total_weight = weights[nitems-1]; - nprobes = 0; - /* naive attempts */ - bool success; - idx_t bottleneck; -#if 0 - idx_t bottleneck = (total_weight / nparts) - 1; /* -1 because we inc first */ - do { - ++bottleneck; - success = lprobe(weights, nitems, parts, nparts, bottleneck); - } while(!success); -#else - bottleneck = p_eps_rb_partition_1d(weights, nitems, parts, nparts, 0); -#endif + /* use recursive bisectioning with 0 tolerance to get exact solution */ + idx_t bottleneck = p_eps_rb_partition_1d(weights, nitems, parts, nparts, 0); printf("nprobes: %lu\n", nprobes); From 0b49ff7d786714f3803857d811a9ae6ca483b548 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 18 Jan 2016 20:38:33 -0600 Subject: [PATCH 4/4] 3D partitioning now uses CCP --- src/ccp/ccp.c | 3 +++ src/cmds/mpi_cmd_cpd.c | 1 + src/mpi/mpi_io.c | 16 ++++++++++++++++ src/timer.c | 1 + src/timer.h | 1 + 5 files changed, 22 insertions(+) diff --git a/src/ccp/ccp.c b/src/ccp/ccp.c index 24dd543..d6615ef 100644 --- a/src/ccp/ccp.c +++ b/src/ccp/ccp.c @@ -5,6 +5,7 @@ *****************************************************************************/ #include "ccp.h" +#include "../timer.h" /****************************************************************************** @@ -110,6 +111,7 @@ idx_t partition_1d( idx_t * const parts, idx_t const nparts) { + timer_start(&timers[TIMER_PART]); prefix_sum_inc(weights, nitems); nprobes = 0; @@ -119,6 +121,7 @@ idx_t partition_1d( printf("nprobes: %lu\n", nprobes); + timer_stop(&timers[TIMER_PART]); return bottleneck; } diff --git a/src/cmds/mpi_cmd_cpd.c b/src/cmds/mpi_cmd_cpd.c index a3cb9ef..ffc95a8 100644 --- a/src/cmds/mpi_cmd_cpd.c +++ b/src/cmds/mpi_cmd_cpd.c @@ -102,6 +102,7 @@ static error_t parse_cpd_opt( break; case 'v': args->opts[SPLATT_OPTION_VERBOSITY] += 1; + timer_inc_verbose(); break; case TT_TILE: args->opts[SPLATT_OPTION_TILE] = SPLATT_DENSETILE; diff --git a/src/mpi/mpi_io.c b/src/mpi/mpi_io.c index 0abf11c..7b2d6ef 100644 --- a/src/mpi/mpi_io.c +++ b/src/mpi/mpi_io.c @@ -6,6 +6,7 @@ #include "../io.h" #include "../timer.h" +#include "../ccp/ccp.h" /****************************************************************************** @@ -246,6 +247,20 @@ static void p_find_my_slices( { idx_t const * const dims = rinfo->global_dims; +#if 1 + for(idx_t m=0; m < nmodes; ++m) { + idx_t * parts = splatt_malloc((rinfo->dims_3d[m] + 1) * sizeof(*parts)); + + /* optimally partition this mode */ + partition_1d(ssizes[m], dims[m], parts, rinfo->dims_3d[m]); + + rinfo->layer_starts[m] = parts[rinfo->coords_3d[m]]; + rinfo->layer_ends[m] = parts[rinfo->coords_3d[m]+1]; + + splatt_free(parts); + } + +#else /* find start/end slices for my partition */ for(idx_t m=0; m < nmodes; ++m) { idx_t pnnz = nnz / rinfo->dims_3d[m]; /* nnz in a layer */ @@ -304,6 +319,7 @@ static void p_find_my_slices( rinfo->layer_ends[m] = dims[m]; } } +#endif } diff --git a/src/timer.c b/src/timer.c index cc9875b..ed64784 100644 --- a/src/timer.c +++ b/src/timer.c @@ -28,6 +28,7 @@ static char const * const timer_names[] = { [TIMER_MATMUL] = "MAT MULT", [TIMER_ATA] = "MAT A^TA", [TIMER_MATNORM] = "MAT NORM", + [TIMER_PART] = "PART1D", [TIMER_MISC] = "MISC", #ifdef SPLATT_USE_MPI [TIMER_MPI] = "MPI", diff --git a/src/timer.h b/src/timer.h index 92c371b..a1f3661 100644 --- a/src/timer.h +++ b/src/timer.h @@ -42,6 +42,7 @@ typedef enum TIMER_ATA, TIMER_MATNORM, TIMER_IO, + TIMER_PART, TIMER_LVL2, /* LEVEL 2 */ #ifdef SPLATT_USE_MPI TIMER_MPI,