diff --git a/CHANGES.md b/CHANGES.md index 210810a..8cecd45 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,7 @@ * Binary file IO now supported. * Better handled missing input files. * Added CREDITS.md +* Chains-on-chains partition available 1.1.1 ===== diff --git a/CMakeLists.txt b/CMakeLists.txt index 7943221..f58b626 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,7 +102,7 @@ endif() set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -file(GLOB SPLATT_SOURCES src/*.c ${MPI_SOURCES}) +file(GLOB SPLATT_SOURCES src/*.c src/ccp/*.c ${MPI_SOURCES}) # Generate splatt library add_subdirectory(lib) diff --git a/src/ccp/ccp.c b/src/ccp/ccp.c new file mode 100644 index 0000000..11bda81 --- /dev/null +++ b/src/ccp/ccp.c @@ -0,0 +1,182 @@ + + +/****************************************************************************** + * INCLUDES + *****************************************************************************/ + +#include "ccp.h" +#include "../timer.h" + + +/****************************************************************************** + * PRIVATE FUNCTIONS + *****************************************************************************/ + +static idx_t nprobes = 0; + + +/** +* @brief Perform a linear search on an array for a value. +* +* @param weights The array to search. +* @param left The lower bound to begin at. +* @param right The upper (exclusive) bound of items. +* @param target The target value. +* +* @return The index j, where weights[j] <= target && weights[j+1] > target. +*/ +static idx_t p_linear_search( + idx_t const * const weights, + idx_t const left, + idx_t const right, + idx_t const target) +{ + for(idx_t x=left; x < right-1; ++x) { + if(weights[x+1] > target) { + return x; + } + } + + return right; +} + + +/** +* @brief Perform a binary search on an array for a value. +* +* @param weights The array to search. +* @param left The lower bound to begin at. +* @param right The upper (exclusive) bound of items. +* @param target The target value. +* +* @return The index j, where weights[j] <= target && weights[j+1] > target. +*/ +static idx_t p_binary_search( + idx_t const * const weights, + idx_t left, + idx_t right, + idx_t const target) +{ + while((right - left) > 8) { + idx_t mid = left + ((right - left) / 2); + + if(weights[mid] <= target && weights[mid+1] > target) { + return mid; + } + + if(weights[mid] < target) { + left = mid + 1; + } else { + right = mid; + } + } + + return p_linear_search(weights, left, right, target); +} + + + +static idx_t p_eps_rb_partition_1d( + idx_t * const weights, + idx_t const nitems, + idx_t * const parts, + idx_t const nparts, + idx_t const eps) +{ + idx_t lower = weights[nitems-1] / nparts; + idx_t upper = weights[nitems-1]; + + do { + idx_t mid = lower + ((upper - lower) / 2); + if(lprobe(weights, nitems, parts, nparts, mid)) { + upper = mid; + } else { + lower = mid+1; + } + } while(upper > lower + eps); + + return upper; +} + + + + +/****************************************************************************** + * PUBLIC FUNCTIONS + *****************************************************************************/ + +idx_t partition_1d( + idx_t * const weights, + idx_t const nitems, + idx_t * const parts, + idx_t const nparts) +{ + timer_start(&timers[TIMER_PART]); + prefix_sum_inc(weights, nitems); + + nprobes = 0; + + /* use recursive bisectioning with 0 tolerance to get exact solution */ + idx_t bottleneck = p_eps_rb_partition_1d(weights, nitems, parts, nparts, 0); + + timer_stop(&timers[TIMER_PART]); + return bottleneck; +} + + + +bool lprobe( + idx_t const * const weights, + idx_t const nitems, + idx_t * const parts, + idx_t const nparts, + idx_t const bottleneck) +{ + idx_t p=0; + parts[p++] = 0; + idx_t bsum = bottleneck; + + idx_t const wtotal = weights[nitems-1]; + + idx_t step = nitems / nparts; + while(p < nparts && bsum < wtotal) { + while(step < nitems && weights[step] < bsum) { + step += nitems / nparts; + } + parts[p] = p_binary_search(weights, step - (nitems/nparts), SS_MIN(step, nitems), + bsum); + bsum = weights[parts[p]] + bottleneck; + ++p; + } + + parts[p] = nitems; + + ++nprobes; + return bsum >= wtotal; +} + + + +void prefix_sum_inc( + idx_t * const weights, + idx_t const nitems) +{ + for(idx_t x=1; x < nitems; ++x) { + weights[x] += weights[x-1]; + } +} + + + +void prefix_sum_exc( + idx_t * const weights, + idx_t const nitems) +{ + idx_t saved = weights[0]; + weights[0] = 0; + for(idx_t x=1; x < nitems; ++x) { + idx_t const tmp = weights[x]; + weights[x] = weights[x-1] + saved; + saved = tmp; + } +} diff --git a/src/ccp/ccp.h b/src/ccp/ccp.h new file mode 100644 index 0000000..7fd4bf3 --- /dev/null +++ b/src/ccp/ccp.h @@ -0,0 +1,58 @@ +#ifndef SPLATT_CCP_CCP_H +#define SPLATT_CCP_CCP_H + +#include "../base.h" + + +/****************************************************************************** + * INCLUDES + *****************************************************************************/ +#include + + + + +/****************************************************************************** + * PUBLIC FUNCTIONS + *****************************************************************************/ + +#define partition_1d splatt_partition_1d +idx_t partition_1d( + idx_t * const weights, + idx_t const nitems, + idx_t * const parts, + idx_t const nparts); + + +bool lprobe( + idx_t const * const weights, + idx_t const nitems, + idx_t * const parts, + idx_t const nparts, + idx_t const bottleneck); + + +#define prefix_sum_inc splatt_prefix_sum_inc +/** +* @brief Compute an inclusive prefix sum: [3, 4, 5] -> [3, 7, 12]. +* +* @param weights The numbers to sum. +* @param nitems The number of items in 'weights'. +*/ +void prefix_sum_inc( + idx_t * const weights, + idx_t const nitems); + + +#define prefix_sum_exc splatt_prefix_sum_exc +/** +* @brief Compute an exclusive prefix sum: [3, 4, 5] -> [0, 3, 7]. +* +* @param weights The numbers to sum. +* @param nitems The number of items in 'weights'. +*/ +void prefix_sum_exc( + idx_t * const weights, + idx_t const nitems); + +#endif diff --git a/src/mpi/mpi_io.c b/src/mpi/mpi_io.c index 24e600d..9dd0124 100644 --- a/src/mpi/mpi_io.c +++ b/src/mpi/mpi_io.c @@ -8,7 +8,6 @@ #include "../util.h" - /****************************************************************************** * API FUNCTONS *****************************************************************************/ @@ -279,7 +278,6 @@ static sptensor_t * p_rearrange_by_part( } - static void p_find_my_slices_1d( idx_t ** const ssizes, idx_t const nmodes, diff --git a/src/timer.c b/src/timer.c index cc9875b..ed64784 100644 --- a/src/timer.c +++ b/src/timer.c @@ -28,6 +28,7 @@ static char const * const timer_names[] = { [TIMER_MATMUL] = "MAT MULT", [TIMER_ATA] = "MAT A^TA", [TIMER_MATNORM] = "MAT NORM", + [TIMER_PART] = "PART1D", [TIMER_MISC] = "MISC", #ifdef SPLATT_USE_MPI [TIMER_MPI] = "MPI", diff --git a/src/timer.h b/src/timer.h index 92c371b..a1f3661 100644 --- a/src/timer.h +++ b/src/timer.h @@ -42,6 +42,7 @@ typedef enum TIMER_ATA, TIMER_MATNORM, TIMER_IO, + TIMER_PART, TIMER_LVL2, /* LEVEL 2 */ #ifdef SPLATT_USE_MPI TIMER_MPI, diff --git a/tests/ccp_test.c b/tests/ccp_test.c new file mode 100644 index 0000000..5515394 --- /dev/null +++ b/tests/ccp_test.c @@ -0,0 +1,248 @@ + +#include "../src/ccp/ccp.h" +#include "../src/util.h" +#include "../src/sort.h" + +#include "ctest/ctest.h" + +#include "splatt_test.h" + +#define NUM_CCP_TESTS 6 + + +CTEST_DATA(ccp) +{ + idx_t P; + idx_t * parts; + idx_t N; + idx_t * unit_data; + idx_t * rand_data; + idx_t * sorted_data; + idx_t * fororder_data; + idx_t * revorder_data; + idx_t * bigend_data; + + idx_t * ptrs[NUM_CCP_TESTS]; +}; + + +CTEST_SETUP(ccp) +{ + data->P = 31; + data->parts = calloc(data->P + 1, sizeof(*(data->parts))); + + data->N = 500; + data->rand_data = malloc(data->N * sizeof(*(data->rand_data))); + data->sorted_data = malloc(data->N * sizeof(*(data->sorted_data))); + data->fororder_data = malloc(data->N * sizeof(*(data->fororder_data))); + data->revorder_data = malloc(data->N * sizeof(*(data->revorder_data))); + data->bigend_data = malloc(data->N * sizeof(*(data->bigend_data))); + data->unit_data = malloc(data->N * sizeof(*(data->unit_data))); + + for(idx_t x=0; x < data->N; ++x) { + data->unit_data[x] = 1; + data->rand_data[x] = rand_idx() % 131; + data->sorted_data[x] = rand_idx() % 131; + data->bigend_data[x] = rand_idx() % 131; + + data->fororder_data[x] = x; + data->revorder_data[x] = data->N - x; + } + + + splatt_quicksort(data->sorted_data, data->N); + data->bigend_data[data->N - 1] = 999; + + data->ptrs[0] = data->rand_data; + data->ptrs[1] = data->sorted_data; + data->ptrs[2] = data->fororder_data; + data->ptrs[3] = data->revorder_data; + data->ptrs[4] = data->bigend_data; + data->ptrs[5] = data->unit_data; +} + +CTEST_TEARDOWN(ccp) +{ + free(data->parts); + for(idx_t t=0; t < NUM_CCP_TESTS; ++t) { + free(data->ptrs[t]); + } +} + + +CTEST2(ccp, prefix_sum_inc) +{ + idx_t * pref = malloc(data->N * sizeof(*pref)); + + for(idx_t t=0; t < NUM_CCP_TESTS; ++t) { + idx_t * const restrict weights = data->ptrs[t]; + + /* make a copy */ + memcpy(pref, weights, data->N * sizeof(*pref)); + + prefix_sum_inc(pref, data->N); + + idx_t running = 0; + for(idx_t x=0; x < data->N; ++x) { + running += weights[x]; + ASSERT_EQUAL(running, pref[x]); + } + } + free(pref); +} + + +CTEST2(ccp, prefix_sum_exc) +{ + /* make a copy */ + idx_t * pref = malloc(data->N * sizeof(*pref)); + + /* foreach test */ + for(idx_t t=0; t < NUM_CCP_TESTS; ++t) { + idx_t * const restrict weights = data->ptrs[t]; + memcpy(pref, weights, data->N * sizeof(*pref)); + + prefix_sum_exc(pref, data->N); + + idx_t running = 0; + for(idx_t x=0; x < data->N; ++x) { + ASSERT_EQUAL(running, pref[x]); + running += weights[x]; + } + } + + free(pref); +} + + +CTEST2(ccp, partition_1d) +{ + /* foreach test */ + for(idx_t t=0; t < NUM_CCP_TESTS; ++t) { + idx_t * const restrict weights = data->ptrs[t]; + idx_t bneck = partition_1d(weights, data->N, data->parts, data->P); + + /* check bounds */ + ASSERT_EQUAL(0, data->parts[0]); + ASSERT_EQUAL(data->N, data->parts[data->P]); + + /* check non-overlapping partitions */ + for(idx_t p=1; p < data->P; ++p) { + /* if N < P, someone will have no work */ + if(data->parts[p] <= data->parts[p-1]) { + ASSERT_FAIL(); + } + } + + /* check that bneck is not surpassed */ + for(idx_t p=0; p < data->P; ++p) { + if(weights[p+1] - weights[p] > bneck) { + ASSERT_FAIL(); + } + } + + /* check actual optimality */ + bool success; + success = lprobe(weights, data->N, data->parts, data->P, bneck); + ASSERT_EQUAL(true, success); + success = lprobe(weights, data->N, data->parts, data->P, bneck-1); + ASSERT_EQUAL(false, success); + + } /* end foreach test */ +} + + +CTEST2(ccp, probe) +{ + idx_t total = 0; + for(idx_t x=0; x < data->N; ++x) { + total += data->rand_data[x]; + } + + prefix_sum_exc(data->rand_data, data->N); + bool result = lprobe(data->rand_data, data->N, data->parts, data->P, + (total / data->P) - 1); + ASSERT_EQUAL(false, result); + + idx_t bottleneck = total / data->P; + while(!result) { + result = lprobe(data->rand_data, data->N, data->parts, data->P, bottleneck); + ++bottleneck; + } + --bottleneck; + + /* check bounds */ + ASSERT_EQUAL(0, data->parts[0]); + ASSERT_EQUAL(data->N, data->parts[data->P]); + + /* check non-overlapping partitions */ + for(idx_t p=1; p < data->P; ++p) { + /* if N < P, someone will have no work */ + if(data->parts[p] <= data->parts[p-1]) { + ASSERT_FAIL(); + } + } + + /* check actual bneck */ + for(idx_t p=1; p < data->P; ++p) { + /* if N < P, someone will have no work */ + if(data->parts[p] - data->parts[p-1] > bottleneck) { + ASSERT_FAIL(); + } + } + +} + + +CTEST2(ccp, bigpart) +{ + idx_t const N = 25000000; + idx_t const P = 24; + + idx_t * weights = malloc(N * sizeof(*weights)); + idx_t * parts = malloc((P+1) * sizeof(*weights)); + + for(idx_t x=0; x < N; ++x) { + weights[x] = rand_idx() % 100; + } + + sp_timer_t part; + timer_fstart(&part); + idx_t const bneck = partition_1d(weights, N, parts, P); + timer_stop(&part); + + /* correctness */ + bool success; + success = lprobe(weights, N, parts, P, bneck); + ASSERT_EQUAL(true, success); + success = lprobe(weights, N, parts, P, bneck-1); + ASSERT_EQUAL(false, success); + + free(weights); + free(parts); +} + + +CTEST2(ccp, part_equalsize) +{ + idx_t const P = 24; + idx_t const CHUNK = 10000; + idx_t const N = P * CHUNK; + + idx_t * weights = malloc(N * sizeof(*weights)); + idx_t * parts = malloc((P+1) * sizeof(*weights)); + + for(idx_t x=0; x < N; ++x) { + weights[x] = 1; + } + + idx_t const bneck = partition_1d(weights, N, parts, P); + ASSERT_EQUAL(CHUNK, bneck); + + for(idx_t p=0; p < P; ++p) { + ASSERT_EQUAL(CHUNK * p, parts[p]); + } + + free(weights); + free(parts); +}