Skip to content

Commit

Permalink
Add XSBench
Browse files Browse the repository at this point in the history
  • Loading branch information
kchristin22 committed Dec 2, 2024
1 parent 2556ba6 commit 4e6e569
Show file tree
Hide file tree
Showing 10 changed files with 2,688 additions and 0 deletions.
268 changes: 268 additions & 0 deletions test/CUDA/XSBench/GridInit.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
#include "XSbench_header.cuh"

// Moves all required data structures to the GPU's memory space
SimulationData move_simulation_data_to_device( Inputs in, int mype, SimulationData SD )
{
if(mype == 0) printf("Allocating and moving simulation data to GPU memory space...\n");

////////////////////////////////////////////////////////////////////////////////
// SUMMARY: Simulation Data Structure Manifest for "SD" Object
// Here we list all heap arrays (and lengths) in SD that would need to be
// offloaded manually if using an accelerator with a seperate memory space
////////////////////////////////////////////////////////////////////////////////
// int * num_nucs; // Length = length_num_nucs;
// double * concs; // Length = length_concs
// int * mats; // Length = length_mats
// double * unionized_energy_array; // Length = length_unionized_energy_array
// int * index_grid; // Length = length_index_grid
// NuclideGridPoint * nuclide_grid; // Length = length_nuclide_grid
//
// Note: "unionized_energy_array" and "index_grid" can be of zero length
// depending on lookup method.
//
// Note: "Lengths" are given as the number of objects in the array, not the
// number of bytes.
////////////////////////////////////////////////////////////////////////////////
size_t sz;
size_t total_sz = 0;

// Shallow copy of CPU simulation data to GPU simulation data
SimulationData GSD = SD;

// Move data to GPU memory space
sz = GSD.length_num_nucs * sizeof(int);
gpuErrchk( cudaMalloc((void **) &GSD.num_nucs, sz) );
gpuErrchk( cudaMemcpy(GSD.num_nucs, SD.num_nucs, sz, cudaMemcpyHostToDevice) );
total_sz += sz;

sz = GSD.length_concs * sizeof(double);
gpuErrchk( cudaMalloc((void **) &GSD.concs, sz) );
gpuErrchk( cudaMemcpy(GSD.concs, SD.concs, sz, cudaMemcpyHostToDevice) );
total_sz += sz;

sz = GSD.length_mats * sizeof(int);
gpuErrchk( cudaMalloc((void **) &GSD.mats, sz) );
gpuErrchk( cudaMemcpy(GSD.mats, SD.mats, sz, cudaMemcpyHostToDevice) );
total_sz += sz;

sz = GSD.length_unionized_energy_array * sizeof(double);
gpuErrchk( cudaMalloc((void **) &GSD.unionized_energy_array, sz) );
gpuErrchk( cudaMemcpy(GSD.unionized_energy_array, SD.unionized_energy_array, sz, cudaMemcpyHostToDevice) );
total_sz += sz;

sz = GSD.length_index_grid * sizeof(int);
gpuErrchk( cudaMalloc((void **) &GSD.index_grid, sz) );
gpuErrchk( cudaMemcpy(GSD.index_grid, SD.index_grid, sz, cudaMemcpyHostToDevice) );
total_sz += sz;

sz = GSD.length_nuclide_grid * sizeof(NuclideGridPoint);
gpuErrchk( cudaMalloc((void **) &GSD.nuclide_grid, sz) );
gpuErrchk( cudaMemcpy(GSD.nuclide_grid, SD.nuclide_grid, sz, cudaMemcpyHostToDevice) );
total_sz += sz;

sz = GSD.length_nuclide_grid * sizeof(NuclideGridPoint);
gpuErrchk( cudaMalloc((void **) &GSD.d_nuclide_grid, sz) );
gpuErrchk( cudaMemcpy(GSD.d_nuclide_grid, SD.d_nuclide_grid, sz, cudaMemcpyHostToDevice) );
total_sz += sz;

// Allocate verification array on device. This structure is not needed on CPU, so we don't
// have to copy anything over.
sz = in.lookups * sizeof(unsigned long);
gpuErrchk( cudaMalloc((void **) &GSD.verification, sz) );
total_sz += sz;
GSD.length_verification = in.lookups;

#ifdef PRINT
gpuErrchk( cudaMalloc((void **) &GSD.dout, sizeof(double)) );
double zero = 0;
gpuErrchk( cudaMemcpy(GSD.dout, &zero, sizeof(double), cudaMemcpyHostToDevice) );
#endif

// Synchronize
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );

if(mype == 0 ) printf("GPU Intialization complete. Allocated %.0lf MB of data on GPU.\n", total_sz/1024.0/1024.0 );

return GSD;

}

SimulationData grid_init_do_not_profile( Inputs in, int mype )
{
// Structure to hold all allocated simuluation data arrays
SimulationData SD;

// Keep track of how much data we're allocating
size_t nbytes = 0;

// Set the initial seed value
uint64_t seed = 42;

////////////////////////////////////////////////////////////////////
// Initialize Nuclide Grids
////////////////////////////////////////////////////////////////////

if(mype == 0) printf("Intializing nuclide grids...\n");

// First, we need to initialize our nuclide grid. This comes in the form
// of a flattened 2D array that hold all the information we need to define
// the cross sections for all isotopes in the simulation.
// The grid is composed of "NuclideGridPoint" structures, which hold the
// energy level of the grid point and all associated XS data at that level.
// An array of structures (AOS) is used instead of
// a structure of arrays, as the grid points themselves are accessed in
// a random order, but all cross section interaction channels and the
// energy level are read whenever the gridpoint is accessed, meaning the
// AOS is more cache efficient.

// Initialize Nuclide Grid
SD.length_nuclide_grid = in.n_isotopes * in.n_gridpoints;
SD.nuclide_grid = (NuclideGridPoint *) malloc( SD.length_nuclide_grid * sizeof(NuclideGridPoint));
SD.d_nuclide_grid = (NuclideGridPoint *) calloc( SD.length_nuclide_grid , sizeof(NuclideGridPoint));
assert(SD.nuclide_grid != NULL);
nbytes += SD.length_nuclide_grid * sizeof(NuclideGridPoint);
for( int i = 0; i < SD.length_nuclide_grid; i++ )
{
SD.nuclide_grid[i].energy = LCG_random_double(&seed);
SD.nuclide_grid[i].total_xs = LCG_random_double(&seed);
SD.nuclide_grid[i].elastic_xs = LCG_random_double(&seed);
SD.nuclide_grid[i].absorbtion_xs = LCG_random_double(&seed);
SD.nuclide_grid[i].fission_xs = LCG_random_double(&seed);
SD.nuclide_grid[i].nu_fission_xs = LCG_random_double(&seed);
}

// Sort so that each nuclide has data stored in ascending energy order.
for( int i = 0; i < in.n_isotopes; i++ )
qsort( &SD.nuclide_grid[i*in.n_gridpoints], in.n_gridpoints, sizeof(NuclideGridPoint), NGP_compare);

#ifdef FORWARD_PASS
memcpy(SD.d_nuclide_grid, SD.nuclide_grid, SD.length_nuclide_grid * sizeof(NuclideGridPoint));
SD.d_nuclide_grid[0].energy += DELTA;
#endif
// error debug check
/*
for( int i = 0; i < in.n_isotopes; i++ )
{
printf("NUCLIDE %d ==============================\n", i);
for( int j = 0; j < in.n_gridpoints; j++ )
printf("E%d = %lf\n", j, SD.nuclide_grid[i * in.n_gridpoints + j].energy);
}
*/


////////////////////////////////////////////////////////////////////
// Initialize Acceleration Structure
////////////////////////////////////////////////////////////////////

if( in.grid_type == NUCLIDE )
{
SD.length_unionized_energy_array = 0;
SD.length_index_grid = 0;
}

if( in.grid_type == UNIONIZED )
{
if(mype == 0) printf("Intializing unionized grid...\n");

// Allocate space to hold the union of all nuclide energy data
SD.length_unionized_energy_array = in.n_isotopes * in.n_gridpoints;
SD.unionized_energy_array = (double *) malloc( SD.length_unionized_energy_array * sizeof(double));
assert(SD.unionized_energy_array != NULL );
nbytes += SD.length_unionized_energy_array * sizeof(double);

// Copy energy data over from the nuclide energy grid
for( int i = 0; i < SD.length_unionized_energy_array; i++ )
SD.unionized_energy_array[i] = SD.nuclide_grid[i].energy;

// Sort unionized energy array
qsort( SD.unionized_energy_array, SD.length_unionized_energy_array, sizeof(double), double_compare);

// Allocate space to hold the acceleration grid indices
SD.length_index_grid = SD.length_unionized_energy_array * in.n_isotopes;
SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int));
assert(SD.index_grid != NULL);
nbytes += SD.length_index_grid * sizeof(int);

// Generates the double indexing grid
int * idx_low = (int *) calloc( in.n_isotopes, sizeof(int));
assert(idx_low != NULL );
double * energy_high = (double *) malloc( in.n_isotopes * sizeof(double));
assert(energy_high != NULL );

for( int i = 0; i < in.n_isotopes; i++ )
energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + 1].energy;

for( long e = 0; e < SD.length_unionized_energy_array; e++ )
{
double unionized_energy = SD.unionized_energy_array[e];
for( long i = 0; i < in.n_isotopes; i++ )
{
if( unionized_energy < energy_high[i] )
SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
else if( idx_low[i] == in.n_gridpoints - 2 )
SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
else
{
idx_low[i]++;
SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + idx_low[i] + 1].energy;
}
}
}

free(idx_low);
free(energy_high);
}

if( in.grid_type == HASH )
{
if(mype == 0) printf("Intializing hash grid...\n");
SD.length_unionized_energy_array = 0;
SD.length_index_grid = in.hash_bins * in.n_isotopes;
SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int));
assert(SD.index_grid != NULL);
nbytes += SD.length_index_grid * sizeof(int);

double du = 1.0 / in.hash_bins;

// For each energy level in the hash table
for( long e = 0; e < in.hash_bins; e++ )
{
double energy = e * du;

// We need to determine the bounding energy levels for all isotopes
for( long i = 0; i < in.n_isotopes; i++ )
{
SD.index_grid[e * in.n_isotopes + i] = grid_search_nuclide( in.n_gridpoints, energy, SD.nuclide_grid + i * in.n_gridpoints, 0, in.n_gridpoints-1);
}
}
}

////////////////////////////////////////////////////////////////////
// Initialize Materials and Concentrations
////////////////////////////////////////////////////////////////////
if(mype == 0) printf("Intializing material data...\n");

// Set the number of nuclides in each material
SD.num_nucs = load_num_nucs(in.n_isotopes);
SD.length_num_nucs = 12; // There are always 12 materials in XSBench

// Intialize the flattened 2D grid of material data. The grid holds
// a list of nuclide indices for each of the 12 material types. The
// grid is allocated as a full square grid, even though not all
// materials have the same number of nuclides.
SD.mats = load_mats(SD.num_nucs, in.n_isotopes, &SD.max_num_nucs);
SD.length_mats = SD.length_num_nucs * SD.max_num_nucs;

// Intialize the flattened 2D grid of nuclide concentration data. The grid holds
// a list of nuclide concentrations for each of the 12 material types. The
// grid is allocated as a full square grid, even though not all
// materials have the same number of nuclides.
SD.concs = load_concs(SD.num_nucs, SD.max_num_nucs);
SD.length_concs = SD.length_mats;

if(mype == 0) printf("Intialization complete. Allocated %.0lf MB of data on CPU.\n", nbytes/1024.0/1024.0 );

return SD;
}
18 changes: 18 additions & 0 deletions test/CUDA/XSBench/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
Copyright (c) 2012-2019 Argonne National Laboratory

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
108 changes: 108 additions & 0 deletions test/CUDA/XSBench/Main.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#include "XSbench_header.cuh"

int main( int argc, char* argv[] )
{
// =====================================================================
// Initialization & Command Line Read-In
// =====================================================================
int version = 19;
int mype = 0;
double omp_start, omp_end;
int nprocs = 1;
unsigned long long verification;

// Process CLI Fields -- store in "Inputs" structure
Inputs in = read_CLI( argc, argv );

// Print-out of Input Summary
if( mype == 0 )
print_inputs( in, nprocs, version );

// =====================================================================
// Prepare Nuclide Energy Grids, Unionized Energy Grid, & Material Data
// This is not reflective of a real Monte Carlo simulation workload,
// therefore, do not profile this region!
// =====================================================================

SimulationData SD;

// If read from file mode is selected, skip initialization and load
// all simulation data structures from file instead
if( in.binary_mode == READ )
SD = binary_read(in);
else
SD = grid_init_do_not_profile( in, mype );

// If writing from file mode is selected, write all simulation data
// structures to file
if( in.binary_mode == WRITE && mype == 0 )
binary_write(in, SD);

// Move data to GPU
SimulationData GSD = move_simulation_data_to_device( in, mype, SD );

cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1*1024*1024*1024);

// =====================================================================
// Cross Section (XS) Parallel Lookup Simulation
// This is the section that should be profiled, as it reflects a
// realistic continuous energy Monte Carlo macroscopic cross section
// lookup kernel.
// =====================================================================
if( mype == 0 )
{
printf("\n");
border_print();
center_print("SIMULATION", 79);
border_print();
}

// Start Simulation Timer
omp_start = get_time();

// Run simulation
if( in.simulation_method == EVENT_BASED )
{
if( in.kernel_id == 0 )
verification = run_event_based_simulation_baseline(in, GSD, mype);
else if( in.kernel_id == 1 )
verification = run_event_based_simulation_optimization_1(in, GSD, mype);
else if( in.kernel_id == 2 )
verification = run_event_based_simulation_optimization_2(in, GSD, mype);
else if( in.kernel_id == 3 )
verification = run_event_based_simulation_optimization_3(in, GSD, mype);
else if( in.kernel_id == 4 )
verification = run_event_based_simulation_optimization_4(in, GSD, mype);
else if( in.kernel_id == 5 )
verification = run_event_based_simulation_optimization_5(in, GSD, mype);
else if( in.kernel_id == 6 )
verification = run_event_based_simulation_optimization_6(in, GSD, mype);
else
{
printf("Error: No kernel ID %d found!\n", in.kernel_id);
exit(1);
}
}
else
{
printf("History-based simulation not implemented in CUDA code. Instead,\nuse the event-based method with \"-m event\" argument.\n");
exit(1);
}

if( mype == 0)
{
printf("\n" );
printf("Simulation complete.\n" );
}

// End Simulation Timer
omp_end = get_time();

// Final Hash Step
verification = verification % 999983;

// Print / Save Results and Exit
int is_invalid_result = print_results( in, mype, omp_end-omp_start, nprocs, verification );

return is_invalid_result;
}
Loading

0 comments on commit 4e6e569

Please sign in to comment.