Add XSBench

kchristin22 · Dec 2, 2024 · 4e6e569 · 4e6e569
1 parent 2556ba6
commit 4e6e569
Show file tree

Hide file tree

Showing 10 changed files with 2,688 additions and 0 deletions.
diff --git a/test/CUDA/XSBench/GridInit.cu b/test/CUDA/XSBench/GridInit.cu
@@ -0,0 +1,268 @@
+#include "XSbench_header.cuh"
+
+// Moves all required data structures to the GPU's memory space
+SimulationData move_simulation_data_to_device( Inputs in, int mype, SimulationData SD )
+{
+	if(mype == 0) printf("Allocating and moving simulation data to GPU memory space...\n");
+
+	////////////////////////////////////////////////////////////////////////////////
+	// SUMMARY: Simulation Data Structure Manifest for "SD" Object
+	// Here we list all heap arrays (and lengths) in SD that would need to be
+	// offloaded manually if using an accelerator with a seperate memory space
+	////////////////////////////////////////////////////////////////////////////////
+	// int * num_nucs;                     // Length = length_num_nucs;
+	// double * concs;                     // Length = length_concs
+	// int * mats;                         // Length = length_mats
+	// double * unionized_energy_array;    // Length = length_unionized_energy_array
+	// int * index_grid;                   // Length = length_index_grid
+	// NuclideGridPoint * nuclide_grid;    // Length = length_nuclide_grid
+	// 
+	// Note: "unionized_energy_array" and "index_grid" can be of zero length
+	//        depending on lookup method.
+	//
+	// Note: "Lengths" are given as the number of objects in the array, not the
+	//       number of bytes.
+	////////////////////////////////////////////////////////////////////////////////
+	size_t sz;
+	size_t total_sz = 0;
+
+	// Shallow copy of CPU simulation data to GPU simulation data
+	SimulationData GSD = SD;
+
+	// Move data to GPU memory space
+	sz = GSD.length_num_nucs * sizeof(int);
+	gpuErrchk( cudaMalloc((void **) &GSD.num_nucs, sz) );
+	gpuErrchk( cudaMemcpy(GSD.num_nucs, SD.num_nucs, sz, cudaMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_concs * sizeof(double);
+	gpuErrchk( cudaMalloc((void **) &GSD.concs, sz) );
+	gpuErrchk( cudaMemcpy(GSD.concs, SD.concs, sz, cudaMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_mats * sizeof(int);
+	gpuErrchk( cudaMalloc((void **) &GSD.mats, sz) );
+	gpuErrchk( cudaMemcpy(GSD.mats, SD.mats, sz, cudaMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_unionized_energy_array * sizeof(double);
+	gpuErrchk( cudaMalloc((void **) &GSD.unionized_energy_array, sz) );
+	gpuErrchk( cudaMemcpy(GSD.unionized_energy_array, SD.unionized_energy_array, sz, cudaMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_index_grid * sizeof(int);
+	gpuErrchk( cudaMalloc((void **) &GSD.index_grid, sz) );
+	gpuErrchk( cudaMemcpy(GSD.index_grid, SD.index_grid, sz, cudaMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_nuclide_grid * sizeof(NuclideGridPoint);
+	gpuErrchk( cudaMalloc((void **) &GSD.nuclide_grid, sz) );
+	gpuErrchk( cudaMemcpy(GSD.nuclide_grid, SD.nuclide_grid, sz, cudaMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_nuclide_grid * sizeof(NuclideGridPoint);
+	gpuErrchk( cudaMalloc((void **) &GSD.d_nuclide_grid, sz) );
+	gpuErrchk( cudaMemcpy(GSD.d_nuclide_grid, SD.d_nuclide_grid, sz, cudaMemcpyHostToDevice) );
+	total_sz += sz;
+
+	// Allocate verification array on device. This structure is not needed on CPU, so we don't
+	// have to copy anything over.
+	sz = in.lookups * sizeof(unsigned long);
+	gpuErrchk( cudaMalloc((void **) &GSD.verification, sz) );
+	total_sz += sz;
+	GSD.length_verification = in.lookups;
+
+#ifdef PRINT
+	gpuErrchk( cudaMalloc((void **) &GSD.dout, sizeof(double)) );
+	double zero = 0;
+	gpuErrchk( cudaMemcpy(GSD.dout, &zero, sizeof(double), cudaMemcpyHostToDevice) );
+#endif
+
+	// Synchronize
+	gpuErrchk( cudaPeekAtLastError() );
+	gpuErrchk( cudaDeviceSynchronize() );
+
+	if(mype == 0 ) printf("GPU Intialization complete. Allocated %.0lf MB of data on GPU.\n", total_sz/1024.0/1024.0 );
+
+	return GSD;
+
+}
+
+SimulationData grid_init_do_not_profile( Inputs in, int mype )
+{
+	// Structure to hold all allocated simuluation data arrays
+	SimulationData SD;
+
+	// Keep track of how much data we're allocating
+	size_t nbytes = 0;
+
+	// Set the initial seed value
+	uint64_t seed = 42;	
+
+	////////////////////////////////////////////////////////////////////
+	// Initialize Nuclide Grids
+	////////////////////////////////////////////////////////////////////
+
+	if(mype == 0) printf("Intializing nuclide grids...\n");
+
+	// First, we need to initialize our nuclide grid. This comes in the form
+	// of a flattened 2D array that hold all the information we need to define
+	// the cross sections for all isotopes in the simulation. 
+	// The grid is composed of "NuclideGridPoint" structures, which hold the
+	// energy level of the grid point and all associated XS data at that level.
+	// An array of structures (AOS) is used instead of
+	// a structure of arrays, as the grid points themselves are accessed in 
+	// a random order, but all cross section interaction channels and the
+	// energy level are read whenever the gridpoint is accessed, meaning the
+	// AOS is more cache efficient.
+
+	// Initialize Nuclide Grid
+	SD.length_nuclide_grid = in.n_isotopes * in.n_gridpoints;
+	SD.nuclide_grid     = (NuclideGridPoint *) malloc( SD.length_nuclide_grid * sizeof(NuclideGridPoint));
+	SD.d_nuclide_grid     = (NuclideGridPoint *) calloc( SD.length_nuclide_grid , sizeof(NuclideGridPoint));
+	assert(SD.nuclide_grid != NULL);
+	nbytes += SD.length_nuclide_grid * sizeof(NuclideGridPoint);
+	for( int i = 0; i < SD.length_nuclide_grid; i++ )
+	{
+		SD.nuclide_grid[i].energy        = LCG_random_double(&seed);
+		SD.nuclide_grid[i].total_xs      = LCG_random_double(&seed);
+		SD.nuclide_grid[i].elastic_xs    = LCG_random_double(&seed);
+		SD.nuclide_grid[i].absorbtion_xs = LCG_random_double(&seed);
+		SD.nuclide_grid[i].fission_xs    = LCG_random_double(&seed);
+		SD.nuclide_grid[i].nu_fission_xs = LCG_random_double(&seed);
+	}
+
+	// Sort so that each nuclide has data stored in ascending energy order.
+	for( int i = 0; i < in.n_isotopes; i++ )
+		qsort( &SD.nuclide_grid[i*in.n_gridpoints], in.n_gridpoints, sizeof(NuclideGridPoint), NGP_compare);
+
+	#ifdef FORWARD_PASS
+	memcpy(SD.d_nuclide_grid, SD.nuclide_grid, SD.length_nuclide_grid * sizeof(NuclideGridPoint));
+	SD.d_nuclide_grid[0].energy += DELTA;
+	#endif	
+	// error debug check
+	/*
+	for( int i = 0; i < in.n_isotopes; i++ )
+	{
+		printf("NUCLIDE %d ==============================\n", i);
+		for( int j = 0; j < in.n_gridpoints; j++ )
+			printf("E%d = %lf\n", j, SD.nuclide_grid[i * in.n_gridpoints + j].energy);
+	}
+	*/
+
+
+	////////////////////////////////////////////////////////////////////
+	// Initialize Acceleration Structure
+	////////////////////////////////////////////////////////////////////
+
+	if( in.grid_type == NUCLIDE )
+	{
+		SD.length_unionized_energy_array = 0;
+		SD.length_index_grid = 0;
+	}
+
+	if( in.grid_type == UNIONIZED )
+	{
+		if(mype == 0) printf("Intializing unionized grid...\n");
+
+		// Allocate space to hold the union of all nuclide energy data
+		SD.length_unionized_energy_array = in.n_isotopes * in.n_gridpoints;
+		SD.unionized_energy_array = (double *) malloc( SD.length_unionized_energy_array * sizeof(double));
+		assert(SD.unionized_energy_array != NULL );
+		nbytes += SD.length_unionized_energy_array * sizeof(double);
+
+		// Copy energy data over from the nuclide energy grid
+		for( int i = 0; i < SD.length_unionized_energy_array; i++ )
+			SD.unionized_energy_array[i] = SD.nuclide_grid[i].energy;
+
+		// Sort unionized energy array
+		qsort( SD.unionized_energy_array, SD.length_unionized_energy_array, sizeof(double), double_compare);
+
+		// Allocate space to hold the acceleration grid indices
+		SD.length_index_grid = SD.length_unionized_energy_array * in.n_isotopes;
+		SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int));
+		assert(SD.index_grid != NULL);
+		nbytes += SD.length_index_grid * sizeof(int);
+
+		// Generates the double indexing grid
+		int * idx_low = (int *) calloc( in.n_isotopes, sizeof(int));
+		assert(idx_low != NULL );
+		double * energy_high = (double *) malloc( in.n_isotopes * sizeof(double));
+		assert(energy_high != NULL );
+
+		for( int i = 0; i < in.n_isotopes; i++ )
+			energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + 1].energy;
+
+		for( long e = 0; e < SD.length_unionized_energy_array; e++ )
+		{
+			double unionized_energy = SD.unionized_energy_array[e];
+			for( long i = 0; i < in.n_isotopes; i++ )
+			{
+				if( unionized_energy < energy_high[i]  )
+					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
+				else if( idx_low[i] == in.n_gridpoints - 2 )
+					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
+				else
+				{
+					idx_low[i]++;
+					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
+					energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + idx_low[i] + 1].energy;	
+				}
+			}
+		}
+
+		free(idx_low);
+		free(energy_high);
+	}
+
+	if( in.grid_type == HASH )
+	{
+		if(mype == 0) printf("Intializing hash grid...\n");
+		SD.length_unionized_energy_array = 0;
+		SD.length_index_grid  = in.hash_bins * in.n_isotopes;
+		SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int)); 
+		assert(SD.index_grid != NULL);
+		nbytes += SD.length_index_grid * sizeof(int);
+
+		double du = 1.0 / in.hash_bins;
+
+		// For each energy level in the hash table
+		for( long e = 0; e < in.hash_bins; e++ )
+		{
+			double energy = e * du;
+
+			// We need to determine the bounding energy levels for all isotopes
+			for( long i = 0; i < in.n_isotopes; i++ )
+			{
+				SD.index_grid[e * in.n_isotopes + i] = grid_search_nuclide( in.n_gridpoints, energy, SD.nuclide_grid + i * in.n_gridpoints, 0, in.n_gridpoints-1);
+			}
+		}
+	}
+
+	////////////////////////////////////////////////////////////////////
+	// Initialize Materials and Concentrations
+	////////////////////////////////////////////////////////////////////
+	if(mype == 0) printf("Intializing material data...\n");
+
+	// Set the number of nuclides in each material
+	SD.num_nucs  = load_num_nucs(in.n_isotopes);
+	SD.length_num_nucs = 12; // There are always 12 materials in XSBench
+
+	// Intialize the flattened 2D grid of material data. The grid holds
+	// a list of nuclide indices for each of the 12 material types. The
+	// grid is allocated as a full square grid, even though not all
+	// materials have the same number of nuclides.
+	SD.mats = load_mats(SD.num_nucs, in.n_isotopes, &SD.max_num_nucs);
+	SD.length_mats = SD.length_num_nucs * SD.max_num_nucs;
+
+	// Intialize the flattened 2D grid of nuclide concentration data. The grid holds
+	// a list of nuclide concentrations for each of the 12 material types. The
+	// grid is allocated as a full square grid, even though not all
+	// materials have the same number of nuclides.
+	SD.concs = load_concs(SD.num_nucs, SD.max_num_nucs);
+	SD.length_concs = SD.length_mats;
+
+	if(mype == 0) printf("Intialization complete. Allocated %.0lf MB of data on CPU.\n", nbytes/1024.0/1024.0 );
+
+	return SD;
+}
diff --git a/test/CUDA/XSBench/LICENSE b/test/CUDA/XSBench/LICENSE
@@ -0,0 +1,18 @@
+Copyright (c) 2012-2019 Argonne National Laboratory
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/test/CUDA/XSBench/Main.cu b/test/CUDA/XSBench/Main.cu
@@ -0,0 +1,108 @@
+#include "XSbench_header.cuh"
+
+int main( int argc, char* argv[] )
+{
+	// =====================================================================
+	// Initialization & Command Line Read-In
+	// =====================================================================
+	int version = 19;
+	int mype = 0;
+	double omp_start, omp_end;
+	int nprocs = 1;
+	unsigned long long verification;
+
+	// Process CLI Fields -- store in "Inputs" structure
+	Inputs in = read_CLI( argc, argv );
+
+	// Print-out of Input Summary
+	if( mype == 0 )
+		print_inputs( in, nprocs, version );
+
+	// =====================================================================
+	// Prepare Nuclide Energy Grids, Unionized Energy Grid, & Material Data
+	// This is not reflective of a real Monte Carlo simulation workload,
+	// therefore, do not profile this region!
+	// =====================================================================
+
+	SimulationData SD;
+
+	// If read from file mode is selected, skip initialization and load
+	// all simulation data structures from file instead
+	if( in.binary_mode == READ )
+		SD = binary_read(in);
+	else
+		SD = grid_init_do_not_profile( in, mype );
+
+	// If writing from file mode is selected, write all simulation data
+	// structures to file
+	if( in.binary_mode == WRITE && mype == 0 )
+		binary_write(in, SD);
+
+	// Move data to GPU
+	SimulationData GSD = move_simulation_data_to_device( in, mype, SD );
+
+  	cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1*1024*1024*1024);
+
+	// =====================================================================
+	// Cross Section (XS) Parallel Lookup Simulation
+	// This is the section that should be profiled, as it reflects a 
+	// realistic continuous energy Monte Carlo macroscopic cross section
+	// lookup kernel.
+	// =====================================================================
+	if( mype == 0 )
+	{
+		printf("\n");
+		border_print();
+		center_print("SIMULATION", 79);
+		border_print();
+	}
+
+	// Start Simulation Timer
+	omp_start = get_time();
+
+	// Run simulation
+	if( in.simulation_method == EVENT_BASED )
+	{
+		if( in.kernel_id == 0 )
+			verification = run_event_based_simulation_baseline(in, GSD, mype);
+		else if( in.kernel_id == 1 )
+			verification = run_event_based_simulation_optimization_1(in, GSD, mype);
+		else if( in.kernel_id == 2 )
+			verification = run_event_based_simulation_optimization_2(in, GSD, mype);
+		else if( in.kernel_id == 3 )
+			verification = run_event_based_simulation_optimization_3(in, GSD, mype);
+		else if( in.kernel_id == 4 )
+			verification = run_event_based_simulation_optimization_4(in, GSD, mype);
+		else if( in.kernel_id == 5 )
+			verification = run_event_based_simulation_optimization_5(in, GSD, mype);
+		else if( in.kernel_id == 6 )
+			verification = run_event_based_simulation_optimization_6(in, GSD, mype);
+		else
+		{
+			printf("Error: No kernel ID %d found!\n", in.kernel_id);
+			exit(1);
+		}
+	}
+	else
+	{
+		printf("History-based simulation not implemented in CUDA code. Instead,\nuse the event-based method with \"-m event\" argument.\n");
+		exit(1);
+	}
+
+	if( mype == 0)	
+	{	
+		printf("\n" );
+		printf("Simulation complete.\n" );
+	}
+
+	// End Simulation Timer
+	omp_end = get_time();
+
+	// Final Hash Step
+	verification = verification % 999983;
+
+	// Print / Save Results and Exit
+	int is_invalid_result = print_results( in, mype, omp_end-omp_start, nprocs, verification );
+
+	return is_invalid_result;
+}