diff --git a/benchmark_cgdrag/benchmarker_cgdrag_forpy.f90 b/benchmark_cgdrag/benchmarker_cgdrag_forpy.f90
index fd15206..0f030fe 100644
--- a/benchmark_cgdrag/benchmarker_cgdrag_forpy.f90
+++ b/benchmark_cgdrag/benchmarker_cgdrag_forpy.f90
@@ -24,8 +24,8 @@ subroutine main()
integer :: i, j, n
real(dp) :: start_time, end_time, start_loop_time, end_loop_time
- real(dp), dimension(:), allocatable :: module_load_durations, module_delete_durations, loop_durations, inference_durations
- real(dp), dimension(:), allocatable :: allocation_durations, deallocation_durations, tensor_creation_durations, tensor_deletion_durations
+ real(dp), dimension(:), allocatable :: loop_durations, inference_durations, allocation_durations
+ real(dp), dimension(:), allocatable :: deallocation_durations, tensor_creation_durations, tensor_deletion_durations
real(dp), dimension(:,:), allocatable :: all_durations
character(len=20), dimension(:), allocatable :: messages
@@ -47,6 +47,8 @@ subroutine main()
character(len=:), allocatable :: model_dir, model_name
character(len=128) :: msg1, msg2, msg3, msg4, msg5, msg6
integer :: ntimes
+ character(len=10) :: input_device
+ logical :: use_cuda = .false.
type(ndarray) :: uuu_nd, vvv_nd, gwfcng_x_nd, gwfcng_y_nd, lat_nd, psfc_nd
@@ -56,16 +58,22 @@ subroutine main()
print *, "====== FORPY ======"
- call setup(model_dir, model_name, ntimes, n, alloc_in_loop)
+ call setup(model_dir, model_name, ntimes, n, alloc_in_loop, use_cuda=use_cuda)
if (ntimes .lt. 2) then
write(*,*) "Error: ntimes must be at least 2"
return
end if
+ if (use_cuda) then
+ input_device = "cuda"
+ else
+ input_device = "cpu"
+ end if
+
! Allocate arrays shared with FTorch implementation and read in data
call init_common_arrays(ntimes, I_MAX, J_MAX, K_MAX, uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, &
- module_load_durations, module_delete_durations, loop_durations, allocation_durations, deallocation_durations, &
- tensor_creation_durations, tensor_deletion_durations, inference_durations, all_durations, messages, &
+ loop_durations, allocation_durations, deallocation_durations, tensor_creation_durations, &
+ tensor_deletion_durations, inference_durations, all_durations, messages, &
start_loop_time, end_loop_time, start_time, end_time)
! Reshape arrays, if not done for every loop
@@ -80,7 +88,7 @@ subroutine main()
#else
print *, "generate model in python runtime"
#endif
- call load_module(model_dir, model_name, run_emulator, model)
+ call load_module(model_dir, model_name, run_emulator, model, use_cuda)
do i = 1, ntimes
@@ -108,13 +116,14 @@ subroutine main()
ie = ndarray_create_nocopy(gwfcng_y_nd, gwfcng_y_flattened)
! create model input args as tuple
- ie = tuple_create(args,6)
+ ie = tuple_create(args, 7)
ie = args%setitem(0, model)
ie = args%setitem(1, uuu_nd)
ie = args%setitem(2, lat_nd)
ie = args%setitem(3, psfc_nd)
ie = args%setitem(4, gwfcng_x_nd)
ie = args%setitem(5, J_MAX)
+ ie = args%setitem(6, trim(input_device))
end_time = omp_get_wtime()
tensor_creation_durations(i) = end_time - start_time
! ------------------------------ End tensor creation timer ------------------------------
@@ -210,25 +219,23 @@ subroutine main()
end do
- call time_module(ntimes, model_dir, model_name, module_load_durations, module_delete_durations, run_emulator, model)
+ call forpy_finalize
! Call individual print for loop, to avoid adding to combined mean
call print_time_stats(loop_durations, "full loop")
- all_durations(:, 1) = module_load_durations
- all_durations(:, 2) = module_delete_durations
- all_durations(:, 3) = allocation_durations
- all_durations(:, 4) = deallocation_durations
- all_durations(:, 5) = tensor_creation_durations
- all_durations(:, 6) = tensor_deletion_durations
- all_durations(:, 7) = inference_durations
- messages = [character(len=20) :: "module creation", "module deletion", "array allocation", "array deallocation", &
+ all_durations(:, 1) = allocation_durations
+ all_durations(:, 2) = deallocation_durations
+ all_durations(:, 3) = tensor_creation_durations
+ all_durations(:, 4) = tensor_deletion_durations
+ all_durations(:, 5) = inference_durations
+ messages = [character(len=20) :: "array allocation", "array deallocation", &
"tensor creation", "tensor deletion", "forward pass"]
call print_all_time_stats(all_durations, messages)
- call deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, module_load_durations, &
- module_delete_durations, loop_durations, allocation_durations, deallocation_durations, &
- tensor_creation_durations, tensor_deletion_durations, inference_durations, all_durations, messages)
+ call deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, &
+ loop_durations, allocation_durations, deallocation_durations, tensor_creation_durations, &
+ tensor_deletion_durations, inference_durations, all_durations, messages)
if (.not. alloc_in_loop) then
call deallocate_reshaped_arrays(uuu_flattened, vvv_flattened, lat_reshaped, psfc_reshaped, gwfcng_x_flattened, gwfcng_y_flattened)
@@ -236,45 +243,12 @@ subroutine main()
end subroutine main
- subroutine time_module(ntimes, model_dir, model_name, module_load_durations, module_delete_durations, run_emulator, model)
-
- implicit none
-
- integer, intent(in) :: ntimes
- character(len=*), intent(in) :: model_dir, model_name
- real(dp), dimension(:), intent(inout) :: module_load_durations, module_delete_durations
- type(module_py), intent(out) :: run_emulator
- type(object), intent(out) :: model
-
- integer :: i
- real(dp) :: start_time, end_time
-
- do i = 1, ntimes
- ! ------------------------------ Start module load timer ------------------------------
- start_time = omp_get_wtime()
- call load_module(model_dir, model_name, run_emulator, model)
- end_time = omp_get_wtime()
- module_load_durations(i) = end_time - start_time
- ! ------------------------------ End module load timer ------------------------------
-
- ! ------------------------------ Start module deletion timer ------------------------------
- ! We can only call forpy_finalize once
- if (i == ntimes) then
- start_time = omp_get_wtime()
- call forpy_finalize
- end_time = omp_get_wtime()
- module_delete_durations(:) = (end_time - start_time) / (ntimes + 1)
- end if
- ! ------------------------------ End module deletion timer ------------------------------
- end do
-
- end subroutine time_module
-
- subroutine load_module(model_dir, model_name, run_emulator, model)
+ subroutine load_module(model_dir, model_name, run_emulator, model, use_cuda)
implicit none
character(len=*), intent(in) :: model_dir, model_name
+ logical, intent(in) :: use_cuda
type(module_py), intent(out) :: run_emulator
type(object), intent(out) :: model
@@ -301,7 +275,11 @@ subroutine load_module(model_dir, model_name, run_emulator, model)
#ifdef USETS
! load torchscript saved model
ie = tuple_create(args,1)
- ie = str_create(filename, trim(model_dir//"/"//"saved_cgdrag_model_cpu.pt"))
+ if (use_cuda) then
+ ie = str_create(filename, trim(model_dir//"/"//"saved_cgdrag_model_gpu.pt"))
+ else
+ ie = str_create(filename, trim(model_dir//"/"//"saved_cgdrag_model_cpu.pt"))
+ end if
ie = args%setitem(0, filename)
ie = call_py(model, run_emulator, "initialize_ts", args)
call args%destroy
@@ -318,8 +296,8 @@ subroutine load_module(model_dir, model_name, run_emulator, model)
end subroutine load_module
subroutine init_common_arrays(ntimes, I_MAX, J_MAX, K_MAX, uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, &
- module_load_durations, module_delete_durations, loop_durations, allocation_durations, &
- deallocation_durations, tensor_creation_durations, tensor_deletion_durations, inference_durations, &
+ loop_durations, allocation_durations, deallocation_durations, &
+ tensor_creation_durations, tensor_deletion_durations, inference_durations, &
all_durations, messages, start_loop_time, end_loop_time, start_time, end_time)
implicit none
@@ -330,8 +308,8 @@ subroutine init_common_arrays(ntimes, I_MAX, J_MAX, K_MAX, uuu, vvv, gwfcng_x, g
real(wp), intent(out), dimension(:,:,:), allocatable :: gwfcng_x_ref, gwfcng_y_ref
real(wp), intent(out), dimension(:,:), allocatable :: lat, psfc
- real(dp), intent(out), dimension(:), allocatable :: module_load_durations, module_delete_durations, loop_durations, inference_durations
- real(dp), intent(out), dimension(:), allocatable :: allocation_durations, deallocation_durations, tensor_creation_durations, tensor_deletion_durations
+ real(dp), intent(out), dimension(:), allocatable :: loop_durations, inference_durations, allocation_durations
+ real(dp), intent(out), dimension(:), allocatable :: deallocation_durations, tensor_creation_durations, tensor_deletion_durations
real(dp), intent(out), dimension(:,:), allocatable :: all_durations
character(len=20), intent(out), dimension(:), allocatable :: messages
@@ -385,20 +363,16 @@ subroutine init_common_arrays(ntimes, I_MAX, J_MAX, K_MAX, uuu, vvv, gwfcng_x, g
close(15)
! Allocate arrays for timings
- allocate(module_load_durations(ntimes))
- allocate(module_delete_durations(ntimes))
allocate(loop_durations(ntimes))
allocate(allocation_durations(ntimes))
allocate(deallocation_durations(ntimes))
allocate(tensor_creation_durations(ntimes))
allocate(tensor_deletion_durations(ntimes))
allocate(inference_durations(ntimes))
- allocate(all_durations(ntimes, 7))
- allocate(messages(7))
+ allocate(all_durations(ntimes, 5))
+ allocate(messages(5))
! Initialise timings with arbitrary large values
- module_load_durations(:) = 100.
- module_delete_durations(:) = 100.
loop_durations(:) = 100.
allocation_durations(:) = 100.
deallocation_durations(:) = 100.
@@ -445,14 +419,14 @@ subroutine init_reshaped_arrays(I_MAX, J_MAX, K_MAX, uuu, vvv, lat, psfc, uuu_fl
end subroutine init_reshaped_arrays
- subroutine deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, module_load_durations, &
- module_delete_durations, loop_durations, allocation_durations, deallocation_durations, &
- tensor_creation_durations, tensor_deletion_durations, inference_durations, all_durations, messages)
+ subroutine deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, &
+ loop_durations, allocation_durations, deallocation_durations, tensor_creation_durations, &
+ tensor_deletion_durations, inference_durations, all_durations, messages)
implicit none
- real(dp), intent(inout), dimension(:), allocatable :: module_load_durations, module_delete_durations, loop_durations, inference_durations
- real(dp), intent(inout), dimension(:), allocatable :: allocation_durations, deallocation_durations, tensor_creation_durations, tensor_deletion_durations
+ real(dp), intent(inout), dimension(:), allocatable :: loop_durations, inference_durations, allocation_durations
+ real(dp), intent(inout), dimension(:), allocatable :: deallocation_durations, tensor_creation_durations, tensor_deletion_durations
real(dp), intent(inout), dimension(:,:), allocatable :: all_durations
character(len=20), intent(inout), dimension(:), allocatable :: messages
@@ -460,8 +434,6 @@ subroutine deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref,
real(wp), intent(inout), dimension(:,:,:), allocatable :: gwfcng_x_ref, gwfcng_y_ref
real(wp), intent(inout), dimension(:,:), allocatable :: lat, psfc
- deallocate(module_load_durations)
- deallocate(module_delete_durations)
deallocate(loop_durations)
deallocate(allocation_durations)
deallocate(deallocation_durations)
diff --git a/benchmark_cgdrag/benchmarker_cgdrag_torch.f90 b/benchmark_cgdrag/benchmarker_cgdrag_torch.f90
index c985ebc..9fbd2b2 100644
--- a/benchmark_cgdrag/benchmarker_cgdrag_torch.f90
+++ b/benchmark_cgdrag/benchmarker_cgdrag_torch.f90
@@ -1,6 +1,6 @@
program benchmark_cgdrag_test
- use, intrinsic :: iso_c_binding
+ use, intrinsic :: iso_c_binding, only : c_loc, c_int, c_int64_t
use :: omp_lib, only : omp_get_wtime
use :: utils, only : assert, setup, print_time_stats, print_all_time_stats
use :: ftorch
@@ -22,8 +22,8 @@ subroutine main()
integer :: i, j, n, ii
real(dp) :: start_time, end_time, start_loop_time, end_loop_time
- real(dp), dimension(:), allocatable :: module_load_durations, module_delete_durations, loop_durations, inference_durations
- real(dp), dimension(:), allocatable :: allocation_durations, deallocation_durations, tensor_creation_durations, tensor_deletion_durations
+ real(dp), dimension(:), allocatable :: loop_durations, inference_durations, allocation_durations
+ real(dp), dimension(:), allocatable :: deallocation_durations, tensor_creation_durations, tensor_deletion_durations
real(dp), dimension(:,:), allocatable :: all_durations
character(len=20), dimension(:), allocatable :: messages
@@ -37,17 +37,17 @@ subroutine main()
real(wp), dimension(:,:), allocatable, target :: lat_reshaped, psfc_reshaped
real(wp), dimension(:,:), allocatable, target :: gwfcng_x_flattened, gwfcng_y_flattened
- integer(c_int), parameter :: n_inputs = 3
+ integer, parameter :: n_inputs = 3
- integer(c_int), parameter :: dims_1D = 2
- integer(c_int), parameter :: dims_2D = 2
- integer(c_int), parameter :: dims_out = 2
- integer(c_int64_t) :: shape_2D(dims_2D) = [I_MAX * J_MAX, K_MAX]
- integer(c_int) :: stride_2D(dims_2D) = [1, 2]
- integer(c_int64_t) :: shape_1D(dims_1D) = [I_MAX * J_MAX, 1]
- integer(c_int) :: stride_1D(dims_1D) = [1, 2]
- integer(c_int64_t) :: shape_out(dims_out) = [I_MAX * J_MAX, K_MAX]
- integer(c_int) :: stride_out(dims_out) = [1, 2]
+ integer, parameter :: dims_1D = 2
+ integer, parameter :: dims_2D = 2
+ integer, parameter :: dims_out = 2
+ integer :: shape_2D(dims_2D) = [I_MAX * J_MAX, K_MAX]
+ integer :: stride_2D(dims_2D) = [1, 2]
+ integer :: shape_1D(dims_1D) = [I_MAX * J_MAX, 1]
+ integer :: stride_1D(dims_1D) = [1, 2]
+ integer :: shape_out(dims_out) = [I_MAX * J_MAX, K_MAX]
+ integer :: stride_out(dims_out) = [1, 2]
character(len=:), allocatable :: model_dir, model_name
character(len=128) :: msg1, msg2, msg3, msg4, msg5, msg6
@@ -83,8 +83,8 @@ subroutine main()
! Allocate arrays shared with FTorch implementation and read in data
call init_common_arrays(ntimes, I_MAX, J_MAX, K_MAX, uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, &
- module_load_durations, module_delete_durations, loop_durations, allocation_durations, deallocation_durations, &
- tensor_creation_durations, tensor_deletion_durations, inference_durations, all_durations, messages, &
+ loop_durations, allocation_durations, deallocation_durations, tensor_creation_durations, &
+ tensor_deletion_durations, inference_durations, all_durations, messages, &
start_loop_time, end_loop_time, start_time, end_time)
! Allocate arrays and flatten inputs and outputs if --explicit_reshape is set, but --alloc_in_loop is not
@@ -94,7 +94,7 @@ subroutine main()
lat_reshaped, psfc_reshaped, gwfcng_x_flattened, gwfcng_y_flattened)
end if
- ! Load model (creation/deletion timed at end)
+ ! Load model
model = torch_module_load(model_dir//"/"//model_name)
do i = 1, ntimes
@@ -117,20 +117,20 @@ subroutine main()
! ------------------------------ Start tensor creation timer ------------------------------
start_time = omp_get_wtime()
if (explicit_reshape) then
- in_tensors(3) = torch_tensor_from_blob(c_loc(lat_reshaped), dims_1D, shape_1D, torch_wp, input_device, stride_1D)
- in_tensors(2) = torch_tensor_from_blob(c_loc(psfc_reshaped), dims_1D, shape_1D, torch_wp, input_device, stride_1D)
+ in_tensors(3) = torch_tensor_from_array(lat_reshaped, stride_1D, input_device)
+ in_tensors(2) = torch_tensor_from_array(psfc_reshaped, stride_1D, input_device)
else
- in_tensors(3) = torch_tensor_from_blob(c_loc(lat), dims_1D, shape_1D, torch_wp, input_device, stride_1D)
- in_tensors(2) = torch_tensor_from_blob(c_loc(psfc), dims_1D, shape_1D, torch_wp, input_device, stride_1D)
+ in_tensors(3) = torch_tensor_from_blob(c_loc(lat), int(dims_1D, c_int), int(shape_1D, c_int64_t), int(stride_1D, c_int), torch_wp, input_device)
+ in_tensors(2) = torch_tensor_from_blob(c_loc(psfc), int(dims_1D, c_int), int(shape_1D, c_int64_t), int(stride_1D, c_int), torch_wp, input_device)
end if
! Zonal
if (explicit_reshape) then
- in_tensors(1) = torch_tensor_from_blob(c_loc(uuu_flattened), dims_2D, shape_2D, torch_wp, input_device, stride_2D)
- gwfcng_x_tensor = torch_tensor_from_blob(c_loc(gwfcng_x_flattened), dims_out, shape_out, torch_wp, torch_kCPU, stride_out)
+ in_tensors(1) = torch_tensor_from_array(uuu_flattened, stride_2D, input_device)
+ gwfcng_x_tensor = torch_tensor_from_array(gwfcng_x_flattened, stride_out, torch_kCPU)
else
- in_tensors(1) = torch_tensor_from_blob(c_loc(uuu), dims_2D, shape_2D, torch_wp, input_device, stride_2D)
- gwfcng_x_tensor = torch_tensor_from_blob(c_loc(gwfcng_x), dims_out, shape_out, torch_wp, torch_kCPU, stride_out)
+ in_tensors(1) = torch_tensor_from_blob(c_loc(uuu), int(dims_2D, c_int), int(shape_2D, c_int64_t), int(stride_2D, c_int), torch_wp, input_device)
+ gwfcng_x_tensor = torch_tensor_from_blob(c_loc(gwfcng_x), int(dims_out, c_int), int(shape_out, c_int64_t), int(stride_out, c_int), torch_wp, torch_kCPU)
end if
end_time = omp_get_wtime()
tensor_creation_durations(i) = end_time - start_time
@@ -144,15 +144,18 @@ subroutine main()
inference_durations(i) = end_time - start_time
! ------------------------------ End inference timer ------------------------------
+ ! Delete the zonal input tensor now, before in_tensors(1) is reassigned to the meridional input below.
+ call torch_tensor_delete(in_tensors(1))
+
! Meridional
! ------------------------------ Start tensor creation timer ------------------------------
start_time = omp_get_wtime()
if (explicit_reshape) then
- in_tensors(1) = torch_tensor_from_blob(c_loc(vvv_flattened), dims_2D, shape_2D, torch_wp, input_device, stride_2D)
- gwfcng_y_tensor = torch_tensor_from_blob(c_loc(gwfcng_y_flattened), dims_out, shape_out, torch_wp, torch_kCPU, stride_out)
+ in_tensors(1) = torch_tensor_from_array(vvv_flattened, stride_2D, input_device)
+ gwfcng_y_tensor = torch_tensor_from_array(gwfcng_y_flattened, stride_out, torch_kCPU)
else
- in_tensors(1) = torch_tensor_from_blob(c_loc(vvv), dims_2D, shape_2D, torch_wp, input_device, stride_2D)
- gwfcng_y_tensor = torch_tensor_from_blob(c_loc(gwfcng_y), dims_out, shape_out, torch_wp, torch_kCPU, stride_out)
+ in_tensors(1) = torch_tensor_from_blob(c_loc(vvv), int(dims_2D, c_int), int(shape_2D, c_int64_t), int(stride_2D, c_int), torch_wp, input_device)
+ gwfcng_y_tensor = torch_tensor_from_blob(c_loc(gwfcng_y), int(dims_out, c_int), int(shape_out, c_int64_t), int(stride_out, c_int), torch_wp, torch_kCPU)
end if
end_time = omp_get_wtime()
tensor_creation_durations(i) = tensor_creation_durations(i) + (end_time - start_time)
@@ -225,25 +228,24 @@ subroutine main()
end do
- call time_module(ntimes, model_dir, model_name, module_load_durations, module_delete_durations)
+ ! Delete model
+ call torch_module_delete(model)
! Call individual print for loop, to avoid adding to combined mean
call print_time_stats(loop_durations, "full loop")
- all_durations(:, 1) = module_load_durations
- all_durations(:, 2) = module_delete_durations
- all_durations(:, 3) = allocation_durations
- all_durations(:, 4) = deallocation_durations
- all_durations(:, 5) = tensor_creation_durations
- all_durations(:, 6) = tensor_deletion_durations
- all_durations(:, 7) = inference_durations
- messages = [character(len=20) :: "module creation", "module deletion", "array allocation", "array deallocation", &
+ all_durations(:, 1) = allocation_durations
+ all_durations(:, 2) = deallocation_durations
+ all_durations(:, 3) = tensor_creation_durations
+ all_durations(:, 4) = tensor_deletion_durations
+ all_durations(:, 5) = inference_durations
+ messages = [character(len=20) :: "array allocation", "array deallocation", &
"tensor creation", "tensor deletion", "forward pass"]
call print_all_time_stats(all_durations, messages)
- call deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, module_load_durations, &
- module_delete_durations, loop_durations, allocation_durations, deallocation_durations, &
- tensor_creation_durations, tensor_deletion_durations, inference_durations, all_durations, messages)
+ call deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, loop_durations, &
+ allocation_durations, deallocation_durations, tensor_creation_durations, tensor_deletion_durations, &
+ inference_durations, all_durations, messages)
! Deallocate arrays for flattened inputs and outputs if --explicit_reshape is set, but --alloc_in_loop is not
! if --explicit_reshape and --alloc_in_loop are both set, this is done within each loop instead
@@ -253,39 +255,10 @@ subroutine main()
end subroutine main
- subroutine time_module(ntimes, model_dir, model_name, module_load_durations, module_delete_durations)
-
- implicit none
-
- integer, intent(in) :: ntimes
- real(dp), dimension(:), intent(inout) :: module_load_durations, module_delete_durations
- integer :: i
- real(dp) :: start_time, end_time
- character(len=*), intent(in) :: model_dir, model_name
- type(torch_module) :: model
-
- do i = 1, ntimes
- ! ------------------------------ Start module load timer ------------------------------
- start_time = omp_get_wtime()
- model = torch_module_load(model_dir//"/"//model_name)
- end_time = omp_get_wtime()
- module_load_durations(i) = end_time - start_time
- ! ------------------------------ End module load timer ------------------------------
-
- ! ------------------------------ Start module deletion timer ------------------------------
- start_time = omp_get_wtime()
- call torch_module_delete(model)
- end_time = omp_get_wtime()
- module_delete_durations(i) = end_time - start_time
- ! ------------------------------ End module deletion timer ------------------------------
- end do
-
- end subroutine time_module
-
subroutine init_common_arrays(ntimes, I_MAX, J_MAX, K_MAX, uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, &
- module_load_durations, module_delete_durations, loop_durations, allocation_durations, &
- deallocation_durations, tensor_creation_durations, tensor_deletion_durations, inference_durations, &
- all_durations, messages, start_loop_time, end_loop_time, start_time, end_time)
+ loop_durations, allocation_durations, deallocation_durations, tensor_creation_durations, &
+ tensor_deletion_durations, inference_durations, all_durations, messages, &
+ start_loop_time, end_loop_time, start_time, end_time)
implicit none
@@ -295,8 +268,8 @@ subroutine init_common_arrays(ntimes, I_MAX, J_MAX, K_MAX, uuu, vvv, gwfcng_x, g
real(wp), intent(out), dimension(:,:,:), allocatable :: gwfcng_x_ref, gwfcng_y_ref
real(wp), intent(out), dimension(:,:), allocatable :: lat, psfc
- real(dp), intent(out), dimension(:), allocatable :: module_load_durations, module_delete_durations, loop_durations, inference_durations
- real(dp), intent(out), dimension(:), allocatable :: allocation_durations, deallocation_durations, tensor_creation_durations, tensor_deletion_durations
+ real(dp), intent(out), dimension(:), allocatable :: loop_durations, inference_durations, allocation_durations
+ real(dp), intent(out), dimension(:), allocatable :: deallocation_durations, tensor_creation_durations, tensor_deletion_durations
real(dp), intent(out), dimension(:,:), allocatable :: all_durations
character(len=20), intent(out), dimension(:), allocatable :: messages
@@ -350,20 +323,16 @@ subroutine init_common_arrays(ntimes, I_MAX, J_MAX, K_MAX, uuu, vvv, gwfcng_x, g
close(15)
! Allocate arrays for timings
- allocate(module_load_durations(ntimes))
- allocate(module_delete_durations(ntimes))
allocate(loop_durations(ntimes))
allocate(allocation_durations(ntimes))
allocate(deallocation_durations(ntimes))
allocate(tensor_creation_durations(ntimes))
allocate(tensor_deletion_durations(ntimes))
allocate(inference_durations(ntimes))
- allocate(all_durations(ntimes, 7))
- allocate(messages(7))
+ allocate(all_durations(ntimes, 5))
+ allocate(messages(5))
! Initialise timings with arbitrary large values
- module_load_durations(:) = 100.
- module_delete_durations(:) = 100.
loop_durations(:) = 100.
allocation_durations(:) = 100.
deallocation_durations(:) = 100.
@@ -410,14 +379,14 @@ subroutine init_reshaped_arrays(I_MAX, J_MAX, K_MAX, uuu, vvv, lat, psfc, uuu_fl
end subroutine init_reshaped_arrays
- subroutine deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, module_load_durations, &
- module_delete_durations, loop_durations, allocation_durations, deallocation_durations, &
- tensor_creation_durations, tensor_deletion_durations, inference_durations, all_durations, messages)
+ subroutine deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, loop_durations, &
+ allocation_durations, deallocation_durations, tensor_creation_durations, &
+ tensor_deletion_durations, inference_durations, all_durations, messages)
implicit none
- real(dp), intent(inout), dimension(:), allocatable :: module_load_durations, module_delete_durations, loop_durations, inference_durations
- real(dp), intent(inout), dimension(:), allocatable :: allocation_durations, deallocation_durations, tensor_creation_durations, tensor_deletion_durations
+ real(dp), intent(inout), dimension(:), allocatable :: loop_durations, inference_durations, allocation_durations
+ real(dp), intent(inout), dimension(:), allocatable :: deallocation_durations, tensor_creation_durations, tensor_deletion_durations
real(dp), intent(inout), dimension(:,:), allocatable :: all_durations
character(len=20), intent(inout), dimension(:), allocatable :: messages
@@ -425,8 +394,6 @@ subroutine deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref,
real(wp), intent(inout), dimension(:,:,:), allocatable :: gwfcng_x_ref, gwfcng_y_ref
real(wp), intent(inout), dimension(:,:), allocatable :: lat, psfc
- deallocate(module_load_durations)
- deallocate(module_delete_durations)
deallocate(loop_durations)
deallocate(allocation_durations)
deallocate(deallocation_durations)
diff --git a/benchmark_large_stride/benchmarker_large_stride_torch.f90 b/benchmark_large_stride/benchmarker_large_stride_torch.f90
index 17ffb9e..fa9001c 100644
--- a/benchmark_large_stride/benchmarker_large_stride_torch.f90
+++ b/benchmark_large_stride/benchmarker_large_stride_torch.f90
@@ -95,8 +95,8 @@ subroutine main()
! Create input and output tensors for the model.
! ------------------------------ Start tensor creation timer ------------------------------
start_time = omp_get_wtime()
- input_array(1) = torch_tensor_from_blob(c_loc(big_array), 2, shape_2d, torch_wp, input_device, stride_2d)
- result_tensor = torch_tensor_from_blob(c_loc(big_result), 2, shape_2d, torch_wp, torch_kCPU, stride_2d)
+ input_array(1) = torch_tensor_from_blob(c_loc(big_array), 2, shape_2d, stride_2d, torch_wp, input_device)
+ result_tensor = torch_tensor_from_blob(c_loc(big_result), 2, shape_2d, stride_2d, torch_wp, torch_kCPU)
end_time = omp_get_wtime()
tensor_creation_durations(i) = end_time - start_time
! ------------------------------ End tensor creation timer ------------------------------
diff --git a/benchmark_mima/cg_drag_torch_mod.f90 b/benchmark_mima/cg_drag_torch_mod.f90
index 4bd8748..79d01aa 100644
--- a/benchmark_mima/cg_drag_torch_mod.f90
+++ b/benchmark_mima/cg_drag_torch_mod.f90
@@ -169,18 +169,21 @@ subroutine cg_drag_ML(uuu, vvv, psfc, lat, gwfcng_x, gwfcng_y)
lat = lat*RADIAN
! Create input/output tensors from the above arrays
- model_input_arr(3) = torch_tensor_from_blob(c_loc(lat), dims_1D, shape_1D, torch_wp, torch_kCPU, stride_1D)
- model_input_arr(2) = torch_tensor_from_blob(c_loc(psfc), dims_1D, shape_1D, torch_wp, torch_kCPU, stride_1D)
+ model_input_arr(3) = torch_tensor_from_blob(c_loc(lat), dims_1D, shape_1D, stride_1D, torch_wp, torch_kCPU)
+ model_input_arr(2) = torch_tensor_from_blob(c_loc(psfc), dims_1D, shape_1D, stride_1D, torch_wp, torch_kCPU)
! Zonal
- model_input_arr(1) = torch_tensor_from_blob(c_loc(uuu), dims_2D, shape_2D, torch_wp, torch_kCPU, stride_2D)
- gwfcng_x_tensor = torch_tensor_from_blob(c_loc(gwfcng_x), dims_out, shape_out, torch_wp, torch_kCPU, stride_out)
+ model_input_arr(1) = torch_tensor_from_blob(c_loc(uuu), dims_2D, shape_2D, stride_2D, torch_wp, torch_kCPU)
+ gwfcng_x_tensor = torch_tensor_from_blob(c_loc(gwfcng_x), dims_out, shape_out, stride_out, torch_wp, torch_kCPU)
! Run model and Infer
call torch_module_forward(model, model_input_arr, n_inputs, gwfcng_x_tensor)
+ ! Delete the zonal input tensor now, before model_input_arr(1) is reassigned to the meridional input below
+ call torch_tensor_delete(model_input_arr(1))
+
! Meridional
- model_input_arr(1) = torch_tensor_from_blob(c_loc(vvv), dims_2D, shape_2D, torch_wp, torch_kCPU, stride_2D)
- gwfcng_y_tensor = torch_tensor_from_blob(c_loc(gwfcng_y), dims_out, shape_out, torch_wp, torch_kCPU, stride_out)
+ model_input_arr(1) = torch_tensor_from_blob(c_loc(vvv), dims_2D, shape_2D, stride_2D, torch_wp, torch_kCPU)
+ gwfcng_y_tensor = torch_tensor_from_blob(c_loc(gwfcng_y), dims_out, shape_out, stride_out, torch_wp, torch_kCPU)
! Run model and Infer
call torch_module_forward(model, model_input_arr, n_inputs, gwfcng_y_tensor)
diff --git a/benchmark_resnet/benchmarker_resnet_forpy.f90 b/benchmark_resnet/benchmarker_resnet_forpy.f90
index 2212fcb..541424f 100644
--- a/benchmark_resnet/benchmarker_resnet_forpy.f90
+++ b/benchmark_resnet/benchmarker_resnet_forpy.f90
@@ -24,8 +24,8 @@ subroutine main()
real(wp), dimension(:,:), allocatable, asynchronous :: out_data
real(dp) :: start_time, end_time, start_loop_time, end_loop_time
- real(dp), dimension(:), allocatable :: module_load_durations, module_delete_durations, loop_durations
- real(dp), dimension(:), allocatable :: inference_durations, tensor_creation_durations, tensor_deletion_durations
+ real(dp), dimension(:), allocatable :: loop_durations, inference_durations
+ real(dp), dimension(:), allocatable :: tensor_creation_durations, tensor_deletion_durations
real(dp), dimension(:,:), allocatable :: all_durations
character(len=20), dimension(:), allocatable :: messages
@@ -37,6 +37,8 @@ subroutine main()
character(len=:), allocatable :: model_dir, model_name
character(len=128) :: msg1, msg2, msg3, msg4
integer :: ntimes
+ character(len=10) :: input_device
+ logical :: use_cuda = .false.
type(ndarray) :: out_data_nd, in_data_nd
@@ -54,23 +56,25 @@ subroutine main()
print *, "====== FORPY ======"
- call setup(model_dir, model_name, ntimes, n)
+ call setup(model_dir, model_name, ntimes, n, use_cuda=use_cuda)
+
+ if (use_cuda) then
+ input_device = "cuda"
+ else
+ input_device = "cpu"
+ end if
allocate(in_data(1, 3, 224, 224))
allocate(out_data(1, 1000))
allocate(probabilities(1, 1000))
- allocate(module_load_durations(ntimes))
- allocate(module_delete_durations(ntimes))
allocate(loop_durations(ntimes))
allocate(tensor_creation_durations(ntimes))
allocate(tensor_deletion_durations(ntimes))
allocate(inference_durations(ntimes))
- allocate(all_durations(ntimes, 5))
- allocate(messages(5))
+ allocate(all_durations(ntimes, 3))
+ allocate(messages(3))
! Initialise timings with arbitrary large values
- module_load_durations(:) = 100.
- module_delete_durations(:) = 100.
loop_durations(:) = 100.
tensor_creation_durations(:) = 100.
tensor_deletion_durations(ntimes) = 100.
@@ -92,7 +96,7 @@ subroutine main()
#else
print *, "generate model in python runtime"
#endif
- call load_module(model_dir, model_name, run_emulator, model)
+ call load_module(model_dir, model_name, run_emulator, model, use_cuda)
call load_data(data_file, tensor_length, in_data)
@@ -108,10 +112,11 @@ subroutine main()
ie = ndarray_create_nocopy(out_data_nd, out_data)
! create model input args as tuple
- ie = tuple_create(args,3)
+ ie = tuple_create(args, 4)
ie = args%setitem(0, model)
ie = args%setitem(1, in_data_nd)
- ie = args%setitem(2, out_data_nd)
+ ie = args%setitem(2, trim(input_device))
+ ie = args%setitem(3, out_data_nd)
end_time = omp_get_wtime()
tensor_creation_durations(i) = end_time - start_time
! ------------------------------ End tensor creation timer ------------------------------
@@ -148,7 +153,7 @@ subroutine main()
probability = maxval(probabilities)
! Check top probability matches expected value
- call assert(probability, expected_prob, test_name="Check probability", rtol_opt=1.0e-5_wp)
+ call assert(probability, expected_prob, test_name="Check probability", rtol_opt=1.0e-2_wp)
write(msg1, '(A, I10, A, F10.6, A)') "check iteration create tensors", i, " (", tensor_creation_durations(i), " s)"
write(msg2, '(A, I15, A, F10.6, A)') "check iteration inference", i, " (", inference_durations(i), " s)"
@@ -161,23 +166,19 @@ subroutine main()
end do
- call time_module(ntimes, model_dir, model_name, module_load_durations, module_delete_durations, run_emulator, model)
+ call forpy_finalize
! Call individual print for loop, to avoid adding to combined mean
call print_time_stats(loop_durations, "full loop")
- all_durations(:, 1) = module_load_durations
- all_durations(:, 2) = module_delete_durations
- all_durations(:, 3) = tensor_creation_durations
- all_durations(:, 4) = tensor_deletion_durations
- all_durations(:, 5) = inference_durations
- messages = [character(len=20) :: "module creation", "module deletion", "tensor creation", "tensor deletion", "forward pass"]
+ all_durations(:, 1) = tensor_creation_durations
+ all_durations(:, 2) = tensor_deletion_durations
+ all_durations(:, 3) = inference_durations
+ messages = [character(len=20) :: "tensor creation", "tensor deletion", "forward pass"]
call print_all_time_stats(all_durations, messages)
deallocate(in_data)
deallocate(out_data)
- deallocate(module_load_durations)
- deallocate(module_delete_durations)
deallocate(loop_durations)
deallocate(tensor_creation_durations)
deallocate(tensor_deletion_durations)
@@ -236,45 +237,12 @@ subroutine calc_probs(out_data, probabilities)
end subroutine calc_probs
- subroutine time_module(ntimes, model_dir, model_name, module_load_durations, module_delete_durations, run_emulator, model)
-
- implicit none
-
- integer, intent(in) :: ntimes
- character(len=*), intent(in) :: model_dir, model_name
- real(dp), dimension(:), intent(inout) :: module_load_durations, module_delete_durations
- type(module_py), intent(out) :: run_emulator
- type(object), intent(out) :: model
-
- integer :: i
- real(dp) :: start_time, end_time
-
- do i = 1, ntimes
- ! ------------------------------ Start module load timer ------------------------------
- start_time = omp_get_wtime()
- call load_module(model_dir, model_name, run_emulator, model)
- end_time = omp_get_wtime()
- module_load_durations(i) = end_time - start_time
- ! ------------------------------ End module load timer ------------------------------
-
- ! ------------------------------ Start module deletion timer ------------------------------
- ! We can only call forpy_finalize once
- if (i == ntimes) then
- start_time = omp_get_wtime()
- call forpy_finalize
- end_time = omp_get_wtime()
- module_delete_durations(:) = (end_time - start_time) / (ntimes + 1)
- end if
- ! ------------------------------ End module deletion timer ------------------------------
- end do
-
- end subroutine time_module
-
- subroutine load_module(model_dir, model_name, run_emulator, model)
+ subroutine load_module(model_dir, model_name, run_emulator, model, use_cuda)
implicit none
character(len=*), intent(in) :: model_dir, model_name
+ logical, intent(in) :: use_cuda
type(module_py), intent(out) :: run_emulator
type(object), intent(out) :: model
@@ -301,7 +269,11 @@ subroutine load_module(model_dir, model_name, run_emulator, model)
#ifdef USETS
! load torchscript saved model
ie = tuple_create(args,1)
- ie = str_create(filename, trim(model_dir//"/"//"saved_resnet18_model_cpu.pt"))
+ if (use_cuda) then
+ ie = str_create(filename, trim(model_dir//"/"//"saved_resnet18_model_gpu.pt"))
+ else
+ ie = str_create(filename, trim(model_dir//"/"//"saved_resnet18_model_cpu.pt"))
+ end if
ie = args%setitem(0, filename)
ie = call_py(model, run_emulator, "initialize_ts", args)
call args%destroy
diff --git a/benchmark_resnet/benchmarker_resnet_torch.f90 b/benchmark_resnet/benchmarker_resnet_torch.f90
index 836b95b..193b0e0 100644
--- a/benchmark_resnet/benchmarker_resnet_torch.f90
+++ b/benchmark_resnet/benchmarker_resnet_torch.f90
@@ -1,18 +1,15 @@
program benchmark_resnet_test
- use, intrinsic :: iso_c_binding, only: c_int64_t, c_loc
use :: omp_lib, only : omp_get_wtime
use :: utils, only : assert, setup, print_time_stats, print_all_time_stats
! Import our library for interfacing with PyTorch
use :: ftorch
! Define working precision for C primitives and Fortran reals
! Precision must match `wp` in resnet18.py and `wp_torch` in pt2ts.py
- use :: precision, only: c_wp, wp, dp
+ use :: precision, only: wp, dp
implicit none
- integer, parameter :: torch_wp = torch_kFloat32
-
call main()
contains
@@ -23,21 +20,21 @@ subroutine main()
integer :: i, ii, n
real(dp) :: start_time, end_time, start_loop_time, end_loop_time
- real(dp), dimension(:), allocatable :: module_load_durations, module_delete_durations, loop_durations
- real(dp), dimension(:), allocatable :: inference_durations, tensor_creation_durations, tensor_deletion_durations
+ real(dp), dimension(:), allocatable :: loop_durations, inference_durations
+ real(dp), dimension(:), allocatable :: tensor_creation_durations, tensor_deletion_durations
real(dp), dimension(:,:), allocatable :: all_durations
character(len=20), dimension(:), allocatable :: messages
- real(c_wp), dimension(:,:,:,:), allocatable, target :: in_data
- integer(c_int), parameter :: n_inputs = 1
- real(c_wp), dimension(:,:), allocatable, target :: out_data
+ real(wp), dimension(:,:,:,:), allocatable, target :: in_data
+ real(wp), dimension(:,:), allocatable, target :: out_data
+ integer, parameter :: n_inputs = 1
- integer(c_int), parameter :: in_dims = 4
- integer(c_int64_t) :: in_shape(in_dims) = [1, 3, 224, 224]
- integer(c_int) :: in_layout(in_dims) = [1,2,3,4]
- integer(c_int), parameter :: out_dims = 2
- integer(c_int64_t) :: out_shape(out_dims) = [1, 1000]
- integer(c_int) :: out_layout(out_dims) = [1,2]
+ integer, parameter :: in_dims = 4
+ integer :: in_shape(in_dims) = [1, 3, 224, 224]
+ integer :: in_layout(in_dims) = [1, 2, 3, 4]
+ integer, parameter :: out_dims = 2
+ integer :: out_shape(out_dims) = [1, 1000]
+ integer :: out_layout(out_dims) = [1, 2]
character(len=:), allocatable :: model_dir, model_name
character(len=128) :: msg1, msg2, msg3, msg4
@@ -74,18 +71,14 @@ subroutine main()
allocate(out_data(out_shape(1), out_shape(2)))
allocate(probabilities(out_shape(1), out_shape(2)))
- allocate(module_load_durations(ntimes))
- allocate(module_delete_durations(ntimes))
allocate(loop_durations(ntimes))
allocate(tensor_creation_durations(ntimes))
allocate(tensor_deletion_durations(ntimes))
allocate(inference_durations(ntimes))
- allocate(all_durations(ntimes, 5))
- allocate(messages(5))
+ allocate(all_durations(ntimes, 3))
+ allocate(messages(3))
! Initialise timings with arbitrary large values
- module_load_durations(:) = 100.
- module_delete_durations(:) = 100.
loop_durations(:) = 100.
tensor_creation_durations(:) = 100.
tensor_deletion_durations(ntimes) = 100.
@@ -101,7 +94,7 @@ subroutine main()
return
end if
- ! Load model (creation/deletion timed at end)
+ ! Load model
model = torch_module_load(model_dir//"/"//model_name)
! Initialise data - previously in loop, but not modified?
@@ -115,8 +108,8 @@ subroutine main()
! Create input and output tensors for the model.
! ------------------------------ Start tensor creation timer ------------------------------
start_time = omp_get_wtime()
- in_tensor(1) = torch_tensor_from_blob(c_loc(in_data), in_dims, in_shape, torch_wp, input_device, in_layout)
- out_tensor = torch_tensor_from_blob(c_loc(out_data), out_dims, out_shape, torch_wp, torch_kCPU, out_layout)
+ in_tensor(1) = torch_tensor_from_array(in_data, in_layout, input_device)
+ out_tensor = torch_tensor_from_array(out_data, out_layout, torch_kCPU)
end_time = omp_get_wtime()
tensor_creation_durations(i) = end_time - start_time
! ------------------------------ End tensor creation timer ------------------------------
@@ -162,26 +155,20 @@ subroutine main()
end do
- ! Delete model (creation/deletion timed at end)
+ ! Delete model
call torch_module_delete(model)
- call time_module(ntimes, model_dir, model_name, module_load_durations, module_delete_durations)
-
! Call individual print for loop, to avoid adding to combined mean
call print_time_stats(loop_durations, "full loop")
- all_durations(:, 1) = module_load_durations
- all_durations(:, 2) = module_delete_durations
- all_durations(:, 3) = tensor_creation_durations
- all_durations(:, 4) = tensor_deletion_durations
- all_durations(:, 5) = inference_durations
- messages = [character(len=20) :: "module creation", "module deletion", "tensor creation", "tensor deletion", "forward pass"]
+ all_durations(:, 1) = tensor_creation_durations
+ all_durations(:, 2) = tensor_deletion_durations
+ all_durations(:, 3) = inference_durations
+ messages = [character(len=20) :: "tensor creation", "tensor deletion", "forward pass"]
call print_all_time_stats(all_durations, messages)
deallocate(in_data)
deallocate(out_data)
- deallocate(module_load_durations)
- deallocate(module_delete_durations)
deallocate(loop_durations)
deallocate(tensor_creation_durations)
deallocate(tensor_deletion_durations)
@@ -198,9 +185,9 @@ subroutine load_data(filename, tensor_length, in_data)
character(len=*), intent(in) :: filename
integer, intent(in) :: tensor_length
- real(c_wp), dimension(:,:,:,:), intent(out) :: in_data
+ real(wp), dimension(:,:,:,:), intent(out) :: in_data
- real(c_wp) :: flat_data(tensor_length)
+ real(wp) :: flat_data(tensor_length)
integer :: ios
character(len=100) :: ioerrmsg
@@ -229,7 +216,7 @@ subroutine calc_probs(out_data, probabilities)
implicit none
- real(c_wp), dimension(:,:), intent(in) :: out_data
+ real(wp), dimension(:,:), intent(in) :: out_data
real(wp), dimension(:,:), intent(out) :: probabilities
real(wp) :: prob_sum
@@ -240,33 +227,4 @@ subroutine calc_probs(out_data, probabilities)
end subroutine calc_probs
- subroutine time_module(ntimes, model_dir, model_name, module_load_durations, module_delete_durations)
-
- implicit none
-
- integer, intent(in) :: ntimes
- real(dp), dimension(:), intent(out) :: module_load_durations, module_delete_durations
- integer :: i
- real(dp) :: start_time, end_time
- character(len=*), intent(in) :: model_dir, model_name
- type(torch_module) :: model
-
- do i = 1, ntimes
- ! ------------------------------ Start module load timer ------------------------------
- start_time = omp_get_wtime()
- model = torch_module_load(model_dir//"/"//model_name)
- end_time = omp_get_wtime()
- module_load_durations(i) = end_time - start_time
- ! ------------------------------ End module load timer ------------------------------
-
- ! ------------------------------ Start module deletion timer ------------------------------
- start_time = omp_get_wtime()
- call torch_module_delete(model)
- end_time = omp_get_wtime()
- module_delete_durations(i) = end_time - start_time
- ! ------------------------------ End module deletion timer ------------------------------
- end do
-
- end subroutine time_module
-
end program benchmark_resnet_test
diff --git a/cgdrag_model/run_emulator_davenet.py b/cgdrag_model/run_emulator_davenet.py
index fb92b18..b4b3994 100644
--- a/cgdrag_model/run_emulator_davenet.py
+++ b/cgdrag_model/run_emulator_davenet.py
@@ -4,7 +4,7 @@
It needs in the same directory as `arch_DaveNet.py` which describes the
model architecture, and `network_wst.pkl` which contains the model weights.
"""
-from torch import load, device, no_grad, tensor, float64, jit
+from torch import load, device, no_grad, tensor, float64, jit, device
import arch_davenet as m
@@ -69,22 +69,26 @@ def compute_reshape_drag(*args):
output prellocated in MiMA (128, num_col, 40)
num_col :
# of latitudes on this proc
+ input_device : str
+ Device type ("cpu", "cuda" or "mps"), and optional device ordinal for
+ the device type, to move the input tensors to. Must match device of model.
Returns
-------
Y_out :
Results to be returned to MiMA
"""
- model, wind, lat, p_surf, Y_out, num_col = args
+ model, wind, lat, p_surf, Y_out, num_col, input_device = args
+ input_device = device(input_device)
# Reshape and put all input variables together [wind, lat, p_surf]
- wind_T = tensor(wind)
+ wind_T = tensor(wind).to(input_device)
# lat_T = zeros((imax * num_col, 1), dtype=float64)
- lat_T = tensor(lat)
+ lat_T = tensor(lat).to(input_device)
# pressure_T = zeros((imax * num_col, 1), dtype=float64)
- pressure_T = tensor(p_surf)
+ pressure_T = tensor(p_surf).to(input_device)
# Apply model.
with no_grad():
@@ -94,7 +98,7 @@ def compute_reshape_drag(*args):
temp = model(wind_T, pressure_T, lat_T)
# Place in output array for MiMA.
- Y_out[:, :] = temp
+ Y_out[:, :] = temp.cpu()
del temp
return Y_out
diff --git a/resnet_model/resnet18.py b/resnet_model/resnet18.py
index db8d246..75c0efa 100644
--- a/resnet_model/resnet18.py
+++ b/resnet_model/resnet18.py
@@ -144,15 +144,19 @@ def compute(*args):
ResNet model ready to be deployed.
input_batch : torch.Tensor
Input batch to operate on
+ device : str
+ Device type ("cpu", "cuda" or "mps"), and optional device ordinal for
+ the device type, to move input_batch to. Must match device of model.
Returns
-------
output :
Results from ResNet model
"""
- model, input_batch, result = args
+ model, input_batch, device, result = args
+ device = torch.device(device)
- input_batch = torch.from_numpy(input_batch)
+ input_batch = torch.from_numpy(input_batch).to(device)
# Apply model.
with torch.no_grad():
@@ -160,7 +164,7 @@ def compute(*args):
assert model.training is False
output = model(input_batch)
- result[:, :] = output
+ result[:, :] = output.cpu()
return result
diff --git a/run_benchmarks.sh b/run_benchmarks.sh
index e46f71f..56e7fc2 100755
--- a/run_benchmarks.sh
+++ b/run_benchmarks.sh
@@ -1,19 +1,16 @@
#!/usr/bin/env bash
nrun=1000
-NDIM=256
for n in {1,4,8};
do
export OMP_NUM_THREADS=$n
- ./benchmarker_cgdrag_forpy ../cgdrag_model run_emulator_davenet $nrun 10 --alloc_in_loop | tee cgdrag_forpy_$n.out
- ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_cpu.pt $nrun 10 --alloc_in_loop --explicit_reshape | tee cgdrag_torch_explicit_$n.out
- ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_cpu.pt $nrun 10 | tee cgdrag_torch_implicit_$n.out
+ date;/usr/bin/time -v ./benchmarker_cgdrag_forpy ../cgdrag_model run_emulator_davenet $nrun 10 | tee cgdrag_forpy_$n.out;date
+ date;/usr/bin/time -v ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_cpu.pt $nrun 10 --explicit_reshape | tee cgdrag_torch_explicit_$n.out;date
+ date;/usr/bin/time -v ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_cpu.pt $nrun 10 | tee cgdrag_torch_implicit_$n.out;date
- ./benchmarker_resnet_forpy ../resnet_model resnet18 $nrun 10 | tee resnet_forpy_$n.out
- ./benchmarker_resnet_torch ../resnet_model saved_resnet18_model_cpu.pt $nrun 10 | tee resnet_torch_$n.out
+ date;/usr/bin/time -v ./benchmarker_resnet_forpy ../resnet_model resnet18 $nrun 10 | tee resnet_forpy_$n.out;date
+ date;/usr/bin/time -v ./benchmarker_resnet_torch ../resnet_model saved_resnet18_model_cpu.pt $nrun 10 | tee resnet_torch_$n.out;date
- ./benchmarker_large_stride_forpy ../large_stride_model run_emulator_stride $nrun $NDIM | tee ls_forpy_$n.out
- ./benchmarker_large_stride_torch ../large_stride_model saved_large_stride_model_cpu.pt $nrun $NDIM | tee ls_torch_$n.out
done
diff --git a/run_benchmarks_gpu.sh b/run_benchmarks_gpu.sh
index 4874d1e..79dbafe 100644
--- a/run_benchmarks_gpu.sh
+++ b/run_benchmarks_gpu.sh
@@ -1,19 +1,16 @@
#!/usr/bin/env bash
-nrun=1000
-NDIM=256
+nrun=10000
-for n in {1,4,8};
+for n in {1,8};
do
export OMP_NUM_THREADS=$n
- ./benchmarker_cgdrag_forpy ../cgdrag_model run_emulator_davenet $nrun 10 | tee cgdrag_forpy_$n.out
- ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_gpu.pt $nrun 10 --alloc_in_loop --explicit_reshape --use_cuda | tee cgdrag_torch_explicit_gpu_$n.out
- ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_gpu.pt $nrun 10 --use_cuda | tee cgdrag_torch_implicit_gpu_$n.out
+ date;/usr/bin/time -v ./benchmarker_cgdrag_forpy ../cgdrag_model run_emulator_davenet $nrun 10 --use_cuda | tee cgdrag_forpy_gpu_$n.out;date
+ date;/usr/bin/time -v ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_gpu.pt $nrun 10 --explicit_reshape --use_cuda | tee cgdrag_torch_explicit_gpu_$n.out;date
+ date;/usr/bin/time -v ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_gpu.pt $nrun 10 --use_cuda | tee cgdrag_torch_implicit_gpu_$n.out;date
- ./benchmarker_resnet_forpy ../resnet_model resnet18 $nrun 10 | tee resnet_forpy_$n.out
- ./benchmarker_resnet_torch ../resnet_model saved_resnet18_model_gpu.pt $nrun 10 --use_cuda | tee resnet_torch_gpu_$n.out
+ date;/usr/bin/time -v ./benchmarker_resnet_forpy ../resnet_model resnet18 $nrun 10 --use_cuda | tee resnet_forpy_gpu_$n.out;date
+ date;/usr/bin/time -v ./benchmarker_resnet_torch ../resnet_model saved_resnet18_model_gpu.pt $nrun 10 --use_cuda | tee resnet_torch_gpu_$n.out;date
- ./benchmarker_large_stride_forpy ../large_stride_model run_emulator_stride $nrun $NDIM | tee ls_forpy_$n.out
- ./benchmarker_large_stride_torch ../large_stride_model saved_large_stride_model_gpu.pt $nrun $NDIM --use_cuda | tee ls_torch_gpu_$n.out
done
diff --git a/utils/read_benchmarks.py b/utils/read_benchmarks.py
index 7eb8b6c..843c28e 100644
--- a/utils/read_benchmarks.py
+++ b/utils/read_benchmarks.py
@@ -1,25 +1,26 @@
"""Helper functions to read and plot benchmarking data."""
+from typing import Union
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def read_iteration_data(directory: str, filename: str, labels: list) -> pd.DataFrame:
- """Read benchmarking data from each loop iteration
+ """Read benchmarking data from each loop iteration.
Parameters
----------
- directory: str
+ directory : str
Directory of file containing benchmarking data to be read.
- filename: str
- Path to file containing benchmarking data to be read.
- labels: list
+ filename : str
+ Name of file containing benchmarking data to be read.
+ labels : list
List of labels in output file to read.
List does not need to be complete, but must be given in order of output.
Returns
-------
- df: pd.DataFrame
+ df : pd.DataFrame
Dataframe of durations, with columns corresponding to each input label.
"""
df = pd.DataFrame(columns=labels)
@@ -63,16 +64,16 @@ def read_summary_data(directory: str, filename: str, labels: list) -> dict:
Parameters
----------
- directory: str
+ directory : str
Directory of file containing benchmarking data to be read.
- filename: str
+ filename : str
Path to file containing benchmarking data to be read.
- labels: list
+ labels : list
List of labels to read summary information for.
Returns
-------
- results: dict
+ results : dict
Nested dictionary with keys for each label passed, and nested keys for
the mean, min, max and stddev for each label.
"""
@@ -118,13 +119,13 @@ def read_summary_data(directory: str, filename: str, labels: list) -> dict:
def plot_df(df: pd.DataFrame, labels: list) -> None:
- """Plot scatter plots for each column in input dataframe
+ """Plot scatter plots for each column in input dataframe.
Parameters
----------
- df: pd.DataFrame)
+ df : pd.DataFrame
Dataframe containing data to be plotted.
- labels: list
+ labels : list
List of columns in dataframe to plot.
"""
# Create separate plots for each label.
@@ -138,14 +139,13 @@ def plot_df(df: pd.DataFrame, labels: list) -> None:
def plot_summary_means(data: dict, labels: list) -> None:
- """Plot a bar chart for each labelled duration comparing the files
- specified by keys of the input data.
+ """Plot bar chart comparing durations for specified files and keys.
Parameters
----------
- data: dict
+ data : dict
Dictionary of summary data in the form data[file][label][mean].
- labels: list
+ labels : list
List of summary labels to plot bar charts for.
"""
alpha = 0.9
@@ -174,13 +174,13 @@ def plot_summary_means(data: dict, labels: list) -> None:
def plot_summary_with_stddev(data: dict, labels: list) -> None:
- """Plot scatter plot with error bars of summary data from benchmarking output files
+ """Plot scatter plot with error bars of summary data from benchmarking output files.
Parameters
----------
- data: dict
+ data : dict
Dictionary of summary data in the form data[file][label][mean, stddev].
- labels: list
+ labels : list
List of summary labels to plot on the same graph.
"""
# Loop over each file
@@ -224,3 +224,203 @@ def plot_summary_with_stddev(data: dict, labels: list) -> None:
fontsize=7.5,
)
plt.show()
+
+
+def read_slurm_walltime(filepath: str, labels: list) -> dict:
+ """Read wall-clock times reported by `/usr/bin/time -v` from an output file.
+
+ Parameters
+ ----------
+ filepath : str
+ Path to file containing benchmarking data to be read.
+ labels : list
+ List of all benchmarks run, matching the run order.
+ Typically of the form [model]_[forpy/torch]_[cpu/gpu].
+
+ Returns
+ -------
+ benchmarks : dict
+ Dictionary of elapsed times in seconds, keyed by the matching input label.
+ """
+ print(f"Reading: {filepath}")
+
+ current_label = ""
+ i = 0
+ benchmarks = {}
+ # NOTE(review): labels[i] raises IndexError if the file has more timings than labels.
+ with open(filepath) as f:
+ lines = f.readlines()
+ for line in lines:
+ if "Command being timed" in line:
+ # Cut from 'Command being timed: "./benchmarker_cgdrag_forpy...'
+ # to 'cgdrag_forpy' (drops the 15-character '"./benchmarker_' prefix)
+ current_label = line.split()[3][15:]
+ if "Elapsed (wall clock) time" in line:
+ if current_label in labels[i]:
+ benchmarks[labels[i]] = convert_to_seconds(line.split()[7])
+ i += 1
+
+ return benchmarks
+
+
+def convert_to_seconds(time_str: str):
+    """Turn an elapsed-time string printed by /usr/bin/time into seconds.
+
+    Parameters
+    ----------
+    time_str : str
+        Colon-separated duration, either h:mm:ss or m:ss.
+
+    Returns
+    -------
+    time : float
+        The same duration expressed in seconds.
+    """
+    fields = time_str.split(":")
+    # Anything other than two or three fields is not a format we know.
+    if len(fields) not in (2, 3):
+        raise ValueError("Time format not supported. Expected format: h:mm:ss or m:ss")
+    # Align the weights from the right so m:ss skips the hours weight.
+    weights = (3600, 60, 1)[-len(fields):]
+    total = sum(float(field) * weight for field, weight in zip(fields, weights))
+    return total
+
+
+def plot_walltimes(
+    benchmarks: dict,
+    labels: list,
+    normalise: bool = False,
+    title: Union[str, None] = None,
+    ylabel: Union[str, None] = None,
+    xlabel: Union[str, None] = None,
+    alpha: float = 0.9,
+    bar_width: float = 1.0,
+    yscale: str = "linear",
+    ylim: Union[float, tuple] = 0.0,
+    legend_labels: Union[dict, None] = None,
+    xticklabels: Union[list, None] = None,
+    save_path: Union[str, None] = None,
+) -> None:
+    """Plot bar charts comparing walltimes for all labels given.
+
+    Parameters
+    ----------
+    benchmarks : dict
+        Dictionary of times, with keys corresponding to each input label.
+    labels : list
+        List containing subset of benchmark keys to plot.
+    normalise : bool
+        Whether to normalise data, so the maximum plotted value is 1.
+    title : Union[str, None]
+        Title for plot.
+    ylabel : Union[str, None]
+        Y-axis label for plot. Defaults to "Time / s" when not given.
+    xlabel : Union[str, None]
+        X-axis label for plot.
+    alpha : float
+        Opacity of the bars.
+    bar_width : float
+        Width of each bar.
+    yscale : str
+        Y-axis scale type.
+    ylim : Union[float, tuple]
+        Y-axis value range.
+    legend_labels : Union[dict, None]
+        Dictionary of legend labels for each benchmark. Each key should
+        be present in an item in `labels`, while values specify the legend
+        labels plotted. If empty or None, one unlabelled bar is drawn per label.
+    xticklabels : Union[list, None]
+        List of x-axis tick labels.
+    save_path : Union[str, None]
+        File path to save plot.
+
+    Raises
+    ------
+    ValueError
+        If `benchmarks` is empty, if `len(labels)` is not a multiple of
+        `len(legend_labels)`, or if normalising by a maximum of zero.
+    """
+    if len(benchmarks) == 0:
+        raise ValueError("No data passed in `benchmarks`.")
+
+    # A missing legend mapping means every label gets its own unlabelled bar.
+    if legend_labels is None:
+        legend_labels = {}
+    num_legend_labels = len(legend_labels)
+
+    # Number of bar groups, each with its own xtick. With legend labels, the
+    # benchmark labels must divide evenly into groups of that many bars.
+    if num_legend_labels == 0:
+        num_x_groups = len(labels)
+    elif len(labels) % num_legend_labels != 0:
+        raise ValueError(
+            "The number of labels specified in `legend_labels` must equal or be a"
+            " factor of the number of labels specified in `labels`"
+        )
+    else:
+        num_x_groups = len(labels) // num_legend_labels
+
+    # Central xtick coordinates
+    x = np.arange(num_x_groups)
+
+    # Normalise a copy of the data if requested, leaving the caller's dict intact
+    benchmarks_copy = benchmarks.copy()
+    if normalise:
+        # Only the plotted labels decide the scale factor.
+        max_time = max(
+            (value for key, value in benchmarks_copy.items() if key in labels),
+            default=0.0,
+        )
+        if max_time <= 0.0:
+            raise ValueError("Cannot normalise: no positive time found in `labels`.")
+        for benchmark, value in benchmarks_copy.items():
+            benchmarks_copy[benchmark] = value / max_time
+
+    if num_legend_labels > 0:
+        # Use legend_labels dictionary to extract data for each legend entry
+        group_data = {key: [] for key in legend_labels}  # type: dict
+        for label in labels:
+            for key in legend_labels:
+                if key in label:
+                    group_data[key].append(benchmarks_copy[label])
+
+        # Offsets placing the bars of a group side by side, centred on the xtick:
+        # one bar sits on the tick, two straddle it, three put the middle one on it.
+        centre = (num_legend_labels - 1) / 2
+        offsets = (np.arange(num_legend_labels) - centre) * bar_width
+        # Plot data for each legend entry
+        for offset, key in zip(offsets, legend_labels):
+            plt.bar(
+                x + offset,
+                group_data[key],
+                alpha=alpha,
+                width=bar_width,
+                label=legend_labels[key],
+            )
+    else:
+        # Plot bar for each label
+        data = [benchmarks_copy[label] for label in labels]
+        plt.bar(
+            x,
+            data,
+            alpha=alpha,
+            width=bar_width,
+        )
+
+    plt.xticks(ticks=x, labels=xticklabels)
+    plt.yscale(yscale)
+    plt.ylim(ylim)
+    if ylabel is not None:
+        plt.ylabel(ylabel)
+    else:
+        plt.ylabel("Time / s")
+    if xlabel is not None:
+        plt.xlabel(xlabel)
+    if title is not None:
+        plt.title(title)
+    # Only draw a legend when there are labelled bars to list.
+    if num_legend_labels > 0:
+        plt.legend()
+    if save_path is not None:
+        plt.savefig(save_path, dpi=300)
+    plt.show()
diff --git a/utils/visualise.ipynb b/utils/visualise.ipynb
index e2682fd..4d5b8c1 100644
--- a/utils/visualise.ipynb
+++ b/utils/visualise.ipynb
@@ -15,12 +15,81 @@
"metadata": {},
"outputs": [],
"source": [
- "directory = \"../build/\"\n",
- "files = [\n",
- " \"cgdrag_forpy_1.out\",\n",
- " \"cgdrag_torch_1_explicit.out\",\n",
- " \"cgdrag_torch_1_implicit.out\",\n",
- "]"
+ "directory = \"../../results/\"\n",
+ "\n",
+ "# Icelake CPU results with NoGradMode set in torch_jit_module_forward\n",
+ "filepath_nograd_fix_1k = directory + \"icelake_nograd/output_1000_fix/slurm-34050567.out\"\n",
+ "filepath_nograd_fix_10k = directory + \"icelake_nograd/output_10000_fix/slurm-34051232.out\"\n",
+ "\n",
+ "# ampere (A100 GPU) results with NoGradMode set in torch_jit_module_forward\n",
+ "filepath_nograd_fix_gpu_10k = directory + \"ampere_nograd/output_10000_fix/slurm-34050397.out\"\n",
+ "filepath_nograd_fix_gpu_100k = directory + \"ampere_nograd/output_100000_fix/slurm-34051212.out\"\n",
+ "\n",
+ "# Comparison between gradients enabled, NoGradMode, and NoGradMode with a frozen model, on Sapphire Rapids CPUs\n",
+ "filepath_options_1k = directory + \"sapphire_options_grad/output_1k/slurm-37212483.out\"\n",
+ "filepath_options_10k = directory + \"sapphire_options_grad/output_10k/slurm-37224854.out\"\n",
+ "\n",
+ "# Comparison between gradient enabled, NoGradMode, and NoGradMode with a frozen model, on ampere (A100 GPU)\n",
+ "filepath_options_gpu_10k = directory + \"ampere_options_grad/output_10k/slurm-37429435.out\"\n",
+ "filepath_options_gpu_100k = directory + \"ampere_options_grad/output_100k/slurm-37431441.out\"\n",
+ "\n",
+ "# Comparison between InferenceMode and NoGradMode (both with a frozen model) on Sapphire Rapids CPUs\n",
+ "filepath_infer_1k = directory + \"sapphire_infer/output_1k/slurm-37447583.out\"\n",
+ "filepath_infer_10k = directory + \"sapphire_infer/output_10k/slurm-37449094.out\"\n",
+ "\n",
+ "# Comparison between InferenceMode and NoGradMode (both with a frozen model) on ampere (A100 GPU)\n",
+ "filepath_infer_gpu_10k = directory + \"ampere_infer/output_10k/slurm-37521108.out\"\n",
+ "filepath_infer_gpu_100k = directory + \"ampere_infer/output_100k/slurm-37521956.out\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# FTorch and Forpy on Sapphire Rapids CPUs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Output files here use a significantly modified version of run_benchmarks.sh to allow comparisons between not only Forpy and FTorch, but also switching off gradients and freezing the model, e.g.:\n",
+ "\n",
+ "```bash\n",
+ "nrun=1000\n",
+ "\n",
+ "# Run with gradients on\n",
+ "for n in {1,4,8};\n",
+ "do\n",
+ " export OMP_NUM_THREADS=$n\n",
+ " date;/usr/bin/time -v ./benchmarker_cgdrag_forpy ../cgdrag_model run_emulator_davenet $nrun 10 | tee cgdrag_forpy_$n.out;date\n",
+ " date;/usr/bin/time -v ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_cpu.pt $nrun 10 --explicit_reshape | tee cgdrag_torch_explicit_$n.out;date\n",
+ " date;/usr/bin/time -v ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_cpu.pt $nrun 10 | tee cgdrag_torch_implicit_$n.out;date\n",
+ "\n",
+ " date;/usr/bin/time -v ./benchmarker_resnet_forpy ../resnet_model resnet18 $nrun 10 | tee resnet_forpy_$n.out;date\n",
+ " date;/usr/bin/time -v ./benchmarker_resnet_torch ../resnet_model saved_resnet18_model_cpu.pt $nrun 10 | tee resnet_torch_$n.out;date\n",
+ "done\n",
+ "\n",
+ "cd ../build_sapphire_options_nograd\n",
+ "\n",
+ "# Run with gradients off\n",
+ "for n in {1,4,8};\n",
+ "do\n",
+ " export OMP_NUM_THREADS=$n\n",
+ " date;/usr/bin/time -v ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_cpu.pt $nrun 10 --explicit_reshape | tee cgdrag_torch_explicit_$n.out;date\n",
+ " date;/usr/bin/time -v ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_cpu.pt $nrun 10 | tee cgdrag_torch_implicit_$n.out;date\n",
+ " date;/usr/bin/time -v ./benchmarker_resnet_torch ../resnet_model saved_resnet18_model_cpu.pt $nrun 10 | tee resnet_torch_$n.out;date\n",
+ "done\n",
+ "\n",
+ "# Run with gradients off and use frozen models\n",
+ "for n in {1,4,8};\n",
+ "do\n",
+ " export OMP_NUM_THREADS=$n\n",
+ " date;/usr/bin/time -v ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_freeze_cpu.pt $nrun 10 --explicit_reshape | tee cgdrag_torch_explicit_freeze_$n.out;date\n",
+ " date;/usr/bin/time -v ./benchmarker_cgdrag_torch ../cgdrag_model saved_cgdrag_model_freeze_cpu.pt $nrun 10 | tee cgdrag_torch_implicit_freeze_$n.out;date\n",
+ " date;/usr/bin/time -v ./benchmarker_resnet_torch ../resnet_model saved_resnet18_model_freeze_cpu.pt $nrun 10 | tee resnet_torch_freeze_$n.out;date\n",
+ "done\n",
+ "```"
]
},
{
@@ -29,15 +98,40 @@
"metadata": {},
"outputs": [],
"source": [
- "# Labels can be skipped, but those present must be in order of output \"check iteration [label]\"\n",
- "iteration_labels = [\n",
- " \"inference\",\n",
- " \"create tensors\",\n",
- " \"delete tensors\",\n",
- " \"allocate arrays\",\n",
- " \"deallocate arrays\",\n",
- " \"full loop\",\n",
- "]"
+ "# Create labels matching the order of tests in run_benchmarks.sh\n",
+ "\n",
+ "cpu_options_labels = []\n",
+ "\n",
+ "cpu_tests_grad = [\n",
+ " \"cgdrag_forpy\",\n",
+ " \"cgdrag_torch_explicit\",\n",
+ " \"cgdrag_torch_implicit\",\n",
+ " \"resnet_forpy\",\n",
+ " \"resnet_torch\",\n",
+ "]\n",
+ "\n",
+ "cpu_tests_nograd = [\n",
+ " \"cgdrag_torch_explicit\",\n",
+ " \"cgdrag_torch_implicit\",\n",
+ " \"resnet_torch\",\n",
+ "]\n",
+ "\n",
+ "cpu_tests_nograd_freeze = [\n",
+ " \"cgdrag_torch_explicit\",\n",
+ " \"cgdrag_torch_implicit\",\n",
+ " \"resnet_torch\",\n",
+ "]\n",
+ "cpu_n_threads = [\"1\", \"4\", \"8\"]\n",
+ "\n",
+ "for threads in cpu_n_threads:\n",
+ " for test in cpu_tests_grad:\n",
+ " cpu_options_labels.append(f\"{test}_{threads}_grad\")\n",
+ "for threads in cpu_n_threads:\n",
+ " for test in cpu_tests_nograd:\n",
+ " cpu_options_labels.append(f\"{test}_{threads}_nograd\")\n",
+ "for threads in cpu_n_threads:\n",
+ " for test in cpu_tests_nograd_freeze:\n",
+ " cpu_options_labels.append(f\"{test}_{threads}_freeze\")"
]
},
{
@@ -49,20 +143,22 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Reading: ../build/cgdrag_forpy_1.out\n",
- "Number of runs: 500\n",
- "Reading: ../build/cgdrag_torch_1_explicit.out\n",
- "Number of runs: 500\n",
- "Reading: ../build/cgdrag_torch_1_implicit.out\n",
- "Number of runs: 500\n"
+ "Reading: ../../results/sapphire_options_grad/output_1k/slurm-37212483.out\n",
+ "Reading: ../../results/sapphire_options_grad/output_10k/slurm-37224854.out\n"
]
}
],
"source": [
- "dataframes = {}\n",
+ "benchmarks_options_1k = rb.read_slurm_walltime(filepath_options_1k, cpu_options_labels)\n",
+ "benchmarks_options_10k = rb.read_slurm_walltime(filepath_options_10k, cpu_options_labels)\n",
+ "\n",
+ "\n",
+ "# Convert total walltime to walltime per iteration (1k and 10k iterations respectively)\n",
+ "for key, value in benchmarks_options_1k.items():\n",
+ " benchmarks_options_1k[key] = value / 1000\n",
"\n",
- "for file in files:\n",
- " dataframes[file] = rb.read_iteration_data(directory, file, iteration_labels)"
+ "for key, value in benchmarks_options_10k.items():\n",
+ " benchmarks_options_10k[key] = value / 10000"
]
},
{
@@ -72,175 +168,37 @@
"outputs": [
{
"data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " inference | \n",
- " create tensors | \n",
- " delete tensors | \n",
- " allocate arrays | \n",
- " deallocate arrays | \n",
- " full loop | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0.3735 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0002 | \n",
- " 0.0002 | \n",
- " 0.3766 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0.3501 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0000 | \n",
- " 0.3521 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 0.3442 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0000 | \n",
- " 0.3463 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 0.3540 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0000 | \n",
- " 0.3560 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 0.3402 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0000 | \n",
- " 0.3424 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 495 | \n",
- " 0.4860 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0000 | \n",
- " 0.4881 | \n",
- "
\n",
- " \n",
- " 496 | \n",
- " 0.4618 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0000 | \n",
- " 0.4654 | \n",
- "
\n",
- " \n",
- " 497 | \n",
- " 0.4590 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0000 | \n",
- " 0.4613 | \n",
- "
\n",
- " \n",
- " 498 | \n",
- " 0.4457 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0000 | \n",
- " 0.4479 | \n",
- "
\n",
- " \n",
- " 499 | \n",
- " 0.4261 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0000 | \n",
- " 0.4282 | \n",
- "
\n",
- " \n",
- "
\n",
- "
500 rows × 6 columns
\n",
- "
"
- ],
- "text/plain": [
- " inference create tensors delete tensors allocate arrays \\\n",
- "0 0.3735 0.0 0.0 0.0002 \n",
- "1 0.3501 0.0 0.0 0.0000 \n",
- "2 0.3442 0.0 0.0 0.0000 \n",
- "3 0.3540 0.0 0.0 0.0000 \n",
- "4 0.3402 0.0 0.0 0.0000 \n",
- ".. ... ... ... ... \n",
- "495 0.4860 0.0 0.0 0.0000 \n",
- "496 0.4618 0.0 0.0 0.0000 \n",
- "497 0.4590 0.0 0.0 0.0000 \n",
- "498 0.4457 0.0 0.0 0.0000 \n",
- "499 0.4261 0.0 0.0 0.0000 \n",
- "\n",
- " deallocate arrays full loop \n",
- "0 0.0002 0.3766 \n",
- "1 0.0000 0.3521 \n",
- "2 0.0000 0.3463 \n",
- "3 0.0000 0.3560 \n",
- "4 0.0000 0.3424 \n",
- ".. ... ... \n",
- "495 0.0000 0.4881 \n",
- "496 0.0000 0.4654 \n",
- "497 0.0000 0.4613 \n",
- "498 0.0000 0.4479 \n",
- "499 0.0000 0.4282 \n",
- "\n",
- "[500 rows x 6 columns]"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "dataframes['cgdrag_forpy_1.out']"
+ "image/png": "",
+ "text/plain": [
+ "