Time full program runtime #34

Open: wants to merge 18 commits into main
116 changes: 44 additions & 72 deletions benchmark_cgdrag/benchmarker_cgdrag_forpy.f90
@@ -24,8 +24,8 @@ subroutine main()

integer :: i, j, n
real(dp) :: start_time, end_time, start_loop_time, end_loop_time
real(dp), dimension(:), allocatable :: module_load_durations, module_delete_durations, loop_durations, inference_durations
real(dp), dimension(:), allocatable :: allocation_durations, deallocation_durations, tensor_creation_durations, tensor_deletion_durations
real(dp), dimension(:), allocatable :: loop_durations, inference_durations, allocation_durations
real(dp), dimension(:), allocatable :: deallocation_durations, tensor_creation_durations, tensor_deletion_durations
real(dp), dimension(:,:), allocatable :: all_durations
character(len=20), dimension(:), allocatable :: messages

@@ -47,6 +47,8 @@ subroutine main()
character(len=:), allocatable :: model_dir, model_name
character(len=128) :: msg1, msg2, msg3, msg4, msg5, msg6
integer :: ntimes
character(len=10) :: input_device
logical :: use_cuda = .false.

type(ndarray) :: uuu_nd, vvv_nd, gwfcng_x_nd, gwfcng_y_nd, lat_nd, psfc_nd

@@ -56,16 +56,22 @@

print *, "====== FORPY ======"

call setup(model_dir, model_name, ntimes, n, alloc_in_loop)
call setup(model_dir, model_name, ntimes, n, alloc_in_loop, use_cuda=use_cuda)
if (ntimes .lt. 2) then
write(*,*) "Error: ntimes must be at least 2"
return
end if

if (use_cuda) then
input_device = "cuda"
else
input_device = "cpu"
end if

! Allocate arrays shared with FTorch implementation and read in data
call init_common_arrays(ntimes, I_MAX, J_MAX, K_MAX, uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, &
module_load_durations, module_delete_durations, loop_durations, allocation_durations, deallocation_durations, &
tensor_creation_durations, tensor_deletion_durations, inference_durations, all_durations, messages, &
loop_durations, allocation_durations, deallocation_durations, tensor_creation_durations, &
tensor_deletion_durations, inference_durations, all_durations, messages, &
start_loop_time, end_loop_time, start_time, end_time)

! Reshape arrays, if not done for every loop
@@ -80,7 +88,7 @@
#else
print *, "generate model in python runtime"
#endif
call load_module(model_dir, model_name, run_emulator, model)
call load_module(model_dir, model_name, run_emulator, model, use_cuda)

do i = 1, ntimes

@@ -108,13 +116,14 @@
ie = ndarray_create_nocopy(gwfcng_y_nd, gwfcng_y_flattened)

! create model input args as tuple
ie = tuple_create(args,6)
ie = tuple_create(args, 7)
ie = args%setitem(0, model)
ie = args%setitem(1, uuu_nd)
ie = args%setitem(2, lat_nd)
ie = args%setitem(3, psfc_nd)
ie = args%setitem(4, gwfcng_x_nd)
ie = args%setitem(5, J_MAX)
ie = args%setitem(6, trim(input_device))
end_time = omp_get_wtime()
tensor_creation_durations(i) = end_time - start_time
! ------------------------------ End tensor creation timer ------------------------------
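Note for readers unfamiliar with forpy: the pattern in this hunk is to wrap the Fortran arrays as NumPy views without copying, pack the positional arguments (now including the device string) into a tuple, call into Python, then destroy the wrappers. Below is a minimal standalone sketch of that pattern; "my_emulator" and "run" are placeholders, not the benchmark's actual module or entry point.

! ---- Illustrative forpy sketch (not part of this PR) ----
! Pass a Fortran array plus a device string ("cpu" or "cuda") to a Python
! function. The module name "my_emulator" and function "run" are placeholders.
program forpy_device_example
  use forpy_mod
  implicit none
  integer, parameter :: dp = kind(1.0d0)
  real(dp), dimension(8) :: data
  logical, parameter :: use_cuda = .false.
  character(len=4) :: device
  type(module_py) :: pymod
  type(ndarray)   :: data_nd
  type(tuple)     :: args
  type(object)    :: ret
  integer :: ie

  data = 0.0_dp
  ie = forpy_initialize()

  if (use_cuda) then
    device = "cuda"
  else
    device = "cpu"
  end if

  ie = import_py(pymod, "my_emulator")       ! placeholder Python module
  ie = ndarray_create_nocopy(data_nd, data)  ! zero-copy NumPy view of data
  ie = tuple_create(args, 2)
  ie = args%setitem(0, data_nd)
  ie = args%setitem(1, trim(device))         ! device string passed as last arg
  ie = call_py(ret, pymod, "run", args)      ! placeholder function name
  if (ie /= 0) call err_print

  call ret%destroy
  call args%destroy
  call data_nd%destroy
  call pymod%destroy
  call forpy_finalize
end program forpy_device_example
! ---- end sketch ----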
@@ -210,71 +219,36 @@

end do

call time_module(ntimes, model_dir, model_name, module_load_durations, module_delete_durations, run_emulator, model)
call forpy_finalize

! Call individual print for loop, to avoid adding to combined mean
call print_time_stats(loop_durations, "full loop")

all_durations(:, 1) = module_load_durations
all_durations(:, 2) = module_delete_durations
all_durations(:, 3) = allocation_durations
all_durations(:, 4) = deallocation_durations
all_durations(:, 5) = tensor_creation_durations
all_durations(:, 6) = tensor_deletion_durations
all_durations(:, 7) = inference_durations
messages = [character(len=20) :: "module creation", "module deletion", "array allocation", "array deallocation", &
all_durations(:, 1) = allocation_durations
all_durations(:, 2) = deallocation_durations
all_durations(:, 3) = tensor_creation_durations
all_durations(:, 4) = tensor_deletion_durations
all_durations(:, 5) = inference_durations
messages = [character(len=20) :: "array allocation", "array deallocation", &
"tensor creation", "tensor deletion", "forward pass"]
call print_all_time_stats(all_durations, messages)

call deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, module_load_durations, &
module_delete_durations, loop_durations, allocation_durations, deallocation_durations, &
tensor_creation_durations, tensor_deletion_durations, inference_durations, all_durations, messages)
call deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, &
loop_durations, allocation_durations, deallocation_durations, tensor_creation_durations, &
tensor_deletion_durations, inference_durations, all_durations, messages)

if (.not. alloc_in_loop) then
call deallocate_reshaped_arrays(uuu_flattened, vvv_flattened, lat_reshaped, psfc_reshaped, gwfcng_x_flattened, gwfcng_y_flattened)
end if

end subroutine main

subroutine time_module(ntimes, model_dir, model_name, module_load_durations, module_delete_durations, run_emulator, model)

implicit none

integer, intent(in) :: ntimes
character(len=*), intent(in) :: model_dir, model_name
real(dp), dimension(:), intent(inout) :: module_load_durations, module_delete_durations
type(module_py), intent(out) :: run_emulator
type(object), intent(out) :: model

integer :: i
real(dp) :: start_time, end_time

do i = 1, ntimes
! ------------------------------ Start module load timer ------------------------------
start_time = omp_get_wtime()
call load_module(model_dir, model_name, run_emulator, model)
end_time = omp_get_wtime()
module_load_durations(i) = end_time - start_time
! ------------------------------ End module load timer ------------------------------

! ------------------------------ Start module deletion timer ------------------------------
! We can only call forpy_finalize once
if (i == ntimes) then
start_time = omp_get_wtime()
call forpy_finalize
end_time = omp_get_wtime()
module_delete_durations(:) = (end_time - start_time) / (ntimes + 1)
end if
! ------------------------------ End module deletion timer ------------------------------
end do

end subroutine time_module

subroutine load_module(model_dir, model_name, run_emulator, model)
subroutine load_module(model_dir, model_name, run_emulator, model, use_cuda)

implicit none

character(len=*), intent(in) :: model_dir, model_name
logical, intent(in) :: use_cuda
type(module_py), intent(out) :: run_emulator
type(object), intent(out) :: model

@@ -301,7 +275,11 @@ subroutine load_module(model_dir, model_name, run_emulator, model)
#ifdef USETS
! load torchscript saved model
ie = tuple_create(args,1)
ie = str_create(filename, trim(model_dir//"/"//"saved_cgdrag_model_cpu.pt"))
if (use_cuda) then
ie = str_create(filename, trim(model_dir//"/"//"saved_cgdrag_model_gpu.pt"))
else
ie = str_create(filename, trim(model_dir//"/"//"saved_cgdrag_model_cpu.pt"))
end if
ie = args%setitem(0, filename)
ie = call_py(model, run_emulator, "initialize_ts", args)
call args%destroy
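Minor side note on the branch above: because the two TorchScript filenames happen to have the same length, the if/else could be collapsed with merge. A sketch of the alternative, untested and offered only as a suggestion:

! ---- Illustrative alternative (not part of this PR) ----
! merge() picks the GPU or CPU filename; this works only because both string
! literals have identical length.
ie = str_create(filename, trim(model_dir//"/"// &
     merge("saved_cgdrag_model_gpu.pt", "saved_cgdrag_model_cpu.pt", use_cuda)))
! ---- end sketch ----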
@@ -318,8 +296,8 @@
end subroutine load_module

subroutine init_common_arrays(ntimes, I_MAX, J_MAX, K_MAX, uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, &
module_load_durations, module_delete_durations, loop_durations, allocation_durations, &
deallocation_durations, tensor_creation_durations, tensor_deletion_durations, inference_durations, &
loop_durations, allocation_durations, deallocation_durations, &
tensor_creation_durations, tensor_deletion_durations, inference_durations, &
all_durations, messages, start_loop_time, end_loop_time, start_time, end_time)

implicit none
@@ -330,8 +308,8 @@ subroutine init_common_arrays(ntimes, I_MAX, J_MAX, K_MAX, uuu, vvv, gwfcng_x, g
real(wp), intent(out), dimension(:,:,:), allocatable :: gwfcng_x_ref, gwfcng_y_ref
real(wp), intent(out), dimension(:,:), allocatable :: lat, psfc

real(dp), intent(out), dimension(:), allocatable :: module_load_durations, module_delete_durations, loop_durations, inference_durations
real(dp), intent(out), dimension(:), allocatable :: allocation_durations, deallocation_durations, tensor_creation_durations, tensor_deletion_durations
real(dp), intent(out), dimension(:), allocatable :: loop_durations, inference_durations, allocation_durations
real(dp), intent(out), dimension(:), allocatable :: deallocation_durations, tensor_creation_durations, tensor_deletion_durations
real(dp), intent(out), dimension(:,:), allocatable :: all_durations
character(len=20), intent(out), dimension(:), allocatable :: messages

@@ -385,20 +363,16 @@ subroutine init_common_arrays(ntimes, I_MAX, J_MAX, K_MAX, uuu, vvv, gwfcng_x, g
close(15)

! Allocate arrays for timings
allocate(module_load_durations(ntimes))
allocate(module_delete_durations(ntimes))
allocate(loop_durations(ntimes))
allocate(allocation_durations(ntimes))
allocate(deallocation_durations(ntimes))
allocate(tensor_creation_durations(ntimes))
allocate(tensor_deletion_durations(ntimes))
allocate(inference_durations(ntimes))
allocate(all_durations(ntimes, 7))
allocate(messages(7))
allocate(all_durations(ntimes, 5))
allocate(messages(5))

! Initialise timings with arbitrary large values
module_load_durations(:) = 100.
module_delete_durations(:) = 100.
loop_durations(:) = 100.
allocation_durations(:) = 100.
deallocation_durations(:) = 100.
@@ -445,23 +419,21 @@ subroutine init_reshaped_arrays(I_MAX, J_MAX, K_MAX, uuu, vvv, lat, psfc, uuu_fl

end subroutine init_reshaped_arrays

subroutine deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, module_load_durations, &
module_delete_durations, loop_durations, allocation_durations, deallocation_durations, &
tensor_creation_durations, tensor_deletion_durations, inference_durations, all_durations, messages)
subroutine deallocate_common_arrays(uuu, vvv, gwfcng_x, gwfcng_y, gwfcng_x_ref, gwfcng_y_ref, lat, psfc, &
loop_durations, allocation_durations, deallocation_durations, tensor_creation_durations, &
tensor_deletion_durations, inference_durations, all_durations, messages)

implicit none

real(dp), intent(inout), dimension(:), allocatable :: module_load_durations, module_delete_durations, loop_durations, inference_durations
real(dp), intent(inout), dimension(:), allocatable :: allocation_durations, deallocation_durations, tensor_creation_durations, tensor_deletion_durations
real(dp), intent(inout), dimension(:), allocatable :: loop_durations, inference_durations, allocation_durations
real(dp), intent(inout), dimension(:), allocatable :: deallocation_durations, tensor_creation_durations, tensor_deletion_durations
real(dp), intent(inout), dimension(:,:), allocatable :: all_durations
character(len=20), intent(inout), dimension(:), allocatable :: messages

real(wp), intent(inout), dimension(:,:,:), allocatable :: uuu, vvv, gwfcng_x, gwfcng_y
real(wp), intent(inout), dimension(:,:,:), allocatable :: gwfcng_x_ref, gwfcng_y_ref
real(wp), intent(inout), dimension(:,:), allocatable :: lat, psfc

deallocate(module_load_durations)
deallocate(module_delete_durations)
deallocate(loop_durations)
deallocate(allocation_durations)
deallocate(deallocation_durations)
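For context, print_time_stats and print_all_time_stats are defined outside the lines shown here (in the benchmark's shared utilities); the kind of summary such a routine produces is roughly a mean/min/max over the recorded durations. The sketch below is purely illustrative and not the project's actual implementation; it assumes the first iteration is skipped as warm-up, which is consistent with the "ntimes must be at least 2" check above but not confirmed by this diff.

! ---- Illustrative sketch (not part of this PR) ----
! Report mean/min/max of a set of timings, skipping iteration 1, which often
! carries one-off warm-up cost.
program timing_summary_example
  implicit none
  integer, parameter :: dp = kind(1.0d0)
  real(dp), dimension(5) :: durations = [0.90_dp, 0.31_dp, 0.30_dp, 0.32_dp, 0.29_dp]

  call summarise(durations, "forward pass")

contains

  subroutine summarise(d, label)
    real(dp), dimension(:), intent(in) :: d
    character(len=*), intent(in) :: label
    integer :: n
    n = size(d)
    if (n < 2) return
    write(*, '(a, ": mean = ", es12.5, "  min = ", es12.5, "  max = ", es12.5, " s")') &
         trim(label), sum(d(2:n)) / real(n - 1, dp), minval(d(2:n)), maxval(d(2:n))
  end subroutine summarise

end program timing_summary_example
! ---- end sketch ----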