From 9d9ae2954d3f1758ea659ed8e614dd28e8a368dd Mon Sep 17 00:00:00 2001 From: Ramesh Errabolu Date: Fri, 10 Nov 2023 13:02:49 -0600 Subject: [PATCH] amdgpu_plugin: Refactor code used to implement Checkpoint Refactor code used to Checkpoint DRM devices. Code is moved into amdgpu_plugin_drm.c file which hosts various methods to checkpoint and restore a workload. Signed-off-by: Ramesh Errabolu --- plugins/amdgpu/amdgpu_plugin.c | 62 ++++++++++++++---------------- plugins/amdgpu/amdgpu_plugin_drm.c | 38 ++++++++++++++++++ plugins/amdgpu/amdgpu_plugin_drm.h | 6 +++ 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 60e04f9735..a579158d0d 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -49,6 +49,13 @@ struct vma_metadata { /************************************ Global Variables ********************************************/ +/** + * FD of KFD device used to checkpoint. On a multi-process + * tree the order of checkpointing goes from parent to child + * and so on - so saving the FD will not be overwritten + */ +static int kfd_checkpoint_fd; + static LIST_HEAD(update_vma_info_list); size_t kfd_max_buffer_size; @@ -990,6 +997,10 @@ static int unpause_process(int fd) goto exit; } + // Reset the KFD FD + kfd_checkpoint_fd = -1; + sys_close_drm_render_devices(&src_topology); + exit: pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret); @@ -1181,44 +1192,25 @@ int amdgpu_plugin_dump_file(int fd, int id) return -1; } + /* Initialize number of device files that will be checkpointed */ + init_gpu_count(&src_topology); + /* Check whether this plugin was called for kfd or render nodes */ if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { + /* This is RenderD dumper plugin, for now just save renderD * minor number to be used during restore. In later phases this * needs to save more data for video decode etc. */ - - CriuRenderNode rd = CRIU_RENDER_NODE__INIT; - struct tp_node *tp_node; - - pr_info("Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), fd, id); - - tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev)); - if (!tp_node) { - pr_err("Failed to find a device with minor number = %d\n", minor(st.st_rdev)); - - return -ENODEV; - } - - rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); - if (!rd.gpu_id) - return -ENODEV; - - len = criu_render_node__get_packed_size(&rd); - buf = xmalloc(len); - if (!buf) - return -ENOMEM; - - criu_render_node__pack(&rd, buf); - - snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); - ret = write_img_file(img_path, buf, len); - if (ret) { - xfree(buf); + ret = amdgpu_plugin_drm_dump_file(fd, id, &st); + if (ret) return ret; - } - xfree(buf); + /* Invoke unpause process if needed */ + decrement_checkpoint_count(); + if (checkpoint_is_complete()) { + ret = unpause_process(kfd_checkpoint_fd); + } /* Need to return success here so that criu can call plugins for renderD nodes */ return ret; @@ -1315,11 +1307,15 @@ int amdgpu_plugin_dump_file(int fd, int id) ret = write_img_file(img_path, buf, len); xfree(buf); + exit: - /* Restore all queues */ - unpause_process(fd); + /* Restore all queues if conditions permit */ + kfd_checkpoint_fd = fd; + decrement_checkpoint_count(); + if (checkpoint_is_complete()) { + ret = unpause_process(fd); + } - sys_close_drm_render_devices(&src_topology); xfree((void *)args.devices); xfree((void *)args.bos); xfree((void *)args.priv_data); diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index a48dc68f0f..689d620720 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -61,3 +61,41 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) } +int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) +{ + CriuRenderNode rd = CRIU_RENDER_NODE__INIT; + struct tp_node *tp_node; + char path[PATH_MAX]; + unsigned char *buf; + int minor; + int len; + int ret; + + /* Get the topology node of the DRM device */ + minor = minor(drm->st_rdev); + tp_node = sys_get_node_by_render_minor(&src_topology, minor); + if (!tp_node) { + pr_err("Failed to find a device with minor number = %d\n", minor); + return -ENODEV; + } + + /* Get the GPU_ID of the DRM device */ + rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); + if (!rd.gpu_id) { + pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id); + return -ENODEV; + } + + len = criu_render_node__get_packed_size(&rd); + buf = xmalloc(len); + if (!buf) + return -ENOMEM; + + criu_render_node__pack(&rd, buf); + + snprintf(path, sizeof(path), IMG_DRM_FILE, id); + ret = write_img_file(path, buf, len); + xfree(buf); + return ret; +} + diff --git a/plugins/amdgpu/amdgpu_plugin_drm.h b/plugins/amdgpu/amdgpu_plugin_drm.h index 37009c8ba7..6f0c1a9a63 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.h +++ b/plugins/amdgpu/amdgpu_plugin_drm.h @@ -17,6 +17,12 @@ */ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm); +/** + * Serialize meta-data about a particular DRM device, its number of BOs, + * etc into a file. The serialized filename has in it the value ID that + * is passed in as a parameter + */ +int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm); #endif /* __AMDGPU_PLUGIN_DRM_H__ */