Skip to content

Commit

Permalink
amdgpu_plugin: Refactor code used to implement Checkpoint
Browse files Browse the repository at this point in the history
Refactor code used to Checkpoint DRM devices. Code is moved
into amdgpu_plugin_drm.c file which hosts various methods to
checkpoint and restore a workload.

Signed-off-by: Ramesh Errabolu <[email protected]>
  • Loading branch information
rerrabolu authored and avagin committed Feb 5, 2024
1 parent 81f2c41 commit 9d9ae29
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 33 deletions.
62 changes: 29 additions & 33 deletions plugins/amdgpu/amdgpu_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ struct vma_metadata {

/************************************ Global Variables ********************************************/

/**
* FD of the KFD device used to checkpoint. On a multi-process
* tree the order of checkpointing goes from parent to child
* and so on - so the saved FD will not be overwritten
*/
static int kfd_checkpoint_fd;

static LIST_HEAD(update_vma_info_list);

size_t kfd_max_buffer_size;
Expand Down Expand Up @@ -990,6 +997,10 @@ static int unpause_process(int fd)
goto exit;
}

// Reset the KFD FD
kfd_checkpoint_fd = -1;
sys_close_drm_render_devices(&src_topology);

exit:
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);

Expand Down Expand Up @@ -1181,44 +1192,25 @@ int amdgpu_plugin_dump_file(int fd, int id)
return -1;
}

/* Initialize number of device files that will be checkpointed */
init_gpu_count(&src_topology);

/* Check whether this plugin was called for kfd or render nodes */
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {

/* This is RenderD dumper plugin, for now just save renderD
* minor number to be used during restore. In later phases this
* needs to save more data for video decode etc.
*/

CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
struct tp_node *tp_node;

pr_info("Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), fd, id);

tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev));
if (!tp_node) {
pr_err("Failed to find a device with minor number = %d\n", minor(st.st_rdev));

return -ENODEV;
}

rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
if (!rd.gpu_id)
return -ENODEV;

len = criu_render_node__get_packed_size(&rd);
buf = xmalloc(len);
if (!buf)
return -ENOMEM;

criu_render_node__pack(&rd, buf);

snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id);
ret = write_img_file(img_path, buf, len);
if (ret) {
xfree(buf);
ret = amdgpu_plugin_drm_dump_file(fd, id, &st);
if (ret)
return ret;
}

xfree(buf);
/* Invoke unpause process if needed */
decrement_checkpoint_count();
if (checkpoint_is_complete()) {
ret = unpause_process(kfd_checkpoint_fd);
}

/* Need to return success here so that criu can call plugins for renderD nodes */
return ret;
Expand Down Expand Up @@ -1315,11 +1307,15 @@ int amdgpu_plugin_dump_file(int fd, int id)
ret = write_img_file(img_path, buf, len);

xfree(buf);

exit:
/* Restore all queues */
unpause_process(fd);
/* Restore all queues if conditions permit */
kfd_checkpoint_fd = fd;
decrement_checkpoint_count();
if (checkpoint_is_complete()) {
ret = unpause_process(fd);
}

sys_close_drm_render_devices(&src_topology);
xfree((void *)args.devices);
xfree((void *)args.bos);
xfree((void *)args.priv_data);
Expand Down
38 changes: 38 additions & 0 deletions plugins/amdgpu/amdgpu_plugin_drm.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,41 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
}


/**
 * Checkpoint a DRM render node by writing a CriuRenderNode image file.
 *
 * The render minor number of the device is looked up in src_topology, its
 * gpu_id is translated through checkpoint_maps, and the resulting
 * CriuRenderNode message is packed and written to the image file whose name
 * is built from IMG_DRM_FILE and @id.
 *
 * @param fd   file descriptor of the DRM device (currently unused here)
 * @param id   identifier embedded in the image file name
 * @param drm  stat information of the DRM device; st_rdev supplies the minor
 *
 * @return the result of write_img_file() on the normal path, -ENODEV when
 *         the device or its mapped gpu_id cannot be found, -ENOMEM when the
 *         serialization buffer cannot be allocated.
 */
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
{
	CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
	struct tp_node *tp_node;
	char path[PATH_MAX];
	unsigned char *buf;
	/* Not named "minor" so it cannot be confused with the minor() macro */
	int render_minor;
	size_t len;
	int ret;

	/* Get the topology node of the DRM device */
	render_minor = minor(drm->st_rdev);
	tp_node = sys_get_node_by_render_minor(&src_topology, render_minor);
	if (!tp_node) {
		pr_err("Failed to find a device with minor number = %d\n", render_minor);
		return -ENODEV;
	}

	/* Get the GPU_ID of the DRM device */
	rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
	if (!rd.gpu_id) {
		/*
		 * rd.gpu_id is zero on this path, so report the source
		 * gpu_id that failed to map instead.
		 */
		pr_err("Failed to find valid gpu_id for the device = 0x%04x\n",
		       (unsigned int)tp_node->gpu_id);
		return -ENODEV;
	}

	/* Serialize the message into a temporary buffer */
	len = criu_render_node__get_packed_size(&rd);
	buf = xmalloc(len);
	if (!buf)
		return -ENOMEM;

	criu_render_node__pack(&rd, buf);

	/* The buffer is released whether or not the write succeeds */
	snprintf(path, sizeof(path), IMG_DRM_FILE, id);
	ret = write_img_file(path, buf, len);
	xfree(buf);
	return ret;
}

6 changes: 6 additions & 0 deletions plugins/amdgpu/amdgpu_plugin_drm.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
*/
int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm);

/**
* Serialize meta-data about a particular DRM device, its number of BOs,
* etc into a file. The serialized filename has in it the value ID that
* is passed in as a parameter
*
* NOTE(review): the implementation shown with this change packs only the
* mapped gpu_id of the device into a CriuRenderNode message; BO data does
* not appear to be serialized yet - confirm before relying on it.
*
* @param fd   file descriptor of the DRM device being checkpointed
* @param id   value embedded in the name of the image file
* @param drm  stat information of the DRM device; its st_rdev minor number
*             is used to look up the device
*
* @return result of writing the image file, -ENODEV if the device or its
*         mapped gpu_id cannot be found, -ENOMEM on allocation failure
*/
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm);

#endif /* __AMDGPU_PLUGIN_DRM_H__ */

0 comments on commit 9d9ae29

Please sign in to comment.