diff --git a/crates/libcgroups/src/common.rs b/crates/libcgroups/src/common.rs index aba574f58..a0d20424d 100644 --- a/crates/libcgroups/src/common.rs +++ b/crates/libcgroups/src/common.rs @@ -316,17 +316,22 @@ pub enum CreateCgroupSetupError { Systemd(#[from] systemd::manager::SystemdManagerError), } -pub fn create_cgroup_manager>( - cgroup_path: P, - systemd_cgroup: bool, - container_name: &str, +#[derive(Clone)] +pub struct CgroupConfig { + pub cgroup_path: PathBuf, + pub systemd_cgroup: bool, + pub container_name: String, +} + +pub fn create_cgroup_manager( + config: CgroupConfig, ) -> Result { let cgroup_setup = get_cgroup_setup().map_err(|err| match err { GetCgroupSetupError::WrappedIo(err) => CreateCgroupSetupError::WrappedIo(err), GetCgroupSetupError::NonDefault => CreateCgroupSetupError::NonDefault, GetCgroupSetupError::FailedToDetect => CreateCgroupSetupError::FailedToDetect, })?; - let cgroup_path = cgroup_path.into(); + let cgroup_path = config.cgroup_path.as_path(); match cgroup_setup { CgroupSetup::Legacy | CgroupSetup::Hybrid => { @@ -334,17 +339,17 @@ pub fn create_cgroup_manager>( } CgroupSetup::Unified => { // ref https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#cgroups-path - if cgroup_path.is_absolute() || !systemd_cgroup { + if cgroup_path.is_absolute() || !config.systemd_cgroup { return Ok(create_v2_cgroup_manager(cgroup_path)?.any()); } - Ok(create_systemd_cgroup_manager(cgroup_path, container_name)?.any()) + Ok(create_systemd_cgroup_manager(cgroup_path, config.container_name.as_str())?.any()) } } } #[cfg(feature = "v1")] fn create_v1_cgroup_manager( - cgroup_path: PathBuf, + cgroup_path: &Path, ) -> Result { tracing::info!("cgroup manager V1 will be used"); v1::manager::Manager::new(cgroup_path) @@ -352,29 +357,29 @@ fn create_v1_cgroup_manager( #[cfg(not(feature = "v1"))] fn create_v1_cgroup_manager( - _cgroup_path: PathBuf, + _cgroup_path: &Path, ) -> Result { Err(v1::manager::V1ManagerError::NotEnabled) } #[cfg(feature = "v2")] fn create_v2_cgroup_manager( - cgroup_path: PathBuf, + cgroup_path: &Path, ) -> Result { tracing::info!("cgroup manager V2 will be used"); - v2::manager::Manager::new(DEFAULT_CGROUP_ROOT.into(), cgroup_path) + v2::manager::Manager::new(DEFAULT_CGROUP_ROOT.into(), cgroup_path.to_owned()) } #[cfg(not(feature = "v2"))] fn create_v2_cgroup_manager( - _cgroup_path: PathBuf, + _cgroup_path: &Path, ) -> Result { Err(v2::manager::V2ManagerError::NotEnabled) } #[cfg(feature = "systemd")] fn create_systemd_cgroup_manager( - cgroup_path: PathBuf, + cgroup_path: &Path, container_name: &str, ) -> Result { if !systemd::booted() { @@ -391,7 +396,7 @@ fn create_systemd_cgroup_manager( ); systemd::manager::Manager::new( DEFAULT_CGROUP_ROOT.into(), - cgroup_path, + cgroup_path.to_owned(), container_name.into(), use_system, ) @@ -399,7 +404,7 @@ fn create_systemd_cgroup_manager( #[cfg(not(feature = "systemd"))] fn create_systemd_cgroup_manager( - _cgroup_path: PathBuf, + _cgroup_path: &Path, _container_name: &str, ) -> Result { Err(systemd::manager::SystemdManagerError::NotEnabled) diff --git a/crates/libcgroups/src/systemd/manager.rs b/crates/libcgroups/src/systemd/manager.rs index 21a2e60d8..286b2fe34 100644 --- a/crates/libcgroups/src/systemd/manager.rs +++ b/crates/libcgroups/src/systemd/manager.rs @@ -180,7 +180,7 @@ impl Manager { container_name: String, use_system: bool, ) -> Result { - let mut destructured_path = cgroups_path.as_path().try_into()?; + let mut destructured_path: CgroupsPath = cgroups_path.as_path().try_into()?; ensure_parent_unit(&mut destructured_path, use_system); let client = match use_system { diff --git a/crates/libcgroups/src/v1/manager.rs b/crates/libcgroups/src/v1/manager.rs index e1376a883..022ffbbcb 100644 --- a/crates/libcgroups/src/v1/manager.rs +++ b/crates/libcgroups/src/v1/manager.rs @@ -82,10 +82,10 @@ pub enum V1ManagerError { impl Manager { /// Constructs a new cgroup manager with cgroups_path being relative to the root of the subsystem - pub fn new(cgroup_path: PathBuf) -> Result { + pub fn new(cgroup_path: &Path) -> Result { let mut subsystems = HashMap::::new(); for subsystem in CONTROLLERS { - if let Ok(subsystem_path) = Self::get_subsystem_path(&cgroup_path, subsystem) { + if let Ok(subsystem_path) = Self::get_subsystem_path(cgroup_path, subsystem) { subsystems.insert(*subsystem, subsystem_path); } else { tracing::warn!("cgroup {} not supported on this system", subsystem); diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 96d90266b..9122fc9f3 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -73,11 +73,11 @@ impl<'a> ContainerBuilderImpl<'a> { &self.container_id, self.rootless.is_some(), ); - let cmanager = libcgroups::common::create_cgroup_manager( - cgroups_path, - self.use_systemd || self.rootless.is_some(), - &self.container_id, - )?; + let cgroup_config = libcgroups::common::CgroupConfig { + cgroup_path: cgroups_path, + systemd_cgroup: self.use_systemd || self.rootless.is_some(), + container_name: self.container_id.to_owned(), + }; let process = self .spec .process() @@ -93,8 +93,10 @@ impl<'a> ContainerBuilderImpl<'a> { // Need to create the notify socket before we pivot root, since the unix // domain socket used here is outside of the rootfs of container. During // exec, need to create the socket before we enter into existing mount - // namespace. - let notify_socket: NotifyListener = NotifyListener::new(&self.notify_path)?; + // namespace. We also need to create to socket before entering into the + // user namespace in the case that the path is located in paths only + // root can access. + let notify_listener = NotifyListener::new(&self.notify_path)?; // If Out-of-memory score adjustment is set in specification. set the score // value for the current process check @@ -139,11 +141,11 @@ impl<'a> ContainerBuilderImpl<'a> { spec: self.spec, rootfs: &self.rootfs, console_socket: self.console_socket, - notify_socket, + notify_listener, preserve_fds: self.preserve_fds, container: &self.container, rootless: &self.rootless, - cgroup_manager: cmanager, + cgroup_config, detached: self.detached, executor_manager: &self.executor_manager, }; @@ -184,11 +186,12 @@ impl<'a> ContainerBuilderImpl<'a> { &self.container_id, self.rootless.is_some(), ); - let cmanager = libcgroups::common::create_cgroup_manager( - cgroups_path, - self.use_systemd || self.rootless.is_some(), - &self.container_id, - )?; + let cmanager = + libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { + cgroup_path: cgroups_path, + systemd_cgroup: self.use_systemd || self.rootless.is_some(), + container_name: self.container_id.to_string(), + })?; let mut errors = Vec::new(); diff --git a/crates/libcontainer/src/container/container_delete.rs b/crates/libcontainer/src/container/container_delete.rs index 0e25a5456..0322fd392 100644 --- a/crates/libcontainer/src/container/container_delete.rs +++ b/crates/libcontainer/src/container/container_delete.rs @@ -81,11 +81,12 @@ impl Container { // remove the cgroup created for the container // check https://man7.org/linux/man-pages/man7/cgroups.7.html // creating and removing cgroups section for more information on cgroups - let use_systemd = self.systemd(); let cmanager = libcgroups::common::create_cgroup_manager( - &config.cgroup_path, - use_systemd, - self.id(), + libcgroups::common::CgroupConfig { + cgroup_path: config.cgroup_path.to_owned(), + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + }, )?; cmanager.remove().map_err(|err| { tracing::error!(cgroup_path = ?config.cgroup_path, "failed to remove cgroup due to: {err:?}"); diff --git a/crates/libcontainer/src/container/container_events.rs b/crates/libcontainer/src/container/container_events.rs index ca4124bc0..ed253be3a 100644 --- a/crates/libcontainer/src/container/container_events.rs +++ b/crates/libcontainer/src/container/container_events.rs @@ -33,11 +33,12 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); - let cgroup_manager = - libcgroups::common::create_cgroup_manager(cgroups_path, use_systemd, self.id())?; + libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path, + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; match stats { true => { let stats = cgroup_manager.stats()?; diff --git a/crates/libcontainer/src/container/container_kill.rs b/crates/libcontainer/src/container/container_kill.rs index faf7b89fe..6b5d08540 100644 --- a/crates/libcontainer/src/container/container_kill.rs +++ b/crates/libcontainer/src/container/container_kill.rs @@ -1,6 +1,6 @@ use super::{Container, ContainerStatus}; use crate::{error::LibcontainerError, signal::Signal}; -use libcgroups::common::{create_cgroup_manager, get_cgroup_setup, CgroupManager}; +use libcgroups::common::{get_cgroup_setup, CgroupManager}; use nix::sys::signal::{self}; impl Container { @@ -78,10 +78,14 @@ impl Container { match get_cgroup_setup()? { libcgroups::common::CgroupSetup::Legacy | libcgroups::common::CgroupSetup::Hybrid => { - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); - let cmanger = create_cgroup_manager(cgroups_path, use_systemd, self.id())?; - cmanger.freeze(libcgroups::common::FreezerState::Thawed)?; + let cmanager = libcgroups::common::create_cgroup_manager( + libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path, + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + }, + )?; + cmanager.freeze(libcgroups::common::FreezerState::Thawed)?; } libcgroups::common::CgroupSetup::Unified => {} } @@ -91,11 +95,14 @@ impl Container { fn kill_all_processes>(&self, signal: S) -> Result<(), LibcontainerError> { let signal = signal.into().into_raw(); - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); - let cmanger = create_cgroup_manager(cgroups_path, use_systemd, self.id())?; + let cmanager = + libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path, + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; - if let Err(e) = cmanger.freeze(libcgroups::common::FreezerState::Frozen) { + if let Err(e) = cmanager.freeze(libcgroups::common::FreezerState::Frozen) { tracing::warn!( err = ?e, id = ?self.id(), @@ -103,7 +110,7 @@ impl Container { ); } - let pids = cmanger.get_all_pids()?; + let pids = cmanager.get_all_pids()?; pids.iter() .try_for_each(|&pid| { tracing::debug!("kill signal {} to {}", signal, pid); @@ -117,7 +124,7 @@ impl Container { } }) .map_err(LibcontainerError::OtherSyscall)?; - if let Err(err) = cmanger.freeze(libcgroups::common::FreezerState::Thawed) { + if let Err(err) = cmanager.freeze(libcgroups::common::FreezerState::Thawed) { tracing::warn!( err = ?err, id = ?self.id(), diff --git a/crates/libcontainer/src/container/container_pause.rs b/crates/libcontainer/src/container/container_pause.rs index 462f3d475..8e7248e46 100644 --- a/crates/libcontainer/src/container/container_pause.rs +++ b/crates/libcontainer/src/container/container_pause.rs @@ -32,10 +32,12 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); let cmanager = - libcgroups::common::create_cgroup_manager(cgroups_path, use_systemd, self.id())?; + libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path, + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; cmanager.freeze(FreezerState::Frozen)?; tracing::debug!("saving paused status"); diff --git a/crates/libcontainer/src/container/container_resume.rs b/crates/libcontainer/src/container/container_resume.rs index 9e3dae5d8..544b8e8e5 100644 --- a/crates/libcontainer/src/container/container_resume.rs +++ b/crates/libcontainer/src/container/container_resume.rs @@ -34,10 +34,12 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); let cmanager = - libcgroups::common::create_cgroup_manager(cgroups_path, use_systemd, self.id())?; + libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path, + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; // resume the frozen container cmanager.freeze(FreezerState::Thawed)?; diff --git a/crates/libcontainer/src/notify_socket.rs b/crates/libcontainer/src/notify_socket.rs index 9daf16b89..64837ca87 100644 --- a/crates/libcontainer/src/notify_socket.rs +++ b/crates/libcontainer/src/notify_socket.rs @@ -1,6 +1,7 @@ use nix::unistd::{self, close}; use std::env; use std::io::prelude::*; +use std::os::fd::FromRawFd; use std::os::unix::io::AsRawFd; use std::os::unix::net::{UnixListener, UnixStream}; use std::path::{Path, PathBuf}; @@ -9,7 +10,7 @@ pub const NOTIFY_FILE: &str = "notify.sock"; #[derive(Debug, thiserror::Error)] pub enum NotifyListenerError { - #[error("failed to chdir while creating notify socket")] + #[error("failed to chdir {path} while creating notify socket: {source}")] Chdir { source: nix::Error, path: PathBuf }, #[error("invalid path: {0}")] InvalidPath(PathBuf), @@ -43,6 +44,7 @@ pub struct NotifyListener { impl NotifyListener { pub fn new(socket_path: &Path) -> Result { + tracing::debug!(?socket_path, "create notify listener"); // Unix domain socket has a maximum length of 108, different from // normal path length of 255. Due to how docker create the path name // to the container working directory, there is a high chance that @@ -56,6 +58,7 @@ impl NotifyListener { .file_name() .ok_or_else(|| NotifyListenerError::InvalidPath(socket_path.to_owned()))?; let cwd = env::current_dir().map_err(NotifyListenerError::GetCwd)?; + tracing::debug!(?cwd, "the cwd to create the notify socket"); unistd::chdir(workdir).map_err(|e| NotifyListenerError::Chdir { source: e, path: workdir.to_owned(), @@ -93,6 +96,23 @@ impl NotifyListener { } } +impl Clone for NotifyListener { + fn clone(&self) -> Self { + let fd = self.socket.as_raw_fd(); + // This is safe because we just duplicate a valid fd. Theoretically, to + // truly clone a unix listener, we have to use dup(2) to duplicate the + // fd, and then use from_raw_fd to create a new UnixListener. However, + // for our purposes, fd is just an integer to pass around for the same + // socket. Our main usage is to pass the notify_listener across process + // boundary. Since fd tables are cloned during clone/fork calls, this + // should be safe to use, as long as we be careful with not closing the + // same fd in different places. If we observe an issue, we will switch + // to `dup`. + let socket = unsafe { UnixListener::from_raw_fd(fd) }; + Self { socket } + } +} + pub struct NotifySocket { path: PathBuf, } @@ -135,3 +155,36 @@ impl NotifySocket { Ok(()) } } + +#[cfg(test)] +mod test { + use tempfile::tempdir; + + use super::*; + + #[test] + /// Test that the listener can be cloned and function correctly. This test + /// also serves as a test for the normal case. + fn test_notify_listener_clone() { + let tempdir = tempdir().unwrap(); + let socket_path = tempdir.path().join("notify.sock"); + // listener needs to be created first because it will create the socket. + let listener = NotifyListener::new(&socket_path).unwrap(); + let mut socket = NotifySocket::new(socket_path.clone()); + // This is safe without race because the unix domain socket is already + // created. It is OK for the socket to send the start notification + // before the listener wait is called. + let thread_handle = std::thread::spawn({ + move || { + // We clone the listener and listen on the cloned listener to + // make sure the cloned fd functions correctly. + let cloned_listener = listener.clone(); + cloned_listener.wait_for_container_start().unwrap(); + cloned_listener.close().unwrap(); + } + }); + + socket.notify_container_start().unwrap(); + thread_handle.join().unwrap(); + } +} diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index e59d75f46..f616c7c59 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -1,12 +1,13 @@ -use libcgroups::common::AnyCgroupManager; +use libcgroups::common::CgroupConfig; use oci_spec::runtime::Spec; use std::os::unix::prelude::RawFd; use std::path::PathBuf; +use crate::container::Container; +use crate::notify_socket::NotifyListener; use crate::rootless::Rootless; use crate::syscall::syscall::SyscallType; use crate::workload::ExecutorManager; -use crate::{container::Container, notify_socket::NotifyListener}; #[derive(Debug, Copy, Clone)] pub enum ContainerType { @@ -26,15 +27,15 @@ pub struct ContainerArgs<'a> { /// Socket to communicate the file descriptor of the ptty pub console_socket: Option, /// The Unix Domain Socket to communicate container start - pub notify_socket: NotifyListener, + pub notify_listener: NotifyListener, /// File descriptors preserved/passed to the container init process. pub preserve_fds: i32, /// Container state pub container: &'a Option, /// Options for rootless containers pub rootless: &'a Option>, - /// Cgroup Manager - pub cgroup_manager: AnyCgroupManager, + /// Cgroup Manager Config + pub cgroup_config: CgroupConfig, /// If the container is to be run in detached mode pub detached: bool, /// Manage the functions that actually run on the container diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 73065ab3d..a25a67355 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -341,6 +341,7 @@ pub fn container_init_process( let hooks = spec.hooks().as_ref(); let container = args.container.as_ref(); let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; + let notify_listener = &args.notify_listener; setsid().map_err(|err| { tracing::error!(?err, "failed to setsid to create a session"); @@ -652,13 +653,11 @@ pub fn container_init_process( })?; // listing on the notify socket for container start command - args.notify_socket - .wait_for_container_start() - .map_err(|err| { - tracing::error!(?err, "failed to wait for container start"); - err - })?; - args.notify_socket.close().map_err(|err| { + notify_listener.wait_for_container_start().map_err(|err| { + tracing::error!(?err, "failed to wait for container start"); + err + })?; + notify_listener.close().map_err(|err| { tracing::error!(?err, "failed to close notify socket"); err })?; diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index b8ab33d4c..1d8b8918d 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -43,6 +43,8 @@ pub fn container_intermediate_process( let spec = &args.spec; let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; + let cgroup_manager = + libcgroups::common::create_cgroup_manager(args.cgroup_config.to_owned()).unwrap(); // this needs to be done before we create the init process, so that the init // process will already be captured by the cgroup. It also needs to be done @@ -55,7 +57,7 @@ pub fn container_intermediate_process( // the cgroup of the process will form the root of the cgroup hierarchy in // the cgroup namespace. apply_cgroups( - &args.cgroup_manager, + &cgroup_manager, linux.resources().as_ref(), matches!(args.container_type, ContainerType::InitContainer), )?; diff --git a/crates/youki/src/commands/mod.rs b/crates/youki/src/commands/mod.rs index 0c763c529..d99a0ec57 100644 --- a/crates/youki/src/commands/mod.rs +++ b/crates/youki/src/commands/mod.rs @@ -59,12 +59,11 @@ fn create_cgroup_manager>( container_id: &str, ) -> Result { let container = load_container(root_path, container_id)?; - let cgroups_path = container.spec()?.cgroup_path; - let systemd_cgroup = container.systemd(); - Ok(libcgroups::common::create_cgroup_manager( - cgroups_path, - systemd_cgroup, - container.id(), + libcgroups::common::CgroupConfig { + cgroup_path: container.spec()?.cgroup_path, + systemd_cgroup: container.systemd(), + container_name: container.id().to_string(), + }, )?) }