diff --git a/docs/doc-draft.md b/docs/doc-draft.md index 72eb37f2a..a23e6b2ba 100644 --- a/docs/doc-draft.md +++ b/docs/doc-draft.md @@ -7,6 +7,10 @@ These are references to various documentations and specifications, which can be - [OCI runtime specification] : The specification for a container runtime. Any OCI complaisant runtime must follow this. - [runc man pages] : has information on various commandline options supported by runc, can be used to understand commands and their options. - [cgroups man page](https://man7.org/linux/man-pages/man7/cgroups.7.html) : contains information about cgroups, their creation, deletion etc. +- [pseudoterminal man page](https://man7.org/linux/man-pages/man7/pty.7.html) : Information about the pseudoterminal system, useful to understand the console_socket parameter of the create subcommand +- [Unix Sockets man page](https://man7.org/linux/man-pages/man7/unix.7.html) : Useful to understand sockets +- [prctl man page](https://man7.org/linux/man-pages/man2/prctl.2.html) : Process control man pages +- [OCI Linux spec](https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md) : Linux specific section of OCI Spec --- @@ -49,5 +53,12 @@ On invoking Youki, main function parses args passed to it, which contains direct From there it matches subcommand arg with possible subcommand and takes appropriate actions, such as creating a new container, deleting a container erc. +### create container + +One thing to note is that in the end, a container is just another process in Linux. It has its own control groups and namespaces, through which the program executing in it can be given the impression that it is running on a complete system; but on the system on which it is running, it is just another process, and has attributes such as pid, file descriptors, etc. associated with it like any other process. + +When given the create command, Youki will load the specification, configuration, sockets etc. 
+It then forks the process into a parent and a child (C1), forks the child process again (C2), applies the limits, namespaces etc. to the child of the child (C2), and runs the command/program in C2. After the command/program is finished, C2 returns. C1 waits for C2 to exit, after which it also exits. + [oci runtime specification]: https://github.com/opencontainers/runtime-spec/blob/master/runtime.md [runc man pages]: (https://github.com/opencontainers/runc/blob/master/man/runc.8.md) diff --git a/src/create.rs b/src/create.rs index 5cd967c52..4719e303f 100644 --- a/src/create.rs +++ b/src/create.rs @@ -1,3 +1,4 @@ +//! This handles the creation of a new container use std::fs; use std::path::{Path, PathBuf}; use std::process; @@ -19,19 +20,34 @@ use crate::tty; use crate::utils; use crate::{capabilities, command::Command}; +/// This is the main structure which stores various commandline options given by +/// high-level container runtime #[derive(Clap, Debug)] pub struct Create { + /// File to write pid of the container created + // note that in the end, container is just another process #[clap(short, long)] pid_file: Option, + /// path to the bundle directory, containing config.json and root filesystem #[clap(short, long, default_value = ".")] bundle: PathBuf, + /// Unix socket (file) path , which will receive file descriptor of the writing end of the pseudoterminal #[clap(short, long)] console_socket: Option, + /// name of the container instance to be started pub container_id: String, } +// One thing to note is that in the end, a container is just another process in Linux. +// It has its own control groups and namespaces, using which the program executing in it +// can be given the impression that it is running on a complete system, but on the system on which +// it is running, it is just another process, and has attributes such as pid, file descriptors, etc. +// associated with it like any other process. 
impl Create { + /// Starts a new container process pub fn exec(&self, root_path: PathBuf, command: impl Command) -> Result<()> { + // create a directory for the container to store state etc. + // if already present, return error let bundle_canonicalized = fs::canonicalize(&self.bundle) .unwrap_or_else(|_| panic!("failed to canonicalied {:?}", &self.bundle)); let container_dir = root_path.join(&self.container_id); @@ -41,16 +57,21 @@ impl Create { bail!("{} already exists", self.container_id) } + // change directory to the bundle directory, and load configuration, + // copy that to the container's directory unistd::chdir(&self.bundle)?; let spec = oci_spec::Spec::load("config.json")?; fs::copy("config.json", container_dir.join("config.json"))?; log::debug!("spec: {:?}", spec); + // convert path to absolute path, as relative path will be evaluated + // relative to where youki command is executed, and will be difficult to manipulate let container_dir = fs::canonicalize(container_dir)?; unistd::chdir(&*container_dir)?; log::debug!("{:?}", &container_dir); + let container = Container::new( &self.container_id, ContainerStatus::Creating, @@ -61,9 +82,10 @@ impl Create { container.save()?; let mut notify_socket: NotifyListener = NotifyListener::new(&container_dir)?; - + // convert path of root file system of the container to absolute path let rootfs = fs::canonicalize(&spec.root.path)?; - + // if socket file path is given in commandline options, + // get file descriptors of console and console socket let (csocketfd, _consolefd) = { if let Some(console_socket) = &self.console_socket { let (csocketfd, consolefd) = @@ -83,13 +105,16 @@ impl Create { container, command, )?; + // run_container forks the process, so after it returns, if we are in the + // parent process, exit, as the work of creating the container is done if let Process::Parent(_) = process { process::exit(0); } + // if in the child process after fork, then just return Ok(()) } } - +/// Fork the process and actually 
start the container process fn run_container>( pid_file: Option

, notify_socket: &mut NotifyListener, @@ -99,13 +124,18 @@ fn run_container>( container: Container, command: impl Command, ) -> Result { + // disable core dump for the process, check https://man7.org/linux/man-pages/man2/prctl.2.html for more information prctl::set_dumpable(false).unwrap(); + + // get Linux specific section of OCI spec, + // refer https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md for more information let linux = spec.linux.as_ref().unwrap(); let namespaces: Namespaces = linux.namespaces.clone().into(); let cgroups_path = utils::get_cgroup_path(&linux.cgroups_path, container.id()); let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path)?; + // first fork, which creates process, which will later create actual container process match fork::fork_first( pid_file, namespaces @@ -115,8 +145,11 @@ fn run_container>( &container, cmanager, )? { + // In the parent process, which called run_container Process::Parent(parent) => Ok(Process::Parent(parent)), + // in child process Process::Child(child) => { + // set limits and namespaces to the process for rlimit in spec.process.rlimits.iter() { command.set_rlimit(rlimit)? } @@ -125,22 +158,29 @@ fn run_container>( let without = sched::CloneFlags::CLONE_NEWUSER; namespaces.apply_unshare(without)?; + // set up tty if specified if let Some(csocketfd) = csocketfd { tty::ready(csocketfd)?; } + // set namespaces namespaces.apply_setns()?; + // fork second time, which will later create container match fork::fork_init(child)? 
{ - Process::Child(child) => Ok(Process::Child(child)), + Process::Child(_child) => unreachable!(), + // This is actually the child process after fork Process::Init(mut init) => { + // setup args and env vars as in the spec let spec_args: &Vec = &spec.process.args.clone(); let envs: &Vec = &spec.process.env.clone(); + // prepare process init_process(spec, command, rootfs, namespaces)?; init.ready()?; notify_socket.wait_for_container_start()?; - + // actually run the command / program to be run in container utils::do_exec(&spec_args[0], spec_args, envs)?; + // the command / program is done executing container.update_status(ContainerStatus::Stopped)?.save()?; Ok(Process::Init(init)) @@ -152,6 +192,7 @@ fn run_container>( } } +/// setup hostname, rootfs for the container process fn init_process( spec: oci_spec::Spec, command: impl Command, @@ -173,6 +214,7 @@ fn init_process( .contains(sched::CloneFlags::CLONE_NEWUSER), )?; + // change the root of filesystem of the process to the rootfs command.pivot_rootfs(&rootfs)?; command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?;