diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 68e805cd2..a4c40fe7f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,30 +7,68 @@ on: - main jobs: + rustfmt: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/cache@v2 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + - run: rustup component add rustfmt + - name: Check formatting + run: cargo fmt --all -- --check tests: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + - uses: actions/cache@v2 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - run: rustup component add clippy + - run: sudo apt-get -y update + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev - uses: actions-rs/clippy-check@v1 with: token: ${{ secrets.GITHUB_TOKEN }} args: --all-features - - run: cargo install cargo-when - name: Build run: ./build.sh - name: Run tests run: cargo test + - name: Run doc tests + run: cargo test --doc integration_tests: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 with: submodules: recursive + - uses: actions/cache@v2 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - uses: actions-rs/toolchain@v1 with: toolchain: stable - - run: cargo install cargo-when + - run: sudo apt-get -y update + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev - name: Build run: ./build.sh - uses: actions/setup-go@v2 diff --git a/Cargo.lock b/Cargo.lock index 8578fa343..8d338e067 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,17 +23,6 @@ version = "1.0.40" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b" -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] - [[package]] name = "autocfg" version = "1.0.1" @@ -46,6 +35,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +[[package]] +name = "build-env" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1522ac6ee801a11bf9ef3f80403f4ede6eb41291fac3dde3de09989679305f25" + [[package]] name = "byteorder" version = "1.4.3" @@ -69,6 +64,12 @@ version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a72c244c1ff497a746a7e1fb3d14bd08420ecda70c8f25c7112f2781652d787" +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + [[package]] name = "cfg-if" version = "1.0.0" @@ -84,6 +85,7 @@ dependencies = [ "libc", "num-integer", "num-traits", + "serde", "time", "winapi", ] @@ -94,14 +96,12 @@ version = "3.0.0-beta.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4bd1061998a501ee7d4b6d449020df3266ca3124b941ec56cf2005c3779ca142" dependencies = [ - "atty", "bitflags", "clap_derive", "indexmap", "lazy_static", "os_str_bytes", "strsim", - "termcolor", "textwrap", "unicode-width", "vec_map", @@ -126,7 +126,27 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", +] + +[[package]] +name = 
"cstr-argument" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20bd4e8067c20c7c3a4dea759ef91d4b18418ddb5bd8837ef6e2f2f93ca7ccbb" +dependencies = [ + "cfg-if 0.1.10", + "memchr", +] + +[[package]] +name = "dbus" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f597e08dfa79b593f23bbfc7840b23b2c5aa2e3a98d8e68b67b5b9ff800dc0db" +dependencies = [ + "libc", + "libdbus-sys", ] [[package]] @@ -166,12 +186,39 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "crc32fast", "libc", "miniz_oxide", ] +[[package]] +name = "foreign-types" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" +dependencies = [ + "foreign-types-macros", + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-macros" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63f713f8b2aa9e24fec85b0e290c56caee12e3b6ae0aeeda238a75b28251afd6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "foreign-types-shared" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7684cf33bb7f28497939e8c7cf17e3e4e3b8d9a0080ffa4f8ae2f515442ee855" + [[package]] name = "futures" version = "0.3.15" @@ -279,7 +326,7 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "libc", "wasi", ] @@ -324,6 +371,15 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "instant" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"61124eeebbd69b8190558df225adf7e4caafce0d743919e5d6b19652314ec5ec" +dependencies = [ + "cfg-if 1.0.0", +] + [[package]] name = "itoa" version = "0.4.7" @@ -342,13 +398,42 @@ version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36" +[[package]] +name = "libdbus-sys" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc12a3bc971424edbbf7edaf6e5740483444db63aa8e23d3751ff12a30f306f0" +dependencies = [ + "pkg-config", +] + +[[package]] +name = "libsystemd-sys" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e03fd580bcecda68dcdcd5297085ade6a3dc552cd8b030d2b94a9b089ef7ab8" +dependencies = [ + "build-env", + "libc", + "pkg-config", +] + +[[package]] +name = "lock_api" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb" +dependencies = [ + "scopeguard", +] + [[package]] name = "log" version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -406,7 +491,7 @@ checksum = "b2ccba0cfe4fdf15982d1674c69b1fd80bad427d293849982668dfe454bd61f2" dependencies = [ "bitflags", "cc", - "cfg-if", + "cfg-if 1.0.0", "libc", ] @@ -418,7 +503,7 @@ checksum = "5c3728fec49d363a50a8828a190b379a446cc5cf085c06259bbbeb34447e4ec7" dependencies = [ "bitflags", "cc", - "cfg-if", + "cfg-if 1.0.0", "libc", "memoffset", ] @@ -485,6 +570,31 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afb2e1c3ee07430c2cf76151675e583e0f19985fa6efae47d6848a3e2c824f85" +[[package]] +name = "parking_lot" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6d7744ac029df22dca6284efe4e898991d28e3085c706c972bcd7da4a27a15eb" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018" +dependencies = [ + "cfg-if 1.0.0", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + [[package]] name = "pin-project-lite" version = "0.2.6" @@ -497,6 +607,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" + [[package]] name = "prctl" version = "1.0.0" @@ -605,6 +721,15 @@ dependencies = [ "getrandom", ] +[[package]] +name = "redox_syscall" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ab49abadf3f9e1c4bc499e8845e152ad87d2ad2d30371841171169e9d75feee" +dependencies = [ + "bitflags", +] + [[package]] name = "regex" version = "1.5.4" @@ -628,6 +753,12 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + [[package]] name = "serde" version = "1.0.126" @@ -659,12 +790,40 @@ dependencies = [ "serde", ] +[[package]] +name = "serial_test" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0bccbcf40c8938196944a3da0e133e031a33f4d6b72db3bda3cc556e361905d" +dependencies = [ + "lazy_static", + 
"parking_lot", + "serial_test_derive", +] + +[[package]] +name = "serial_test_derive" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2acd6defeddb41eb60bb468f8825d0cfd0c2a76bc03bfd235b6a1dc4f6a1ad5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "slab" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527" +[[package]] +name = "smallvec" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" + [[package]] name = "strsim" version = "0.10.0" @@ -683,12 +842,27 @@ dependencies = [ ] [[package]] -name = "termcolor" -version = "1.1.2" +name = "systemd" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f722cabda922e471742300045f56dbaa53fafbb4520fca304e51258019bfe91d" +dependencies = [ + "cstr-argument", + "foreign-types", + "libc", + "libsystemd-sys", + "log", + "memchr", + "utf8-cstr", +] + +[[package]] +name = "tabwriter" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +checksum = "36205cfc997faadcc4b0b87aaef3fbedafe20d38d4959a7ca6ff803564051111" dependencies = [ - "winapi-util", + "unicode-width", ] [[package]] @@ -748,6 +922,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +[[package]] +name = "utf8-cstr" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55bcbb425141152b10d5693095950b51c3745d019363fc2929ffd8f61449b628" + [[package]] name = "vec_map" version = "0.8.2" @@ -782,15 +962,6 @@ version = "0.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -805,6 +976,7 @@ dependencies = [ "caps", "chrono", "clap", + "dbus", "futures", "libc", "log", @@ -815,7 +987,9 @@ dependencies = [ "prctl", "procfs", "quickcheck", - "regex", "serde", "serde_json", + "serial_test", + "systemd", + "tabwriter", ] diff --git a/Cargo.toml b/Cargo.toml index 6ed6f9b10..f734a094b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,8 +5,16 @@ authors = ["utam0k "] edition = "2018" description = "A container runtime written in Rust" +[features] +default = ["systemd_cgroups"] +systemd_cgroups = ["systemd"] + +[dependencies.clap] +version = "3.0.0-beta.2" +default-features = false +features = ["std", "suggestions", "derive"] + [dependencies] -clap = "3.0.0-beta.2" nix = "0.19.1" procfs = "0.9.1" caps = "0.5.1" @@ -17,12 +25,18 @@ libc = "0.2.84" log = "0.4" anyhow = "1.0" mio = { version = "0.7", features = ["os-ext", "os-poll"] } -chrono = "0.4" +chrono = { version="0.4", features = ["serde"] } once_cell = "1.6.0" futures = { version = "0.3", features = ["thread-pool"] } -regex = "1.5" oci_spec = { version = "0.1.0", path = "./oci_spec" } +systemd = { version = "0.8", default-features = false, optional = true } +dbus = "0.9.2" +tabwriter = "1" [dev-dependencies] oci_spec = { version = "0.1.0", path = "./oci_spec", features = ["proptests"] } quickcheck = "1" +serial_test = "0.5.1" + +[profile.release] +lto = true diff --git a/README.md b/README.md index 374219f7a..89dc3ba8a 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,21 @@ Here is why I am rewriting a new container runtime in Rust. 
youki is not at the practical stage yet. However, it is getting closer to practical use, running with docker and passing all the default tests provided by [opencontainers/runtime-tools](https://github.com/opencontainers/runtime-tools). ![youki demo](docs/demo.gif) +## Features + +- [x] run with docker +- [ ] run with podman(WIP on [#24](https://github.com/containers/youki/issues/24)) +- [x] pivot root +- [x] mount devices +- [x] namespaces +- [x] capabilities +- [x] rlimits +- [ ] cgroups v1(WIP on [#9](https://github.com/containers/youki/issues/9)) +- [ ] cgroups v2(WIP on [#78](https://github.com/containers/youki/issues/78)) +- [ ] seccomp(WIP on [#25](https://github.com/containers/youki/issues/25)) +- [ ] hooks(WIP on [#13](https://github.com/containers/youki/issues/13)) +- [ ] rootless(WIP on [#77](https://github.com/containers/youki/issues/77)) + # Getting Started Local build is only supported on linux. @@ -40,12 +55,28 @@ For other platforms, please use the devcontainer that we prepared. 
- Rust(See [here](https://www.rust-lang.org/tools/install)) - Docker(See [here](https://docs.docker.com/engine/install)) -## Building +## Dependencies + +### Debian, Ubuntu and related distributions + +```sh +$ sudo apt-get install \ + pkg-config \ + libsystemd-dev \ + libdbus-glib-1-dev +``` + +### Fedora, Centos, RHEL and related distributions ```sh -$ cargo install cargo-when # installs prerequisite for building youki +$ sudo dnf install \ + pkg-config \ + systemd-devel \ + dbus-devel ``` +## Build + ```sh $ git clone git@github.com:containers/youki.git $ cd youki @@ -113,21 +144,6 @@ We also have an active [Discord](https://discord.gg/h7R3HgWUct) if you'd like to TBD(WIP on [#14](https://github.com/containers/youki/issues/14)) -# Features - -- [x] run with docker -- [ ] run with podman -- [x] pivot root -- [x] mount devices -- [x] namespaces -- [x] capabilities -- [x] rlimits -- [ ] cgroups v1(WIP on [#9](https://github.com/containers/youki/issues/9)) -- [ ] cgroups v2 -- [ ] seccomp -- [ ] hooks(WIP on [#13](https://github.com/containers/youki/issues/13)) -- [ ] rootless - # Contribution This project welcomes your PR and issues. diff --git a/build.sh b/build.sh index d8de37c30..1beaa739a 100755 --- a/build.sh +++ b/build.sh @@ -8,8 +8,7 @@ VERSION=debug if [[ "$1" == "--release" ]]; then VERSION=release fi -cargo when --channel=stable build --verbose $TGT $1 && \ -cargo when --channel=beta build --verbose $TGT $1 && \ -cargo when --channel=nightly build --verbose --features nightly $TGT $1 && \ + +cargo build --verbose $TGT $1 rm -f youki cp target/$TARGET/$VERSION/youki . diff --git a/docs/doc-draft.md b/docs/doc-draft.md index 4abe5b428..c8dfd7d40 100644 --- a/docs/doc-draft.md +++ b/docs/doc-draft.md @@ -92,3 +92,16 @@ This also provides implementation for Linux syscalls for the trait. 
[oci runtime specification]: https://github.com/opencontainers/runtime-spec/blob/master/runtime.md [runc man pages]: (https://github.com/opencontainers/runc/blob/master/man/runc.8.md) + +## Capabilities + +This has functions for setting and resetting specific capabilities, as well as for dropping extra privileges. + +- [Simple explanation of capabilities](https://blog.container-solutions.com/linux-capabilities-in-practice) +- [man page for capabilities](https://man7.org/linux/man-pages/man7/capabilities.7.html) + +## Info + +This is primarily for printing info about the system running youki, such as OS release, architecture, CPU info, cgroups info, etc., as this info can be helpful when reporting issues. + +- [about /etc/os-release](https://www.freedesktop.org/software/systemd/man/os-release.html) diff --git a/integration_test.sh b/integration_test.sh index ef16abe82..dac099cdf 100755 --- a/integration_test.sh +++ b/integration_test.sh @@ -9,6 +9,8 @@ test_cases=("default/default.t" "linux_cgroups_devices/linux_cgroups_devices.t" "linux_cgroups_relative_devices/linux_cgroups_relative_devices.t" "linux_cgroups_relative_hugetlb/linux_cgroups_relative_hugetlb.t" "linux_cgroups_relative_memory/linux_cgroups_relative_memory.t" "linux_cgroups_relative_network/linux_cgroups_relative_network.t" "linux_cgroups_relative_pids/linux_cgroups_relative_pids.t" "create/create.t" "kill/kill.t" "delete/delete.t" "state/state.t") +# Record the tests that runc also fails to pass below; maybe we will fix this in the original integration tests, issue: https://github.com/containers/youki/issues/56 +# no_pass_test_case=("start/start.t") for case in "${test_cases[@]}"; do echo "Running $case" if [ 0 -ne $(sudo RUST_BACKTRACE=1 YOUKI_LOG_LEVEL=debug RUNTIME=$root/youki $root/integration_test/src/github.com/opencontainers/runtime-tools/validation/$case | grep "not ok" | wc -l) ]; then diff --git a/oci_spec/src/lib.rs b/oci_spec/src/lib.rs index 4fbc56371..c3a9d68e5 100644 --- a/oci_spec/src/lib.rs +++ 
b/oci_spec/src/lib.rs @@ -1,9 +1,9 @@ use nix::sys::stat::SFlag; use std::collections::HashMap; use std::fs::File; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; -use anyhow::{bail, Result}; +use anyhow::{bail, Context, Result}; use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Debug, Clone)] @@ -460,6 +460,7 @@ pub struct LinuxResources { #[serde(default)] pub hugepage_limits: Vec, pub network: Option, + pub freezer: Option, } #[derive(Serialize, Deserialize, Debug, Clone, Copy)] @@ -556,12 +557,19 @@ pub enum LinuxSeccompOperator { ScmpCmpMaskedEq = 7, } +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq)] +pub enum FreezerState { + Undefined, + Frozen, + Thawed, +} + #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "camelCase")] pub struct Linux { - #[serde(default, rename = "LinuxIDMapping")] + #[serde(default)] pub uid_mappings: Vec, - #[serde(default, rename = "LinuxIDMapping")] + #[serde(default)] pub gid_mappings: Vec, #[serde(default)] pub sysctl: HashMap, @@ -599,13 +607,19 @@ pub struct Spec { } impl Spec { - pub fn load(path: &str) -> Result { - let file = File::open(path)?; - let mut spec: Spec = serde_json::from_reader(&file)?; - // FIME: It is fail if the caller isn't in the correct directory. 
- spec.root.path = std::fs::canonicalize(spec.root.path)?; + pub fn load>(path: P) -> Result { + let path = path.as_ref(); + let file = + File::open(path).with_context(|| format!("load spec: failed to open {:?}", path))?; + let spec: Spec = serde_json::from_reader(&file)?; Ok(spec) } + + pub fn canonicalize_rootfs(&mut self) -> Result<()> { + self.root.path = std::fs::canonicalize(&self.root.path) + .with_context(|| format!("failed to canonicalize {:?}", self.root.path))?; + Ok(()) + } } #[cfg(feature = "proptests")] diff --git a/src/capabilities.rs b/src/capabilities.rs index cc35aacfe..051c882be 100644 --- a/src/capabilities.rs +++ b/src/capabilities.rs @@ -1,9 +1,11 @@ -use crate::command::Command; +//! Handles management of capabilities +use crate::command::Syscall; use caps::*; use anyhow::Result; use oci_spec::{LinuxCapabilities, LinuxCapabilityType}; +/// Converts a list of capability types to a capabilities hash set fn to_set(caps: &[LinuxCapabilityType]) -> CapsHashSet { let mut capabilities = CapsHashSet::new(); for c in caps { @@ -12,30 +14,26 @@ fn to_set(caps: &[LinuxCapabilityType]) -> CapsHashSet { capabilities } -pub fn reset_effective(command: &impl Command) -> Result<()> { +/// Resets the effective capability set of the calling process to all capabilities. +/// The effective capability set is the set of capabilities used by the kernel to perform permission checks; +/// see https://man7.org/linux/man-pages/man7/capabilities.7.html for more information +pub fn reset_effective(syscall: &impl Syscall) -> Result<()> { log::debug!("reset all caps"); - command.set_capability(CapSet::Effective, &caps::all())?; + syscall.set_capability(CapSet::Effective, &caps::all())?; Ok(()) } -pub fn drop_privileges(cs: &LinuxCapabilities, command: &impl Command) -> Result<()> { - let all = caps::all(); +/// Drops any extra granted capabilities and resets each capability set to the one given in the OCI specification +pub fn drop_privileges(cs: &LinuxCapabilities, syscall: &impl Syscall) -> Result<()> { log::debug!("dropping 
bounding capabilities to {:?}", cs.bounding); - for c in all.difference(&to_set(&cs.bounding)) { - match c { - Capability::CAP_PERFMON | Capability::CAP_CHECKPOINT_RESTORE | Capability::CAP_BPF => { - log::warn!("{:?} doesn't support.", c); - continue; - } - _ => caps::drop(None, CapSet::Bounding, *c)?, - } - } + syscall.set_capability(CapSet::Bounding, &to_set(&cs.bounding))?; - command.set_capability(CapSet::Effective, &to_set(&cs.effective))?; - command.set_capability(CapSet::Permitted, &to_set(&cs.permitted))?; - command.set_capability(CapSet::Inheritable, &to_set(&cs.inheritable))?; + syscall.set_capability(CapSet::Effective, &to_set(&cs.effective))?; + syscall.set_capability(CapSet::Permitted, &to_set(&cs.permitted))?; + syscall.set_capability(CapSet::Inheritable, &to_set(&cs.inheritable))?; - if let Err(e) = command.set_capability(CapSet::Ambient, &to_set(&cs.ambient)) { + // check specifically for ambient, as those might not always be available + if let Err(e) = syscall.set_capability(CapSet::Ambient, &to_set(&cs.ambient)) { log::error!("failed to set ambient capabilities: {}", e); } Ok(()) @@ -44,11 +42,11 @@ pub fn drop_privileges(cs: &LinuxCapabilities, command: &impl Command) -> Result #[cfg(test)] mod tests { use super::*; - use crate::command::test::TestHelperCommand; + use crate::command::test::TestHelperSyscall; #[test] fn test_reset_effective() { - let test_command = TestHelperCommand::default(); + let test_command = TestHelperSyscall::default(); assert!(reset_effective(&test_command).is_ok()); let set_capability_args: Vec<_> = test_command .get_set_capability_args() diff --git a/src/cgroups/common.rs b/src/cgroups/common.rs index 56f3b1a57..6e35813eb 100644 --- a/src/cgroups/common.rs +++ b/src/cgroups/common.rs @@ -6,11 +6,16 @@ use std::{ path::{Path, PathBuf}, }; - use anyhow::{bail, Context, Result}; use nix::unistd::Pid; use oci_spec::LinuxResources; use procfs::process::Process; +#[cfg(feature = "systemd_cgroups")] +use 
systemd::daemon::booted; +#[cfg(not(feature = "systemd_cgroups"))] +fn booted() -> Result { + bail!("This build does not include the systemd cgroups feature") +} use crate::cgroups::v1; use crate::cgroups::v2; @@ -19,7 +24,11 @@ pub const CGROUP_PROCS: &str = "cgroup.procs"; pub const DEFAULT_CGROUP_ROOT: &str = "/sys/fs/cgroup"; pub trait CgroupManager { - fn apply(&self, linux_resources: &LinuxResources, pid: Pid) -> Result<()>; + /// Adds a task specified by its pid to the cgroup + fn add_task(&self, pid: Pid) -> Result<()>; + /// Applies resource restrictions to the cgroup + fn apply(&self, linux_resources: &LinuxResources) -> Result<()>; + /// Removes the cgroup fn remove(&self) -> Result<()>; } @@ -91,7 +100,10 @@ pub fn get_supported_cgroup_fs() -> Result> { Ok(cgroups) } -pub fn create_cgroup_manager>(cgroup_path: P) -> Result> { +pub fn create_cgroup_manager>( + cgroup_path: P, + systemd_cgroup: bool, +) -> Result> { let cgroup_mount = Process::myself()? .mountinfo()? .into_iter() @@ -109,6 +121,16 @@ pub fn create_cgroup_manager>(cgroup_path: P) -> Result { log::info!("cgroup manager V2 will be used"); + if systemd_cgroup { + if !booted()? { + bail!("systemd cgroup flag passed, but systemd support for managing cgroups is not available"); + } + log::info!("systemd cgroup manager will be used"); + return Ok(Box::new(v2::SystemDCGroupManager::new( + cgroup2.mount_point, + cgroup_path.into(), + )?)); + } Ok(Box::new(v2::manager::Manager::new( cgroup2.mount_point, cgroup_path.into(), @@ -119,6 +141,16 @@ pub fn create_cgroup_manager>(cgroup_path: P) -> Result { log::info!("cgroup manager V2 will be used"); + if systemd_cgroup { + if !booted()? 
{ + bail!("systemd cgroup flag passed, but systemd support for managing cgroups is not available"); + } + log::info!("systemd cgroup manager will be used"); + return Ok(Box::new(v2::SystemDCGroupManager::new( + cgroup2.mount_point, + cgroup_path.into(), + )?)); + } Ok(Box::new(v2::manager::Manager::new( cgroup2.mount_point, cgroup_path.into(), diff --git a/src/cgroups/test.rs b/src/cgroups/test.rs index 065986deb..57ad71ef8 100644 --- a/src/cgroups/test.rs +++ b/src/cgroups/test.rs @@ -2,58 +2,13 @@ use anyhow::Result; use std::{ - fs, io::Write, - ops::Deref, path::{Path, PathBuf}, }; use oci_spec::LinuxCpu; -pub struct TempDir { - path: Option, -} - -impl TempDir { - pub fn new>(path: P) -> Result { - let p = path.into(); - std::fs::create_dir_all(&p)?; - Ok(Self { path: Some(p) }) - } - - pub fn path(&self) -> &Path { - self.path - .as_ref() - .expect("temp dir has already been removed") - } - - pub fn remove(&mut self) { - if let Some(p) = &self.path { - let _ = fs::remove_dir_all(p); - self.path = None; - } - } -} - -impl Drop for TempDir { - fn drop(&mut self) { - self.remove(); - } -} - -impl AsRef for TempDir { - fn as_ref(&self) -> &Path { - self.path() - } -} - -impl Deref for TempDir { - type Target = Path; - - fn deref(&self) -> &Self::Target { - self.path() - } -} +use crate::utils::{create_temp_dir, TempDir}; pub fn setup(testname: &str, cgroup_file: &str) -> (TempDir, PathBuf) { let tmp = create_temp_dir(testname).expect("create temp directory for test"); @@ -76,11 +31,6 @@ pub fn set_fixture(temp_dir: &Path, filename: &str, val: &str) -> Result Result { - let dir = TempDir::new(std::env::temp_dir().join(test_name))?; - Ok(dir) -} - pub struct LinuxCpuBuilder { resource: LinuxCpu, } diff --git a/src/cgroups/v1/blkio.rs b/src/cgroups/v1/blkio.rs index 2801eb2b7..3480084c1 100644 --- a/src/cgroups/v1/blkio.rs +++ b/src/cgroups/v1/blkio.rs @@ -1,12 +1,7 @@ -use std::{ - fs::{self}, - path::Path, -}; - -use crate::cgroups::{ - common::{self, 
CGROUP_PROCS}, - v1::Controller, -}; +use std::path::Path; + +use crate::cgroups::{common, v1::Controller}; +use anyhow::Result; use oci_spec::{LinuxBlockIo, LinuxResources}; const CGROUP_BLKIO_THROTTLE_READ_BPS: &str = "blkio.throttle.read_bps_device"; @@ -17,25 +12,29 @@ const CGROUP_BLKIO_THROTTLE_WRITE_IOPS: &str = "blkio.throttle.write_iops_device pub struct Blkio {} impl Controller for Blkio { - fn apply( - linux_resources: &LinuxResources, - cgroup_root: &Path, - pid: nix::unistd::Pid, - ) -> anyhow::Result<()> { + type Resource = LinuxBlockIo; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { log::debug!("Apply blkio cgroup config"); - fs::create_dir_all(cgroup_root)?; - if let Some(blkio) = &linux_resources.block_io { + if let Some(blkio) = Self::needs_to_handle(linux_resources) { Self::apply(cgroup_root, blkio)?; } - common::write_cgroup_file(cgroup_root.join(CGROUP_PROCS), pid)?; Ok(()) } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(blkio) = &linux_resources.block_io { + return Some(blkio); + } + + None + } } impl Blkio { - fn apply(root_path: &Path, blkio: &LinuxBlockIo) -> anyhow::Result<()> { + fn apply(root_path: &Path, blkio: &LinuxBlockIo) -> Result<()> { for trbd in &blkio.blkio_throttle_read_bps_device { common::write_cgroup_file_str( &root_path.join(CGROUP_BLKIO_THROTTLE_READ_BPS), @@ -70,6 +69,8 @@ impl Blkio { #[cfg(test)] mod tests { + use std::fs; + use super::*; use crate::cgroups::test::setup; use oci_spec::{LinuxBlockIo, LinuxThrottleDevice}; diff --git a/src/cgroups/v1/controller.rs b/src/cgroups/v1/controller.rs index 84e0b3cc2..9aaa8fcae 100644 --- a/src/cgroups/v1/controller.rs +++ b/src/cgroups/v1/controller.rs @@ -1,10 +1,25 @@ -use std::path::Path; +use std::{fs, path::Path}; use anyhow::Result; use nix::unistd::Pid; use oci_spec::LinuxResources; +use crate::cgroups::common::{self, CGROUP_PROCS}; + pub trait Controller { - fn 
apply(linux_resources: &LinuxResources, cgroup_root: &Path, pid: Pid) -> Result<()>; + type Resource; + + /// Adds a new task specified by its pid to the cgroup + fn add_task(pid: Pid, cgroup_path: &Path) -> Result<()> { + fs::create_dir_all(cgroup_path)?; + common::write_cgroup_file(cgroup_path.join(CGROUP_PROCS), pid)?; + Ok(()) + } + + /// Applies resource restrictions to the cgroup + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()>; + + /// Checks if the controller needs to handle this request + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource>; } diff --git a/src/cgroups/v1/controller_type.rs b/src/cgroups/v1/controller_type.rs index 68cb25572..6fbf0f37f 100644 --- a/src/cgroups/v1/controller_type.rs +++ b/src/cgroups/v1/controller_type.rs @@ -1,5 +1,6 @@ use std::fmt::Display; +#[derive(Hash, PartialEq, Eq, Debug, Clone)] pub enum ControllerType { Cpu, CpuAcct, @@ -11,6 +12,7 @@ pub enum ControllerType { Blkio, NetworkPriority, NetworkClassifier, + Freezer, } impl Display for ControllerType { @@ -26,6 +28,7 @@ impl Display for ControllerType { Self::Blkio => "blkio", Self::NetworkPriority => "net_prio", Self::NetworkClassifier => "net_cls", + Self::Freezer => "freezer", }; write!(f, "{}", print) @@ -43,4 +46,5 @@ pub const CONTROLLERS: &[ControllerType] = &[ ControllerType::Blkio, ControllerType::NetworkPriority, ControllerType::NetworkClassifier, + ControllerType::Freezer, ]; diff --git a/src/cgroups/v1/cpu.rs b/src/cgroups/v1/cpu.rs index 50a7c7eb5..006bc09cf 100644 --- a/src/cgroups/v1/cpu.rs +++ b/src/cgroups/v1/cpu.rs @@ -1,10 +1,9 @@ -use std::{fs, path::Path}; +use std::path::Path; use anyhow::Result; -use nix::unistd::Pid; use oci_spec::{LinuxCpu, LinuxResources}; -use crate::cgroups::common::{self, CGROUP_PROCS}; +use crate::cgroups::common; use super::Controller; @@ -17,16 +16,32 @@ const CGROUP_CPU_RT_PERIOD: &str = "cpu.rt_period_us"; pub struct Cpu {} impl Controller for Cpu { - fn 
apply(linux_resources: &LinuxResources, cgroup_root: &Path, pid: Pid) -> Result<()> { + type Resource = LinuxCpu; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { log::debug!("Apply Cpu cgroup config"); - fs::create_dir_all(cgroup_root)?; - if let Some(cpu) = &linux_resources.cpu { + + if let Some(cpu) = Self::needs_to_handle(linux_resources) { Self::apply(cgroup_root, cpu)?; } - common::write_cgroup_file(cgroup_root.join(CGROUP_PROCS), pid)?; Ok(()) } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(cpu) = &linux_resources.cpu { + if cpu.shares.is_some() + || cpu.period.is_some() + || cpu.quota.is_some() + || cpu.realtime_period.is_some() + || cpu.realtime_runtime.is_some() + { + return Some(cpu); + } + } + + None + } } impl Cpu { diff --git a/src/cgroups/v1/cpuacct.rs b/src/cgroups/v1/cpuacct.rs index 3060aebcf..2632847e2 100644 --- a/src/cgroups/v1/cpuacct.rs +++ b/src/cgroups/v1/cpuacct.rs @@ -1,21 +1,43 @@ -use std::{fs, path::Path}; +use std::path::Path; use anyhow::Result; -use nix::unistd::Pid; use oci_spec::LinuxResources; -use crate::cgroups::common::{self, CGROUP_PROCS}; - use super::Controller; pub struct CpuAcct {} impl Controller for CpuAcct { - fn apply(_linux_resources: &LinuxResources, cgroup_path: &Path, pid: Pid) -> Result<()> { - log::debug!("Apply cpuacct cgroup config"); - fs::create_dir_all(cgroup_path)?; + type Resource = (); - common::write_cgroup_file(cgroup_path.join(CGROUP_PROCS), pid)?; + fn apply(_linux_resources: &LinuxResources, _cgroup_path: &Path) -> Result<()> { Ok(()) } + + // apply never needs to be called, for accounting only + fn needs_to_handle(_linux_resources: &LinuxResources) -> Option<&Self::Resource> { + None + } +} + +#[cfg(test)] +mod tests { + use std::fs; + + use nix::unistd::Pid; + + use super::*; + use crate::cgroups::{common::CGROUP_PROCS, test::setup}; + + #[test] + fn test_add_task() { + let (tmp, procs) = 
setup("test_cpuacct_apply", CGROUP_PROCS); + let pid = Pid::from_raw(1000); + + CpuAcct::add_task(pid, &tmp).expect("apply cpuacct"); + + let content = fs::read_to_string(&procs) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_PROCS)); + assert_eq!(content, "1000"); + } } diff --git a/src/cgroups/v1/cpuset.rs b/src/cgroups/v1/cpuset.rs index c08833c59..b7d669311 100644 --- a/src/cgroups/v1/cpuset.rs +++ b/src/cgroups/v1/cpuset.rs @@ -1,8 +1,9 @@ use std::{fs, path::Path}; use anyhow::{bail, Result}; -use nix::unistd::Pid; +use nix::unistd; use oci_spec::{LinuxCpu, LinuxResources}; +use unistd::Pid; use crate::cgroups::common::{self, CGROUP_PROCS}; @@ -14,20 +15,37 @@ const CGROUP_CPUSET_MEMS: &str = "cpuset.mems"; pub struct CpuSet {} impl Controller for CpuSet { - fn apply(linux_resources: &LinuxResources, cgroup_path: &Path, pid: Pid) -> Result<()> { - log::debug!("Apply CpuSet cgroup config"); + type Resource = LinuxCpu; + + fn add_task(pid: Pid, cgroup_path: &Path) -> Result<()> { fs::create_dir_all(cgroup_path)?; Self::ensure_not_empty(cgroup_path, CGROUP_CPUSET_CPUS)?; Self::ensure_not_empty(cgroup_path, CGROUP_CPUSET_MEMS)?; - if let Some(cpuset) = &linux_resources.cpu { + common::write_cgroup_file(cgroup_path.join(CGROUP_PROCS), pid)?; + Ok(()) + } + + fn apply(linux_resources: &LinuxResources, cgroup_path: &Path) -> Result<()> { + log::debug!("Apply CpuSet cgroup config"); + + if let Some(cpuset) = Self::needs_to_handle(linux_resources) { Self::apply(cgroup_path, cpuset)?; } - common::write_cgroup_file(cgroup_path.join(CGROUP_PROCS), pid)?; Ok(()) } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(cpuset) = &linux_resources.cpu { + if cpuset.cpus.is_some() || cpuset.mems.is_some() { + return Some(cpuset); + } + } + + None + } } impl CpuSet { @@ -46,7 +64,7 @@ impl CpuSet { // if a task is moved into the cgroup and a value has not been set for cpus and mems // Errno 28 (no space left on device) 
will be returned. Therefore we set the value from the parent if required. fn ensure_not_empty(cgroup_path: &Path, interface_file: &str) -> Result<()> { - let mut current = util::get_subsystem_mount_points(&ControllerType::CpuSet.to_string())?; + let mut current = util::get_subsystem_mount_point(&ControllerType::CpuSet)?; let relative_cgroup_path = cgroup_path.strip_prefix(¤t)?; for component in relative_cgroup_path.components() { diff --git a/src/cgroups/v1/devices.rs b/src/cgroups/v1/devices.rs index ce49d2903..3e5f12705 100644 --- a/src/cgroups/v1/devices.rs +++ b/src/cgroups/v1/devices.rs @@ -1,18 +1,18 @@ -use std::{fs::create_dir_all, path::Path}; +use std::path::Path; use anyhow::Result; -use nix::unistd::Pid; -use crate::cgroups::common::{self, CGROUP_PROCS}; +use crate::cgroups::common; use crate::{cgroups::v1::Controller, rootfs::default_devices}; use oci_spec::{LinuxDeviceCgroup, LinuxDeviceType, LinuxResources}; pub struct Devices {} impl Controller for Devices { - fn apply(linux_resources: &LinuxResources, cgroup_root: &Path, pid: Pid) -> Result<()> { + type Resource = (); + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { log::debug!("Apply Devices cgroup config"); - create_dir_all(&cgroup_root)?; for d in &linux_resources.devices { Self::apply_device(d, cgroup_root)?; @@ -27,9 +27,13 @@ impl Controller for Devices { Self::apply_device(&d, &cgroup_root)?; } - common::write_cgroup_file(cgroup_root.join(CGROUP_PROCS), pid)?; Ok(()) } + + // always needs to be called due to default devices + fn needs_to_handle(_linux_resources: &LinuxResources) -> Option<&Self::Resource> { + Some(&()) + } } impl Devices { @@ -98,7 +102,8 @@ impl Devices { #[cfg(test)] mod tests { use super::*; - use crate::cgroups::test::{create_temp_dir, set_fixture}; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; use oci_spec::{LinuxDeviceCgroup, LinuxDeviceType}; use std::fs::read_to_string; diff --git 
a/src/cgroups/v1/freezer.rs b/src/cgroups/v1/freezer.rs new file mode 100644 index 000000000..4a4dd090c --- /dev/null +++ b/src/cgroups/v1/freezer.rs @@ -0,0 +1,258 @@ +use std::io::prelude::*; +use std::{ + fs::{create_dir_all, OpenOptions}, + path::Path, + thread, time, +}; + +use anyhow::{Result, *}; + +use crate::cgroups::common; +use crate::cgroups::v1::Controller; +use oci_spec::{FreezerState, LinuxResources}; + +const CGROUP_FREEZER_STATE: &str = "freezer.state"; +const FREEZER_STATE_THAWED: &str = "THAWED"; +const FREEZER_STATE_FROZEN: &str = "FROZEN"; +const FREEZER_STATE_FREEZING: &str = "FREEZING"; + +pub struct Freezer {} + +impl Controller for Freezer { + type Resource = FreezerState; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { + log::debug!("Apply Freezer cgroup config"); + create_dir_all(&cgroup_root)?; + + if let Some(freezer_state) = Self::needs_to_handle(linux_resources) { + Self::apply(freezer_state, cgroup_root)?; + } + + Ok(()) + } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(freezer_state) = &linux_resources.freezer { + return Some(freezer_state); + } + + None + } +} + +impl Freezer { + fn apply(freezer_state: &FreezerState, cgroup_root: &Path) -> Result<()> { + match freezer_state { + FreezerState::Undefined => {} + FreezerState::Thawed => { + common::write_cgroup_file( + cgroup_root.join(CGROUP_FREEZER_STATE), + FREEZER_STATE_THAWED, + )?; + } + FreezerState::Frozen => { + let r = || -> Result<()> { + // We should do our best to retry if FREEZING is seen until it becomes FROZEN. + // Add sleep between retries occasionally helped when system is extremely slow. 
+ // see: + // https://github.com/opencontainers/runc/blob/b9ee9c6314599f1b4a7f497e1f1f856fe433d3b7/libcontainer/cgroups/fs/freezer.go#L42 + for i in 0..1000 { + if i % 50 == 49 { + let _ = common::write_cgroup_file( + cgroup_root.join(CGROUP_FREEZER_STATE), + FREEZER_STATE_THAWED, + ); + thread::sleep(time::Duration::from_millis(10)); + } + + common::write_cgroup_file( + cgroup_root.join(CGROUP_FREEZER_STATE), + FREEZER_STATE_FROZEN, + )?; + + if i % 25 == 24 { + thread::sleep(time::Duration::from_millis(10)); + } + + let r = Self::read_freezer_state(cgroup_root)?; + match r.trim() { + FREEZER_STATE_FREEZING => { + continue; + } + FREEZER_STATE_FROZEN => { + if i > 1 { + log::debug!("frozen after {} retries", i) + } + return Ok(()); + } + _ => { + // should not reach here. + bail!("unexpected state {} while freezing", r.trim()); + } + } + } + bail!("unbale to freeze"); + }(); + + if r.is_err() { + // Freezing failed, and it is bad and dangerous to leave the cgroup in FROZEN or + // FREEZING, so try to thaw it back. + let _ = common::write_cgroup_file( + cgroup_root.join(CGROUP_FREEZER_STATE), + FREEZER_STATE_THAWED, + ); + } + return r; + } + } + Ok(()) + } + + fn read_freezer_state(cgroup_root: &Path) -> Result { + let path = cgroup_root.join(CGROUP_FREEZER_STATE); + let mut content = String::new(); + OpenOptions::new() + .create(false) + .read(true) + .open(path)? + .read_to_string(&mut content)?; + Ok(content) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::common::CGROUP_PROCS; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; + use nix::unistd::Pid; + use oci_spec::FreezerState; + + #[test] + fn test_set_freezer_state() { + let tmp = + create_temp_dir("test_set_freezer_state").expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_FREEZER_STATE, "").expect("Set fixure for freezer state"); + + // set Frozen state. 
+ { + let freezer_state = FreezerState::Frozen; + Freezer::apply(&freezer_state, &tmp).expect("Set freezer state"); + + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("Read to string"); + assert_eq!(FREEZER_STATE_FROZEN, state_content); + } + + // set Thawed state. + { + let freezer_state = FreezerState::Thawed; + Freezer::apply(&freezer_state, &tmp).expect("Set freezer state"); + + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("Read to string"); + assert_eq!(FREEZER_STATE_THAWED, state_content); + } + + // set Undefined state. + { + let old_state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("Read to string"); + let freezer_state = FreezerState::Undefined; + Freezer::apply(&freezer_state, &tmp).expect("Set freezer state"); + + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("Read to string"); + assert_eq!(old_state_content, state_content); + } + } + + #[test] + fn test_add_and_apply() { + let tmp = create_temp_dir("test_add_task").expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_FREEZER_STATE, "").expect("set fixure for freezer state"); + set_fixture(&tmp, CGROUP_PROCS, "").expect("set fixture for proc file"); + + // set Thawed state. 
+ { + let linux_resources = LinuxResources { + devices: vec![], + disable_oom_killer: false, + oom_score_adj: None, + memory: None, + cpu: None, + pids: None, + block_io: None, + hugepage_limits: vec![], + network: None, + freezer: Some(FreezerState::Thawed), + }; + + let pid = Pid::from_raw(1000); + Freezer::add_task(pid, &tmp).expect("freezer add task"); + ::apply(&linux_resources, &tmp).expect("freezer apply"); + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("read to string"); + assert_eq!(FREEZER_STATE_THAWED, state_content); + let pid_content = + std::fs::read_to_string(tmp.join(CGROUP_PROCS)).expect("read to string"); + assert_eq!(pid_content, "1000"); + } + + // set Frozen state. + { + let linux_resources = LinuxResources { + devices: vec![], + disable_oom_killer: false, + oom_score_adj: None, + memory: None, + cpu: None, + pids: None, + block_io: None, + hugepage_limits: vec![], + network: None, + freezer: Some(FreezerState::Frozen), + }; + + let pid = Pid::from_raw(1001); + Freezer::add_task(pid, &tmp).expect("freezer add task"); + ::apply(&linux_resources, &tmp).expect("freezer apply"); + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("read to string"); + assert_eq!(FREEZER_STATE_FROZEN, state_content); + let pid_content = + std::fs::read_to_string(tmp.join(CGROUP_PROCS)).expect("read to string"); + assert_eq!(pid_content, "1001"); + } + + // set Undefined state. 
+ { + let linux_resources = LinuxResources { + devices: vec![], + disable_oom_killer: false, + oom_score_adj: None, + memory: None, + cpu: None, + pids: None, + block_io: None, + hugepage_limits: vec![], + network: None, + freezer: Some(FreezerState::Undefined), + }; + + let pid = Pid::from_raw(1002); + let old_state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("read to string"); + Freezer::add_task(pid, &tmp).expect("freezer add task"); + ::apply(&linux_resources, &tmp).expect("freezer apply"); + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("read to string"); + assert_eq!(old_state_content, state_content); + let pid_content = + std::fs::read_to_string(tmp.join(CGROUP_PROCS)).expect("read to string"); + assert_eq!(pid_content, "1002"); + } + } +} diff --git a/src/cgroups/v1/hugetlb.rs b/src/cgroups/v1/hugetlb.rs index ad363a821..3e3551285 100644 --- a/src/cgroups/v1/hugetlb.rs +++ b/src/cgroups/v1/hugetlb.rs @@ -1,51 +1,51 @@ -use std::{fs, path::Path}; +use std::path::Path; -use anyhow::bail; -use regex::Regex; +use anyhow::{bail, Result}; -use crate::cgroups::{ - common::{self, CGROUP_PROCS}, - v1::Controller, -}; +use crate::cgroups::{common, v1::Controller}; use oci_spec::{LinuxHugepageLimit, LinuxResources}; pub struct Hugetlb {} impl Controller for Hugetlb { - fn apply( - linux_resources: &LinuxResources, - cgroup_root: &std::path::Path, - pid: nix::unistd::Pid, - ) -> anyhow::Result<()> { + type Resource = Vec; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &std::path::Path) -> Result<()> { log::debug!("Apply Hugetlb cgroup config"); - fs::create_dir_all(cgroup_root)?; - for hugetlb in &linux_resources.hugepage_limits { - Self::apply(cgroup_root, hugetlb)? + if let Some(hugepage_limits) = Self::needs_to_handle(linux_resources) { + for hugetlb in hugepage_limits { + Self::apply(cgroup_root, hugetlb)? 
+ } } - common::write_cgroup_file(cgroup_root.join(CGROUP_PROCS), pid)?; Ok(()) } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if !linux_resources.hugepage_limits.is_empty() { + return Some(&linux_resources.hugepage_limits); + } + + None + } } impl Hugetlb { - fn apply(root_path: &Path, hugetlb: &LinuxHugepageLimit) -> anyhow::Result<()> { - let re = Regex::new(r"(?P[0-9]+)[KMG]B")?; - let caps = re.captures(&hugetlb.page_size); - match caps { - None => bail!("page size must be in the format [0-9]+[KMG]B"), - Some(caps) => { - let page_size: u64 = caps["pagesize"].parse()?; - if !Self::is_power_of_two(page_size) { - bail!("page size must be in the format of 2^(integer)"); - } - } + fn apply(root_path: &Path, hugetlb: &LinuxHugepageLimit) -> Result<()> { + let page_size: String = hugetlb + .page_size + .chars() + .take_while(|c| c.is_digit(10)) + .collect(); + let page_size: u64 = page_size.parse()?; + if !Self::is_power_of_two(page_size) { + bail!("page size must be in the format of 2^(integer)"); } common::write_cgroup_file( - &root_path.join(format!("hugetlb.{}.limit_in_bytes", hugetlb.page_size)), - &hugetlb.limit, + root_path.join(format!("hugetlb.{}.limit_in_bytes", hugetlb.page_size)), + hugetlb.limit, )?; Ok(()) } @@ -58,7 +58,8 @@ impl Hugetlb { #[cfg(test)] mod tests { use super::*; - use crate::cgroups::test::{create_temp_dir, set_fixture}; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; use oci_spec::LinuxHugepageLimit; use std::fs::read_to_string; @@ -102,10 +103,13 @@ mod tests { let result = Hugetlb::apply(&tmp, &hugetlb); - let re = Regex::new(r"(?P[0-9]+)[KMG]B").expect("create regex for parsing pagesize"); - let caps = re.captures(&hugetlb.page_size).expect("should capture pagesize"); + let page_size: String = hugetlb + .page_size + .chars() + .take_while(|c| c.is_digit(10)) + .collect(); + let page_size: u64 = page_size.parse().expect("parse page size"); - let page_size: u64 
= caps["pagesize"].parse().expect("should contain captured pagesize"); if Hugetlb::is_power_of_two(page_size) && page_size != 1 { let content = read_to_string(tmp.join(page_file_name)).expect("Read hugetlb file content"); diff --git a/src/cgroups/v1/manager.rs b/src/cgroups/v1/manager.rs index fd5d7a2ea..b3aae533c 100644 --- a/src/cgroups/v1/manager.rs +++ b/src/cgroups/v1/manager.rs @@ -2,15 +2,18 @@ use std::fs; use std::path::Path; use std::{collections::HashMap, path::PathBuf}; +use anyhow::bail; use anyhow::Result; use nix::unistd::Pid; use procfs::process::Process; +use super::ControllerType as CtrlType; use super::{ blkio::Blkio, controller_type::CONTROLLERS, cpu::Cpu, cpuacct::CpuAcct, cpuset::CpuSet, - devices::Devices, hugetlb::Hugetlb, memory::Memory, network_classifier::NetworkClassifier, - network_priority::NetworkPriority, pids::Pids, util, Controller, + devices::Devices, freezer::Freezer, hugetlb::Hugetlb, memory::Memory, + network_classifier::NetworkClassifier, network_priority::NetworkPriority, pids::Pids, util, + Controller, }; use crate::cgroups::common::CGROUP_PROCS; @@ -18,30 +21,32 @@ use crate::utils; use crate::{cgroups::common::CgroupManager, utils::PathBufExt}; use oci_spec::LinuxResources; pub struct Manager { - subsystems: HashMap, + subsystems: HashMap, } impl Manager { + /// Constructs a new cgroup manager with cgroups_path being relative to the root of the subsystem pub fn new(cgroup_path: PathBuf) -> Result { - let mut subsystems = HashMap::::new(); - for subsystem in CONTROLLERS.iter().map(|c| c.to_string()) { - subsystems.insert( - subsystem.to_owned(), - Self::get_subsystem_path(&cgroup_path, &subsystem)?, - ); + let mut subsystems = HashMap::::new(); + for subsystem in CONTROLLERS { + if let Ok(subsystem_path) = Self::get_subsystem_path(&cgroup_path, subsystem) { + subsystems.insert(subsystem.clone(), subsystem_path); + } else { + log::warn!("Cgroup {} not supported on this system", subsystem); + } } Ok(Manager { subsystems }) } 
- fn get_subsystem_path(cgroup_path: &Path, subsystem: &str) -> anyhow::Result { + fn get_subsystem_path(cgroup_path: &Path, subsystem: &CtrlType) -> Result { log::debug!("Get path for subsystem: {}", subsystem); - let mount_point = util::get_subsystem_mount_points(subsystem)?; + let mount_point = util::get_subsystem_mount_point(subsystem)?; let cgroup = Process::myself()? .cgroups()? .into_iter() - .find(|c| c.controllers.contains(&subsystem.to_owned())) + .find(|c| c.controllers.contains(&subsystem.to_string())) .unwrap(); let p = if cgroup_path.to_string_lossy().into_owned().is_empty() { @@ -54,23 +59,82 @@ impl Manager { Ok(p) } + + fn get_required_controllers( + &self, + linux_resources: &LinuxResources, + ) -> Result> { + let mut required_controllers = HashMap::new(); + + for controller in CONTROLLERS { + let required = match controller { + CtrlType::Cpu => Cpu::needs_to_handle(linux_resources).is_some(), + CtrlType::CpuAcct => CpuAcct::needs_to_handle(linux_resources).is_some(), + CtrlType::CpuSet => CpuSet::needs_to_handle(linux_resources).is_some(), + CtrlType::Devices => Devices::needs_to_handle(linux_resources).is_some(), + CtrlType::HugeTlb => Hugetlb::needs_to_handle(linux_resources).is_some(), + CtrlType::Memory => Memory::needs_to_handle(linux_resources).is_some(), + CtrlType::Pids => Pids::needs_to_handle(linux_resources).is_some(), + CtrlType::Blkio => Blkio::needs_to_handle(linux_resources).is_some(), + CtrlType::NetworkPriority => { + NetworkPriority::needs_to_handle(linux_resources).is_some() + } + CtrlType::NetworkClassifier => { + NetworkClassifier::needs_to_handle(linux_resources).is_some() + } + CtrlType::Freezer => Freezer::needs_to_handle(linux_resources).is_some(), + }; + + if required { + if let Some(subsystem_path) = self.subsystems.get(controller) { + required_controllers.insert(controller, subsystem_path); + } else { + bail!("Cgroup {} is required to fullfill the request, but is not supported by this system", controller); + } + } + } 
+ + Ok(required_controllers) + } } impl CgroupManager for Manager { - fn apply(&self, linux_resources: &LinuxResources, pid: Pid) -> Result<()> { + fn add_task(&self, pid: Pid) -> Result<()> { for subsys in &self.subsystems { - match subsys.0.as_str() { - "cpu" => Cpu::apply(linux_resources, &subsys.1, pid)?, - "cpuacct" => CpuAcct::apply(linux_resources, &subsys.1, pid)?, - "cpuset" => CpuSet::apply(linux_resources, &subsys.1, pid)?, - "devices" => Devices::apply(linux_resources, &subsys.1, pid)?, - "hugetlb" => Hugetlb::apply(linux_resources, &subsys.1, pid)?, - "memory" => Memory::apply(linux_resources, &subsys.1, pid)?, - "pids" => Pids::apply(linux_resources, &subsys.1, pid)?, - "blkio" => Blkio::apply(linux_resources, &subsys.1, pid)?, - "net_prio" => NetworkPriority::apply(linux_resources, &subsys.1, pid)?, - "net_cls" => NetworkClassifier::apply(linux_resources, &subsys.1, pid)?, - _ => unreachable!("every subsystem should have an associated controller"), + match subsys.0 { + CtrlType::Cpu => Cpu::add_task(pid, subsys.1)?, + CtrlType::CpuAcct => CpuAcct::add_task(pid, subsys.1)?, + CtrlType::CpuSet => CpuSet::add_task(pid, subsys.1)?, + CtrlType::Devices => Devices::add_task(pid, subsys.1)?, + CtrlType::HugeTlb => Hugetlb::add_task(pid, subsys.1)?, + CtrlType::Memory => Memory::add_task(pid, subsys.1)?, + CtrlType::Pids => Pids::add_task(pid, subsys.1)?, + CtrlType::Blkio => Blkio::add_task(pid, subsys.1)?, + CtrlType::NetworkPriority => NetworkPriority::add_task(pid, subsys.1)?, + CtrlType::NetworkClassifier => NetworkClassifier::add_task(pid, subsys.1)?, + _ => continue, + } + } + + Ok(()) + } + + fn apply(&self, linux_resources: &LinuxResources) -> Result<()> { + for subsys in self.get_required_controllers(linux_resources)? 
{ + match subsys.0 { + CtrlType::Cpu => Cpu::apply(linux_resources, &subsys.1)?, + CtrlType::CpuAcct => CpuAcct::apply(linux_resources, &subsys.1)?, + CtrlType::CpuSet => CpuSet::apply(linux_resources, &subsys.1)?, + CtrlType::Devices => Devices::apply(linux_resources, &subsys.1)?, + CtrlType::HugeTlb => Hugetlb::apply(linux_resources, &subsys.1)?, + CtrlType::Memory => Memory::apply(linux_resources, &subsys.1)?, + CtrlType::Pids => Pids::apply(linux_resources, &subsys.1)?, + CtrlType::Blkio => Blkio::apply(linux_resources, &subsys.1)?, + CtrlType::NetworkPriority => NetworkPriority::apply(linux_resources, &subsys.1)?, + CtrlType::NetworkClassifier => { + NetworkClassifier::apply(linux_resources, &subsys.1)? + } + CtrlType::Freezer => Freezer::apply(linux_resources, &subsys.1)?, } } diff --git a/src/cgroups/v1/memory.rs b/src/cgroups/v1/memory.rs index c60409820..641fd0eea 100644 --- a/src/cgroups/v1/memory.rs +++ b/src/cgroups/v1/memory.rs @@ -1,13 +1,10 @@ use std::io::{prelude::*, Write}; -use std::{ - fs::{create_dir_all, OpenOptions}, - path::Path, -}; +use std::{fs::OpenOptions, path::Path}; use anyhow::{Result, *}; -use nix::{errno::Errno, unistd::Pid}; +use nix::errno::Errno; -use crate::cgroups::common::{self, CGROUP_PROCS}; +use crate::cgroups::common::{self}; use crate::cgroups::v1::Controller; use oci_spec::{LinuxMemory, LinuxResources}; @@ -25,11 +22,12 @@ const CGROUP_KERNEL_TCP_MEMORY_LIMIT: &str = "memory.kmem.tcp.limit_in_bytes"; pub struct Memory {} impl Controller for Memory { - fn apply(linux_resources: &LinuxResources, cgroup_root: &Path, pid: Pid) -> Result<()> { + type Resource = LinuxMemory; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { log::debug!("Apply Memory cgroup config"); - create_dir_all(&cgroup_root)?; - if let Some(memory) = &linux_resources.memory { + if let Some(memory) = Self::needs_to_handle(linux_resources) { let reservation = memory.reservation.unwrap_or(0); Self::apply(&memory, 
cgroup_root)?; @@ -76,9 +74,16 @@ impl Controller for Memory { } } - common::write_cgroup_file(cgroup_root.join(CGROUP_PROCS), pid)?; Ok(()) } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(memory) = &linux_resources.memory { + return Some(memory); + } + + None + } } impl Memory { @@ -239,7 +244,9 @@ impl Memory { #[cfg(test)] mod tests { use super::*; - use crate::cgroups::test::{create_temp_dir, set_fixture}; + use crate::cgroups::common::CGROUP_PROCS; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; use oci_spec::LinuxMemory; #[test] @@ -336,7 +343,7 @@ mod tests { } quickcheck! { - fn property_test_set_memory(linux_memory: LinuxMemory, disable_oom_killer: bool, pid_int: i32) -> bool { + fn property_test_set_memory(linux_memory: LinuxMemory, disable_oom_killer: bool) -> bool { let tmp = create_temp_dir("property_test_set_memory").expect("create temp directory for test"); set_fixture(&tmp, CGROUP_MEMORY_USAGE, "0").expect("Set fixure for memory usage"); @@ -364,10 +371,10 @@ mod tests { block_io: None, hugepage_limits: vec![], network: None, + freezer: None, }; - let pid = Pid::from_raw(pid_int); - let result = ::apply(&linux_resources, &tmp, pid); + let result = ::apply(&linux_resources, &tmp); if result.is_err() { if let Some(swappiness) = memory_limits.swappiness { @@ -453,10 +460,6 @@ mod tests { } }; - // check procs file - let procs_content = std::fs::read_to_string(tmp.join(CGROUP_PROCS)).expect("read procs file"); - let procs_check = procs_content == pid.to_string(); - // useful for debugging println!("reservation_check: {:?}", reservation_check); println!("kernel_check: {:?}", kernel_check); @@ -465,7 +468,7 @@ mod tests { println!("limit_swap_check: {:?}", limit_swap_check); // combine all the checks - reservation_check && kernel_check && kernel_tcp_check && swappiness_check && limit_swap_check && procs_check + reservation_check && kernel_check && kernel_tcp_check && 
swappiness_check && limit_swap_check } } } diff --git a/src/cgroups/v1/mod.rs b/src/cgroups/v1/mod.rs index 9816dc9f5..ff1855143 100644 --- a/src/cgroups/v1/mod.rs +++ b/src/cgroups/v1/mod.rs @@ -5,6 +5,7 @@ mod cpu; mod cpuacct; mod cpuset; mod devices; +mod freezer; mod hugetlb; pub mod manager; mod memory; diff --git a/src/cgroups/v1/network_classifier.rs b/src/cgroups/v1/network_classifier.rs index 0ecd9c873..551fc6726 100644 --- a/src/cgroups/v1/network_classifier.rs +++ b/src/cgroups/v1/network_classifier.rs @@ -1,27 +1,33 @@ -use std::{fs::create_dir_all, path::Path}; +use std::path::Path; use anyhow::Result; -use nix::unistd::Pid; use crate::cgroups::common; -use crate::cgroups::common::CGROUP_PROCS; use crate::cgroups::v1::Controller; use oci_spec::{LinuxNetwork, LinuxResources}; pub struct NetworkClassifier {} impl Controller for NetworkClassifier { - fn apply(linux_resources: &LinuxResources, cgroup_root: &Path, pid: Pid) -> Result<()> { + type Resource = LinuxNetwork; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { log::debug!("Apply NetworkClassifier cgroup config"); - create_dir_all(&cgroup_root)?; - if let Some(network) = linux_resources.network.as_ref() { + if let Some(network) = Self::needs_to_handle(linux_resources) { Self::apply(cgroup_root, network)?; } - common::write_cgroup_file(cgroup_root.join(CGROUP_PROCS), pid)?; Ok(()) } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(network) = &linux_resources.network { + return Some(network); + } + + None + } } impl NetworkClassifier { @@ -36,9 +42,9 @@ impl NetworkClassifier { #[cfg(test)] mod tests { - use crate::cgroups::test::{create_temp_dir, set_fixture}; - use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; #[test] fn test_apply_network_classifier() { diff --git a/src/cgroups/v1/network_priority.rs b/src/cgroups/v1/network_priority.rs index d12c66fe5..63683bc3c 100644 
--- a/src/cgroups/v1/network_priority.rs +++ b/src/cgroups/v1/network_priority.rs @@ -1,27 +1,33 @@ -use std::{fs::create_dir_all, path::Path}; +use std::path::Path; use anyhow::Result; -use nix::unistd::Pid; use crate::cgroups::common; -use crate::cgroups::common::CGROUP_PROCS; use crate::cgroups::v1::Controller; use oci_spec::{LinuxNetwork, LinuxResources}; pub struct NetworkPriority {} impl Controller for NetworkPriority { - fn apply(linux_resources: &LinuxResources, cgroup_root: &Path, pid: Pid) -> Result<()> { + type Resource = LinuxNetwork; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { log::debug!("Apply NetworkPriority cgroup config"); - create_dir_all(&cgroup_root)?; - if let Some(network) = linux_resources.network.as_ref() { + if let Some(network) = Self::needs_to_handle(linux_resources) { Self::apply(cgroup_root, network)?; } - common::write_cgroup_file(cgroup_root.join(CGROUP_PROCS), pid)?; Ok(()) } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(network) = &linux_resources.network { + return Some(network); + } + + None + } } impl NetworkPriority { @@ -36,7 +42,8 @@ impl NetworkPriority { #[cfg(test)] mod tests { use super::*; - use crate::cgroups::test::{create_temp_dir, set_fixture}; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; use oci_spec::LinuxInterfacePriority; #[test] diff --git a/src/cgroups/v1/pids.rs b/src/cgroups/v1/pids.rs index bb2f3af45..025bed7d9 100644 --- a/src/cgroups/v1/pids.rs +++ b/src/cgroups/v1/pids.rs @@ -1,34 +1,32 @@ -use std::{ - fs::{self}, - path::Path, -}; +use std::path::Path; use anyhow::Result; -use crate::cgroups::{ - common::{self, CGROUP_PROCS}, - v1::Controller, -}; +use crate::cgroups::{common, v1::Controller}; use oci_spec::{LinuxPids, LinuxResources}; pub struct Pids {} impl Controller for Pids { - fn apply( - linux_resources: &LinuxResources, - cgroup_root: &std::path::Path, - pid: 
nix::unistd::Pid, - ) -> anyhow::Result<()> { + type Resource = LinuxPids; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { log::debug!("Apply pids cgroup config"); - fs::create_dir_all(cgroup_root)?; if let Some(pids) = &linux_resources.pids { Self::apply(cgroup_root, pids)?; } - common::write_cgroup_file(cgroup_root.join(CGROUP_PROCS), pid)?; Ok(()) } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(pids) = &linux_resources.pids { + return Some(pids); + } + + None + } } impl Pids { @@ -46,9 +44,9 @@ impl Pids { #[cfg(test)] mod tests { - use crate::cgroups::test::{create_temp_dir, set_fixture}; - use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; use oci_spec::LinuxPids; #[test] diff --git a/src/cgroups/v1/util.rs b/src/cgroups/v1/util.rs index 7a31e28bf..389bbddba 100644 --- a/src/cgroups/v1/util.rs +++ b/src/cgroups/v1/util.rs @@ -3,21 +3,22 @@ use std::{collections::HashMap, path::PathBuf}; use anyhow::{anyhow, Result}; use procfs::process::Process; -use super::controller_type::CONTROLLERS; +use super::{controller_type::CONTROLLERS, ControllerType}; -pub fn list_subsystem_mount_points() -> Result> { +pub fn list_subsystem_mount_points() -> Result> { let mut mount_paths = HashMap::with_capacity(CONTROLLERS.len()); for controller in CONTROLLERS { - if let Ok(mount_point) = get_subsystem_mount_points(&controller.to_string()) { - mount_paths.insert(controller.to_string(), mount_point); + if let Ok(mount_point) = get_subsystem_mount_point(controller) { + mount_paths.insert(controller.to_owned(), mount_point); } } Ok(mount_paths) } -pub fn get_subsystem_mount_points(subsystem: &str) -> Result { +pub fn get_subsystem_mount_point(subsystem: &ControllerType) -> Result { + let subsystem = subsystem.to_string(); Process::myself()? .mountinfo()? 
.into_iter() diff --git a/src/cgroups/v2/controller_type.rs b/src/cgroups/v2/controller_type.rs index 016a4064f..c5cfb7201 100644 --- a/src/cgroups/v2/controller_type.rs +++ b/src/cgroups/v2/controller_type.rs @@ -5,6 +5,7 @@ pub enum ControllerType { Memory, HugeTlb, Pids, + Freezer, } impl ToString for ControllerType { @@ -16,6 +17,7 @@ impl ToString for ControllerType { Self::Memory => "memory".into(), Self::HugeTlb => "hugetlb".into(), Self::Pids => "pids".into(), + Self::Freezer => "freezer".into(), } } } diff --git a/src/cgroups/v2/cpu.rs b/src/cgroups/v2/cpu.rs index ae39f7670..184be42ba 100644 --- a/src/cgroups/v2/cpu.rs +++ b/src/cgroups/v2/cpu.rs @@ -86,7 +86,8 @@ impl Cpu { #[cfg(test)] mod tests { use super::*; - use crate::cgroups::test::{create_temp_dir, set_fixture, setup, LinuxCpuBuilder}; + use crate::cgroups::test::{set_fixture, setup, LinuxCpuBuilder}; + use crate::utils::create_temp_dir; use std::fs; #[test] diff --git a/src/cgroups/v2/freezer.rs b/src/cgroups/v2/freezer.rs new file mode 100644 index 000000000..5827f813d --- /dev/null +++ b/src/cgroups/v2/freezer.rs @@ -0,0 +1,193 @@ +use anyhow::{bail, Result}; +use std::{ + fs::OpenOptions, + io::{BufRead, BufReader, Read, Seek, SeekFrom, Write}, + path::Path, + str, thread, + time::Duration, +}; + +use oci_spec::{FreezerState, LinuxResources}; + +use super::controller::Controller; + +const CGROUP_FREEZE: &str = "cgroup.freeze"; +const CGROUP_EVENTS: &str = "cgroup.events"; + +pub struct Freezer {} + +impl Controller for Freezer { + fn apply(linux_resources: &LinuxResources, cgroup_path: &Path) -> Result<()> { + if let Some(freezer_state) = linux_resources.freezer { + Self::apply(freezer_state, cgroup_path)?; + } + + Ok(()) + } +} + +impl Freezer { + fn apply(freezer_state: FreezerState, path: &Path) -> Result<()> { + let state_str = match freezer_state { + FreezerState::Undefined => return Ok(()), + FreezerState::Frozen => "1", + FreezerState::Thawed => "0", + }; + + match OpenOptions::new() 
+ .create(false) + .write(true) + .open(path.join(CGROUP_FREEZE)) + { + Err(e) => { + if let FreezerState::Frozen = freezer_state { + bail!("freezer not supported {}", e); + } + return Ok(()); + } + Ok(mut file) => file.write_all(state_str.as_bytes())?, + }; + + // confirm that the cgroup did actually change states. + let actual_state = Self::read_freezer_state(path)?; + if !actual_state.eq(&freezer_state) { + bail!( + "expected \"cgroup.freeze\" to be in state {:?} but was in {:?}", + freezer_state, + actual_state + ); + } + + Ok(()) + } + + fn read_freezer_state(path: &Path) -> Result { + let mut buf = [0; 1]; + OpenOptions::new() + .create(false) + .read(true) + .open(path.join(CGROUP_FREEZE))? + .read_exact(&mut buf)?; + + let state = str::from_utf8(&buf)?; + match state { + "0" => Ok(FreezerState::Thawed), + "1" => Self::wait_frozen(path), + _ => bail!("unknown \"cgroup.freeze\" state: {}", state), + } + } + + // wait_frozen polls cgroup.events until it sees "frozen 1" in it. + fn wait_frozen(path: &Path) -> Result { + let f = OpenOptions::new() + .create(false) + .read(true) + .open(path.join(CGROUP_EVENTS))?; + let mut f = BufReader::new(f); + + let wait_time = Duration::from_millis(10); + let max_iter = 1000; + let mut iter = 0; + let mut line = String::new(); + + loop { + if iter == max_iter { + bail!( + "timeout of {} ms reached waiting for the cgroup to freeze", + wait_time.as_millis() * max_iter + ); + } + line.clear(); + let num_bytes = f.read_line(&mut line)?; + if num_bytes == 0 { + break; + } + if line.starts_with("frozen ") { + if line.starts_with("frozen 1") { + if iter > 1 { + log::debug!("frozen after {} retries", iter) + } + return Ok(FreezerState::Frozen); + } + iter += 1; + thread::sleep(wait_time); + f.seek(SeekFrom::Start(0))?; + } + } + + Ok(FreezerState::Undefined) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; + use oci_spec::FreezerState; + use 
std::sync::Arc; + + #[test] + fn test_set_freezer_state() { + let tmp = Arc::new( + create_temp_dir("test_set_freezer_state").expect("create temp directory for test"), + ); + set_fixture(&tmp, CGROUP_FREEZE, "").expect("Set fixure for freezer state"); + set_fixture(&tmp, CGROUP_EVENTS, "populated 0\nfrozen 0") + .expect("Set fixure for freezer state"); + + // set Frozen state. + { + // use another thread to update events file async. + let p = Arc::clone(&tmp); + thread::spawn(move || { + thread::sleep(Duration::from_millis(100)); + set_fixture(&p, CGROUP_EVENTS, "populated 0\nfrozen 1") + .expect("Set fixure for freezer state"); + }); + let freezer_state = FreezerState::Frozen; + Freezer::apply(freezer_state, &tmp).expect("Set freezer state"); + + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZE)).expect("Read to string"); + assert_eq!("1", state_content); + } + + // set Thawed state. + { + let freezer_state = FreezerState::Thawed; + Freezer::apply(freezer_state, &tmp).expect("Set freezer state"); + + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZE)).expect("Read to string"); + assert_eq!("0", state_content); + } + + // set Undefined state. 
+ { + let old_state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZE)).expect("Read to string"); + let freezer_state = FreezerState::Undefined; + Freezer::apply(freezer_state, &tmp).expect("Set freezer state"); + + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZE)).expect("Read to string"); + assert_eq!(old_state_content, state_content); + } + } + + #[test] + fn test_set_freezer_state_error() { + let tmp = create_temp_dir("test_set_freezer_state_error") + .expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_FREEZE, "").expect("Set fixure for freezer state"); + set_fixture(&tmp, CGROUP_EVENTS, "").expect("Set fixure for freezer state"); + + // events file does not contain "frozen 1" + { + let freezer_state = FreezerState::Frozen; + let r = Freezer::apply(freezer_state, &tmp); + assert!(r.is_err()); + } + } +} diff --git a/src/cgroups/v2/hugetlb.rs b/src/cgroups/v2/hugetlb.rs index 9e1a8321b..fe0cf9dab 100644 --- a/src/cgroups/v2/hugetlb.rs +++ b/src/cgroups/v2/hugetlb.rs @@ -1,12 +1,117 @@ -use anyhow::Result; +use anyhow::{bail, Result}; +use std::path::Path; use super::controller::Controller; -use oci_spec::LinuxResources; +use crate::cgroups::common; +use oci_spec::{LinuxHugepageLimit, LinuxResources}; pub struct HugeTlb {} impl Controller for HugeTlb { - fn apply(_: &LinuxResources, _: &std::path::Path) -> Result<()> { + fn apply(linux_resources: &LinuxResources, cgroup_root: &std::path::Path) -> Result<()> { + log::debug!("Apply hugetlb cgroup v2 config"); + if let Some(hugepage_limits) = Self::needs_to_handle(linux_resources) { + for hugetlb in hugepage_limits { + Self::apply(cgroup_root, hugetlb)? 
+ } + } Ok(()) } } + +impl HugeTlb { + fn apply(root_path: &Path, hugetlb: &LinuxHugepageLimit) -> Result<()> { + let page_size: String = hugetlb + .page_size + .chars() + .take_while(|c| c.is_digit(10)) + .collect(); + let page_size: u64 = page_size.parse()?; + if !Self::is_power_of_two(page_size) { + bail!("page size must be in the format of 2^(integer)"); + } + + common::write_cgroup_file( + root_path.join(format!("hugetlb.{}.limit_in_bytes", hugetlb.page_size)), + hugetlb.limit, + )?; + Ok(()) + } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Vec> { + if !linux_resources.hugepage_limits.is_empty() { + return Some(&linux_resources.hugepage_limits); + } + + None + } + + fn is_power_of_two(number: u64) -> bool { + (number != 0) && (number & (number - 1)) == 0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; + use oci_spec::LinuxHugepageLimit; + use std::fs::read_to_string; + + #[test] + fn test_set_hugetlb() { + let page_file_name = "hugetlb.2MB.limit_in_bytes"; + let tmp = create_temp_dir("test_set_hugetlbv2").expect("create temp directory for test"); + set_fixture(&tmp, page_file_name, "0").expect("Set fixture for 2 MB page size"); + + let hugetlb = LinuxHugepageLimit { + page_size: "2MB".to_owned(), + limit: 16384, + }; + HugeTlb::apply(&tmp, &hugetlb).expect("apply hugetlb"); + let content = read_to_string(tmp.join(page_file_name)).expect("Read hugetlb file content"); + assert_eq!(hugetlb.limit.to_string(), content); + } + + #[test] + fn test_set_hugetlb_with_invalid_page_size() { + let tmp = create_temp_dir("test_set_hugetlbv2_with_invalid_page_size") + .expect("create temp directory for test"); + + let hugetlb = LinuxHugepageLimit { + page_size: "3MB".to_owned(), + limit: 16384, + }; + + let result = HugeTlb::apply(&tmp, &hugetlb); + assert!( + result.is_err(), + "page size that is not a power of two should be an error" + ); + } + + quickcheck! 
{ + fn property_test_set_hugetlb(hugetlb: LinuxHugepageLimit) -> bool { + let page_file_name = format!("hugetlb.{:?}.limit_in_bytes", hugetlb.page_size); + let tmp = create_temp_dir("property_test_set_hugetlbv2").expect("create temp directory for test"); + set_fixture(&tmp, &page_file_name, "0").expect("Set fixture for page size"); + let result = HugeTlb::apply(&tmp, &hugetlb); + + let page_size: String = hugetlb + .page_size + .chars() + .take_while(|c| c.is_digit(10)) + .collect(); + let page_size: u64 = page_size.parse().expect("parse page size"); + + if HugeTlb::is_power_of_two(page_size) && page_size != 1 { + let content = + read_to_string(tmp.join(page_file_name)).expect("Read hugetlb file content"); + hugetlb.limit.to_string() == content + } else { + result.is_err() + } + } + } +} diff --git a/src/cgroups/v2/io.rs b/src/cgroups/v2/io.rs index ebd3ab959..67ef510ab 100644 --- a/src/cgroups/v2/io.rs +++ b/src/cgroups/v2/io.rs @@ -1,12 +1,237 @@ -use anyhow::Result; +use std::path::{Path, PathBuf}; + +use anyhow::{bail, Result}; + +use crate::cgroups::common; use super::controller::Controller; -use oci_spec::LinuxResources; +use oci_spec::{LinuxBlockIo, LinuxResources}; + +const CGROUP_BFQ_IO_WEIGHT: &str = "io.bfq.weight"; +const CGROUP_IO_WEIGHT: &str = "io.weight"; pub struct Io {} impl Controller for Io { - fn apply(_: &LinuxResources, _: &std::path::Path) -> Result<()> { + fn apply(linux_resource: &LinuxResources, cgroup_root: &Path) -> Result<()> { + log::debug!("Apply io cgrup v2 config"); + if let Some(io) = &linux_resource.block_io { + Self::apply(cgroup_root, io)?; + } + Ok(()) + } +} + +impl Io { + fn io_max_path(path: &Path) -> PathBuf { + path.join("io.max") + } + + // linux kernel doc: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#io + fn apply(root_path: &Path, blkio: &LinuxBlockIo) -> Result<()> { + for wd in &blkio.blkio_weight_device { + common::write_cgroup_file( + root_path.join(CGROUP_BFQ_IO_WEIGHT), + &format!("{}:{} 
{}", wd.major, wd.minor, wd.weight.unwrap()), + )?; + } + if let Some(leaf_weight) = blkio.blkio_leaf_weight { + if leaf_weight > 0 { + bail!("cannot set leaf_weight with cgroupv2"); + } + } + if let Some(io_weight) = blkio.blkio_weight { + if io_weight > 0 { + common::write_cgroup_file( + root_path.join(CGROUP_IO_WEIGHT), + format!("{}", io_weight), + )?; + } + } + + for trbd in &blkio.blkio_throttle_read_bps_device { + common::write_cgroup_file( + Self::io_max_path(root_path), + &format!("{}:{} rbps={}", trbd.major, trbd.minor, trbd.rate), + )?; + } + + for twbd in &blkio.blkio_throttle_write_bps_device { + common::write_cgroup_file( + Self::io_max_path(root_path), + format!("{}:{} wbps={}", twbd.major, twbd.minor, twbd.rate), + )?; + } + for trid in &blkio.blkio_throttle_read_iops_device { + common::write_cgroup_file( + Self::io_max_path(root_path), + format!("{}:{} riops={}", trid.major, trid.minor, trid.rate), + )?; + } + for twid in &blkio.blkio_throttle_write_iops_device { + common::write_cgroup_file( + Self::io_max_path(root_path), + format!("{}:{} wiops={}", twid.major, twid.minor, twid.rate), + )?; + } Ok(()) } } +#[cfg(test)] +mod test { + use super::*; + use crate::cgroups::test::setup; + use oci_spec::{LinuxBlockIo, LinuxThrottleDevice, LinuxWeightDevice}; + use std::fs; + struct BlockIoBuilder { + block_io: LinuxBlockIo, + } + impl BlockIoBuilder { + fn new() -> Self { + let block_io = LinuxBlockIo { + blkio_weight: Some(0), + blkio_leaf_weight: Some(0), + blkio_weight_device: vec![], + blkio_throttle_read_bps_device: vec![], + blkio_throttle_write_bps_device: vec![], + blkio_throttle_read_iops_device: vec![], + blkio_throttle_write_iops_device: vec![], + }; + + Self { block_io } + } + fn with_write_weight_device(mut self, throttle: Vec) -> Self { + self.block_io.blkio_weight_device = throttle; + self + } + fn with_write_io_weight(mut self, iow: u16) -> Self { + self.block_io.blkio_weight = Some(iow); + self + } + + fn with_read_bps(mut self, 
throttle: Vec) -> Self { + self.block_io.blkio_throttle_read_bps_device = throttle; + self + } + + fn with_write_bps(mut self, throttle: Vec) -> Self { + self.block_io.blkio_throttle_write_bps_device = throttle; + self + } + + fn with_read_iops(mut self, throttle: Vec) -> Self { + self.block_io.blkio_throttle_read_iops_device = throttle; + self + } + + fn with_write_iops(mut self, throttle: Vec) -> Self { + self.block_io.blkio_throttle_write_iops_device = throttle; + self + } + + fn build(self) -> LinuxBlockIo { + self.block_io + } + } + + #[test] + fn test_set_io_read_bps() { + let (tmp, throttle) = setup("test_set_io_read_bps", "io.max"); + + let blkio = BlockIoBuilder::new() + .with_read_bps(vec![LinuxThrottleDevice { + major: 8, + minor: 0, + rate: 102400, + }]) + .build(); + + Io::apply(&tmp, &blkio).expect("apply blkio"); + let content = fs::read_to_string(throttle).unwrap_or_else(|_| panic!("read rbps content")); + + assert_eq!("8:0 rbps=102400", content); + } + + #[test] + fn test_set_io_write_bps() { + let (tmp, throttle) = setup("test_set_io_write_bps", "io.max"); + + let blkio = BlockIoBuilder::new() + .with_write_bps(vec![LinuxThrottleDevice { + major: 8, + minor: 0, + rate: 102400, + }]) + .build(); + + Io::apply(&tmp, &blkio).expect("apply blkio"); + let content = fs::read_to_string(throttle).unwrap_or_else(|_| panic!("read rbps content")); + + assert_eq!("8:0 wbps=102400", content); + } + + #[test] + fn test_set_io_read_iops() { + let (tmp, throttle) = setup("test_set_io_read_iops", "io.max"); + + let blkio = BlockIoBuilder::new() + .with_read_iops(vec![LinuxThrottleDevice { + major: 8, + minor: 0, + rate: 102400, + }]) + .build(); + + Io::apply(&tmp, &blkio).expect("apply blkio"); + let content = fs::read_to_string(throttle).unwrap_or_else(|_| panic!("read riops content")); + + assert_eq!("8:0 riops=102400", content); + } + + #[test] + fn test_set_io_write_iops() { + let (tmp, throttle) = setup("test_set_io_write_iops", "io.max"); + + let blkio = 
BlockIoBuilder::new() + .with_write_iops(vec![LinuxThrottleDevice { + major: 8, + minor: 0, + rate: 102400, + }]) + .build(); + + Io::apply(&tmp, &blkio).expect("apply blkio"); + let content = fs::read_to_string(throttle).unwrap_or_else(|_| panic!("read wiops content")); + + assert_eq!("8:0 wiops=102400", content); + } + + #[test] + fn test_set_ioweight_device() { + let (tmp, throttle) = setup("test_set_io_weight_device", CGROUP_BFQ_IO_WEIGHT); + let blkio = BlockIoBuilder::new() + .with_write_weight_device(vec![LinuxWeightDevice { + major: 8, + minor: 0, + weight: Some(80), + leaf_weight: Some(0), + }]) + .build(); + Io::apply(&tmp, &blkio).expect("apply blkio"); + let content = + fs::read_to_string(throttle).unwrap_or_else(|_| panic!("read bfq_io_weight content")); + + assert_eq!("8:0 80", content); + } + + #[test] + fn test_set_ioweight() { + let (tmp, throttle) = setup("test_set_io_weight", CGROUP_IO_WEIGHT); + let blkio = BlockIoBuilder::new().with_write_io_weight(100).build(); + Io::apply(&tmp, &blkio).expect("apply blkio"); + let content = + fs::read_to_string(throttle).unwrap_or_else(|_| panic!("read bfq_io_weight content")); + + assert_eq!("100", content); + } +} diff --git a/src/cgroups/v2/manager.rs b/src/cgroups/v2/manager.rs index 77a04c618..32b17779a 100644 --- a/src/cgroups/v2/manager.rs +++ b/src/cgroups/v2/manager.rs @@ -9,7 +9,10 @@ use anyhow::{bail, Result}; use nix::unistd::Pid; use oci_spec::LinuxResources; -use super::{cpu::Cpu, cpuset::CpuSet, hugetlb::HugeTlb, io::Io, memory::Memory, pids::Pids}; +use super::{ + cpu::Cpu, cpuset::CpuSet, freezer::Freezer, hugetlb::HugeTlb, io::Io, memory::Memory, + pids::Pids, +}; use crate::{ cgroups::v2::controller::Controller, cgroups::{ @@ -29,33 +32,39 @@ const CONTROLLER_TYPES: &[ControllerType] = &[ ControllerType::Io, ControllerType::Memory, ControllerType::Pids, + ControllerType::Freezer, ]; pub struct Manager { root_path: PathBuf, cgroup_path: PathBuf, + full_path: PathBuf, } impl Manager { + /// 
Constructs a new cgroup manager with root path being the mount point + /// of a cgroup v2 fs and cgroup path being a relative path from the root pub fn new(root_path: PathBuf, cgroup_path: PathBuf) -> Result { + let full_path = root_path.join_absolute_path(&cgroup_path)?; + Ok(Self { root_path, cgroup_path, + full_path, }) } - fn create_unified_cgroup(&self, cgroup_path: &Path, pid: Pid) -> Result { - let full_path = self.root_path.join_absolute_path(cgroup_path)?; + fn create_unified_cgroup(&self, pid: Pid) -> Result<()> { let controllers: Vec = self - .get_available_controllers(&self.root_path)? - .into_iter() + .get_available_controllers()? + .iter() .map(|c| format!("{}{}", "+", c.to_string())) .collect(); Self::write_controllers(&self.root_path, &controllers)?; let mut current_path = self.root_path.clone(); - let mut components = cgroup_path.components().skip(1).peekable(); + let mut components = self.cgroup_path.components().skip(1).peekable(); while let Some(component) = components.next() { current_path = current_path.join(component); if !current_path.exists() { @@ -70,15 +79,12 @@ impl Manager { } } - common::write_cgroup_file(&full_path.join(CGROUP_PROCS), pid)?; - Ok(full_path) + common::write_cgroup_file(&self.full_path.join(CGROUP_PROCS), pid)?; + Ok(()) } - fn get_available_controllers>( - &self, - cgroup_path: P, - ) -> Result> { - let controllers_path = self.root_path.join(cgroup_path).join(CGROUP_CONTROLLERS); + fn get_available_controllers(&self) -> Result> { + let controllers_path = self.root_path.join(CGROUP_CONTROLLERS); if !controllers_path.exists() { bail!( "cannot get available controllers. 
{:?} does not exist", @@ -95,6 +101,7 @@ impl Manager { "io" => controllers.push(ControllerType::Io), "memory" => controllers.push(ControllerType::Memory), "pids" => controllers.push(ControllerType::Pids), + "freezer" => controllers.push(ControllerType::Freezer), tpe => log::warn!("Controller {} is not yet implemented.", tpe), } } @@ -112,17 +119,21 @@ impl Manager { } impl CgroupManager for Manager { - fn apply(&self, linux_resources: &LinuxResources, pid: Pid) -> Result<()> { - let full_cgroup_path = self.create_unified_cgroup(&self.cgroup_path, pid)?; + fn add_task(&self, pid: Pid) -> Result<()> { + self.create_unified_cgroup(pid)?; + Ok(()) + } + fn apply(&self, linux_resources: &LinuxResources) -> Result<()> { for controller in CONTROLLER_TYPES { match controller { - ControllerType::Cpu => Cpu::apply(linux_resources, &full_cgroup_path)?, - ControllerType::CpuSet => CpuSet::apply(linux_resources, &full_cgroup_path)?, - ControllerType::HugeTlb => HugeTlb::apply(linux_resources, &&full_cgroup_path)?, - ControllerType::Io => Io::apply(linux_resources, &&full_cgroup_path)?, - ControllerType::Memory => Memory::apply(linux_resources, &full_cgroup_path)?, - ControllerType::Pids => Pids::apply(linux_resources, &&full_cgroup_path)?, + ControllerType::Cpu => Cpu::apply(linux_resources, &self.full_path)?, + ControllerType::CpuSet => CpuSet::apply(linux_resources, &self.full_path)?, + ControllerType::HugeTlb => HugeTlb::apply(linux_resources, &self.full_path)?, + ControllerType::Io => Io::apply(linux_resources, &self.full_path)?, + ControllerType::Memory => Memory::apply(linux_resources, &self.full_path)?, + ControllerType::Pids => Pids::apply(linux_resources, &self.full_path)?, + ControllerType::Freezer => Freezer::apply(linux_resources, &self.full_path)?, } } @@ -130,9 +141,8 @@ impl CgroupManager for Manager { } fn remove(&self) -> Result<()> { - let full_path = self.root_path.join_absolute_path(&self.cgroup_path)?; - log::debug!("remove cgroup {:?}", full_path); - 
fs::remove_dir_all(full_path)?; + log::debug!("remove cgroup {:?}", self.full_path); + fs::remove_dir_all(&self.full_path)?; Ok(()) } diff --git a/src/cgroups/v2/memory.rs b/src/cgroups/v2/memory.rs index a83f7391f..24f6884af 100644 --- a/src/cgroups/v2/memory.rs +++ b/src/cgroups/v2/memory.rs @@ -1,10 +1,16 @@ -use anyhow::Result; +use anyhow::{Result, *}; use std::path::Path; use oci_spec::{LinuxMemory, LinuxResources}; +use crate::cgroups::common; + use super::controller::Controller; +const CGROUP_MEMORY_SWAP: &str = "memory.swap.max"; +const CGROUP_MEMORY_MAX: &str = "memory.max"; +const CGROUP_MEMORY_LOW: &str = "memory.low"; + pub struct Memory {} impl Controller for Memory { @@ -18,7 +24,253 @@ impl Controller for Memory { } impl Memory { - fn apply(_: &Path, _: &LinuxMemory) -> Result<()> { + fn set>(path: P, val: i64) -> Result<()> { + if val == 0 { + Ok(()) + } else if val == -1 { + common::write_cgroup_file_str(path, "max") + } else { + common::write_cgroup_file(path, val) + } + } + + fn apply(path: &Path, memory: &LinuxMemory) -> Result<()> { + // if nothing is set just exit right away + if memory.reservation.is_none() && memory.limit.is_none() && memory.swap.is_none() { + return Ok(()); + } + + match memory.limit { + Some(limit) if limit < -1 => { + bail!("invalid memory value: {}", limit); + } + Some(limit) => match memory.swap { + Some(swap) if swap < -1 => { + bail!("invalid swap value: {}", swap); + } + Some(swap) => { + Memory::set(path.join(CGROUP_MEMORY_SWAP), swap)?; + Memory::set(path.join(CGROUP_MEMORY_MAX), limit)?; + } + None => { + if limit == -1 { + Memory::set(path.join(CGROUP_MEMORY_SWAP), -1)?; + } + Memory::set(path.join(CGROUP_MEMORY_MAX), limit)?; + } + }, + None => { + if memory.swap.is_some() { + bail!("unable to set swap limit without memory limit"); + } + } + }; + + if let Some(reservation) = memory.reservation { + if reservation < -1 { + bail!("invalid memory reservation value: {}", reservation); + } + 
Memory::set(path.join(CGROUP_MEMORY_LOW), reservation)?; + } + Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; + use oci_spec::LinuxMemory; + use std::fs::read_to_string; + + #[test] + fn test_set_memory_v2() { + let tmp = create_temp_dir("test_set_memory_v2").expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_MEMORY_MAX, "0").expect("set fixture for memory limit"); + set_fixture(&tmp, CGROUP_MEMORY_LOW, "0").expect("set fixture for memory reservation"); + set_fixture(&tmp, CGROUP_MEMORY_SWAP, "0").expect("set fixture for swap limit"); + + let limit = 1024; + let reservation = 512; + let swap = 2048; + let memory_limits = &LinuxMemory { + limit: Some(limit), + reservation: Some(reservation), + swap: Some(swap), + kernel: None, + kernel_tcp: None, + swappiness: None, + }; + Memory::apply(&tmp, memory_limits).expect("apply memory limits"); + + let limit_content = read_to_string(tmp.join(CGROUP_MEMORY_MAX)).expect("read memory limit"); + assert_eq!(limit_content, limit.to_string()); + + let swap_content = read_to_string(tmp.join(CGROUP_MEMORY_SWAP)).expect("read swap limit"); + assert_eq!(swap_content, swap.to_string()); + + let reservation_content = + read_to_string(tmp.join(CGROUP_MEMORY_LOW)).expect("read memory reservation"); + assert_eq!(reservation_content, reservation.to_string()); + } + + #[test] + fn test_set_memory_unlimited_v2() { + let tmp = create_temp_dir("test_set_memory_unlimited_v2") + .expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_MEMORY_MAX, "0").expect("set fixture for memory limit"); + set_fixture(&tmp, CGROUP_MEMORY_LOW, "0").expect("set fixture for memory reservation"); + set_fixture(&tmp, CGROUP_MEMORY_SWAP, "0").expect("set fixture for swap limit"); + + let memory_limits = &LinuxMemory { + limit: Some(-1), + reservation: None, + swap: None, + kernel: None, + kernel_tcp: None, + swappiness: None, + }; + 
Memory::apply(&tmp, memory_limits).expect("apply memory limits"); + + let limit_content = read_to_string(tmp.join(CGROUP_MEMORY_MAX)).expect("read memory limit"); + assert_eq!(limit_content, "max"); + + let swap_content = read_to_string(tmp.join(CGROUP_MEMORY_SWAP)).expect("read swap limit"); + assert_eq!(swap_content, "max"); + } + + #[test] + fn test_err_swap_no_memory_v2() { + let tmp = + create_temp_dir("test_err_swap_no_memory_v2").expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_MEMORY_MAX, "0").expect("set fixture for memory limit"); + set_fixture(&tmp, CGROUP_MEMORY_LOW, "0").expect("set fixture for memory reservation"); + set_fixture(&tmp, CGROUP_MEMORY_SWAP, "0").expect("set fixture for swap limit"); + + let memory_limits = &LinuxMemory { + limit: None, + swap: Some(512), + reservation: None, + kernel: None, + kernel_tcp: None, + swappiness: None, + }; + + let result = Memory::apply(&tmp, memory_limits); + + assert!(result.is_err()); + } + + #[test] + fn test_err_bad_limit_v2() { + let tmp = create_temp_dir("test_err_bad_limit_v2").expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_MEMORY_MAX, "0").expect("set fixture for memory limit"); + set_fixture(&tmp, CGROUP_MEMORY_LOW, "0").expect("set fixture for memory reservation"); + set_fixture(&tmp, CGROUP_MEMORY_SWAP, "0").expect("set fixture for swap limit"); + + let memory_limits = &LinuxMemory { + limit: Some(-2), + swap: None, + reservation: None, + kernel: None, + kernel_tcp: None, + swappiness: None, + }; + + let result = Memory::apply(&tmp, memory_limits); + + assert!(result.is_err()); + } + + #[test] + fn test_err_bad_swap_v2() { + let tmp = create_temp_dir("test_err_bad_swap_v2").expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_MEMORY_MAX, "0").expect("set fixture for memory limit"); + set_fixture(&tmp, CGROUP_MEMORY_LOW, "0").expect("set fixture for memory reservation"); + set_fixture(&tmp, CGROUP_MEMORY_SWAP, "0").expect("set fixture for 
swap limit"); + + let memory_limits = &LinuxMemory { + limit: Some(512), + swap: Some(-3), + reservation: None, + kernel: None, + kernel_tcp: None, + swappiness: None, + }; + + let result = Memory::apply(&tmp, memory_limits); + + assert!(result.is_err()); + } + + quickcheck! { + fn property_test_set_memory_v2(linux_memory: LinuxMemory) -> bool { + let tmp = create_temp_dir("property_test_set_memory_v2").expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_MEMORY_MAX, "0").expect("set fixture for memory limit"); + set_fixture(&tmp, CGROUP_MEMORY_LOW, "0").expect("set fixture for memory reservation"); + set_fixture(&tmp, CGROUP_MEMORY_SWAP, "0").expect("set fixture for swap limit"); + + let result = Memory::apply(&tmp, &linux_memory); + + // we need to check for expected errors first and foremost or we'll get false negatives + // later + if let Some(limit) = linux_memory.limit { + if limit < -1 { + return result.is_err(); + } + } + + if let Some(swap) = linux_memory.swap { + if swap < -1 { + return result.is_err(); + } + if linux_memory.limit.is_none() { + return result.is_err(); + } + } + + if let Some(reservation) = linux_memory.reservation { + if reservation < -1 { + return result.is_err(); + } + } + + // check the limit file is set as expected + let limit_content = read_to_string(tmp.join(CGROUP_MEMORY_MAX)).expect("read memory limit to string"); + let limit_check = match linux_memory.limit { + Some(limit) if limit == -1 => limit_content == "max", + Some(limit) => limit_content == limit.to_string(), + None => limit_content == "0", + }; + + // check the swap file is set as expected + let swap_content = read_to_string(tmp.join(CGROUP_MEMORY_SWAP)).expect("read swap limit to string"); + let swap_check = match linux_memory.swap { + Some(swap) if swap == -1 => swap_content == "max", + Some(swap) => swap_content == swap.to_string(), + None => { + match linux_memory.limit { + Some(limit) if limit == -1 => swap_content == "max", + _ => swap_content == 
"0", + } + } + }; + + + // check the resevation file is set as expected + let reservation_content = read_to_string(tmp.join(CGROUP_MEMORY_LOW)).expect("read memory reservation to string"); + let reservation_check = match linux_memory.reservation { + Some(reservation) if reservation == -1 => reservation_content == "max", + Some(reservation) => reservation_content == reservation.to_string(), + None => reservation_content == "0", + }; + + println!("limit_check: {}", limit_check); + println!("swap_check: {}", swap_check); + println!("reservation_check: {}", reservation_check); + limit_check && swap_check && reservation_check + } + } +} diff --git a/src/cgroups/v2/mod.rs b/src/cgroups/v2/mod.rs index cea672f02..f86f1b8a0 100644 --- a/src/cgroups/v2/mod.rs +++ b/src/cgroups/v2/mod.rs @@ -2,9 +2,12 @@ mod controller; mod controller_type; mod cpu; mod cpuset; +mod freezer; mod hugetlb; mod io; pub mod manager; mod memory; mod pids; +pub mod systemd_manager; pub mod util; +pub use systemd_manager::SystemDCGroupManager; diff --git a/src/cgroups/v2/pids.rs b/src/cgroups/v2/pids.rs index 5306715de..9c7faf171 100644 --- a/src/cgroups/v2/pids.rs +++ b/src/cgroups/v2/pids.rs @@ -1,12 +1,68 @@ +use std::path::Path; + use anyhow::Result; +use crate::cgroups::common; + use super::controller::Controller; -use oci_spec::LinuxResources; +use oci_spec::{LinuxPids, LinuxResources}; pub struct Pids {} impl Controller for Pids { - fn apply(_: &LinuxResources, _: &std::path::Path) -> Result<()> { + fn apply(linux_resource: &LinuxResources, cgroup_root: &std::path::Path) -> Result<()> { + log::debug!("Apply pids cgroup v2 config"); + if let Some(pids) = &linux_resource.pids { + Self::apply(cgroup_root, pids)?; + } Ok(()) } } + +impl Pids { + fn apply(root_path: &Path, pids: &LinuxPids) -> Result<()> { + let limit = if pids.limit > 0 { + pids.limit.to_string() + } else { + "max".to_string() + }; + common::write_cgroup_file(&root_path.join("pids.max"), &limit) + } +} + +#[cfg(test)] +mod tests 
{ + use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; + use oci_spec::LinuxPids; + + #[test] + fn test_set_pids() { + let pids_file_name = "pids.max"; + let tmp = create_temp_dir("v2_test_set_pids").expect("create temp directory for test"); + set_fixture(&tmp, pids_file_name, "1000").expect("Set fixture for 1000 pids"); + + let pids = LinuxPids { limit: 1000 }; + + Pids::apply(&tmp, &pids).expect("apply pids"); + let content = + std::fs::read_to_string(tmp.join(pids_file_name)).expect("Read pids contents"); + assert_eq!(pids.limit.to_string(), content); + } + + #[test] + fn test_set_pids_max() { + let pids_file_name = "pids.max"; + let tmp = create_temp_dir("v2_test_set_pids_max").expect("create temp directory for test"); + set_fixture(&tmp, pids_file_name, "0").expect("set fixture for 0 pids"); + + let pids = LinuxPids { limit: 0 }; + + Pids::apply(&tmp, &pids).expect("apply pids"); + + let content = + std::fs::read_to_string(tmp.join(pids_file_name)).expect("Read pids contents"); + assert_eq!("max".to_string(), content); + } +} diff --git a/src/cgroups/v2/systemd_manager.rs b/src/cgroups/v2/systemd_manager.rs new file mode 100644 index 000000000..874a5b99d --- /dev/null +++ b/src/cgroups/v2/systemd_manager.rs @@ -0,0 +1,308 @@ +use std::{ + fs::{self}, + os::unix::fs::PermissionsExt, +}; + +use anyhow::{anyhow, bail, Result}; +use nix::unistd::Pid; +use oci_spec::LinuxResources; +use std::path::{Path, PathBuf}; + +use super::{ + cpu::Cpu, cpuset::CpuSet, freezer::Freezer, hugetlb::HugeTlb, io::Io, memory::Memory, + pids::Pids, +}; +use crate::cgroups::common; +use crate::cgroups::common::CgroupManager; +use crate::cgroups::v2::controller::Controller; +use crate::cgroups::v2::controller_type::ControllerType; +use crate::utils::PathBufExt; + +const CGROUP_PROCS: &str = "cgroup.procs"; +const CGROUP_CONTROLLERS: &str = "cgroup.controllers"; +const CGROUP_SUBTREE_CONTROL: &str = "cgroup.subtree_control"; + +// v2 systemd only 
supports cpu, io, memory and pids. +const CONTROLLER_TYPES: &[ControllerType] = &[ + ControllerType::Cpu, + ControllerType::Io, + ControllerType::Memory, + ControllerType::Pids, +]; + +/// SystemDCGroupManager is a driver for managing cgroups via systemd. +pub struct SystemDCGroupManager { + root_path: PathBuf, + cgroups_path: PathBuf, + full_path: PathBuf, +} + +/// Represents the systemd cgroups path: +/// It should be of the form [slice]:[scope_prefix]:[name]. +/// The slice is the "parent" and should be expanded properly, +/// see expand_slice below. +struct CgroupsPath { + parent: String, + scope: String, + name: String, +} + +impl SystemDCGroupManager { + pub fn new(root_path: PathBuf, cgroups_path: PathBuf) -> Result { + // TODO: create the systemd unit using a dbus client. + let destructured_path = Self::destructure_cgroups_path(cgroups_path)?; + let cgroups_path = Self::construct_cgroups_path(destructured_path)?; + let full_path = root_path.join_absolute_path(&cgroups_path)?; + + Ok(SystemDCGroupManager { + root_path, + cgroups_path, + full_path, + }) + } + + fn destructure_cgroups_path(cgroups_path: PathBuf) -> Result { + // cgroups path may never be empty as it is defaulted to `/youki` + // see 'get_cgroup_path' under utils.rs. + // if cgroups_path was provided it should be of the form [slice]:[scope_prefix]:[name], + // for example: "system.slice:docker:1234". + let mut parent = ""; + let scope; + let name; + if cgroups_path.starts_with("/youki") { + scope = "youki"; + name = cgroups_path + .strip_prefix("/youki/")? + .to_str() + .ok_or_else(|| anyhow!("Failed to parse cgroupsPath field."))?; + } else { + let parts = cgroups_path + .to_str() + .ok_or_else(|| anyhow!("Failed to parse cgroupsPath field."))? 
+ .split(':') + .collect::<Vec<&str>>(); + parent = parts[0]; + scope = parts[1]; + name = parts[2]; + } + + Ok(CgroupsPath { + parent: parent.to_owned(), + scope: scope.to_owned(), + name: name.to_owned(), + }) + } + + /// get_unit_name returns the unit (scope) name from the path provided by the user + /// for example: foo:docker:bar results in 'docker-bar.scope' + fn get_unit_name(cgroups_path: CgroupsPath) -> String { + // By default we create a scope unless specified explicitly. + if !cgroups_path.name.ends_with(".slice") { + return format!("{}-{}.scope", cgroups_path.scope, cgroups_path.name); + } + cgroups_path.name + } + + // systemd represents slice hierarchy using `-`, so we need to follow suit when + // generating the path of slice. For example, 'test-a-b.slice' becomes + // '/test.slice/test-a.slice/test-a-b.slice'. + fn expand_slice(slice: &str) -> Result<PathBuf> { + let suffix = ".slice"; + if slice.len() <= suffix.len() || !slice.ends_with(suffix) { + bail!("invalid slice name: {}", slice); + } + if slice.contains('/') { + bail!("invalid slice name: {}", slice); + } + let mut path = "".to_owned(); + let mut prefix = "".to_owned(); + let slice_name = slice.trim_end_matches(suffix); + // if input was -.slice, we should just return root now + if slice_name == "-" { + return Ok(Path::new("/").to_path_buf()); + } + for component in slice_name.split('-') { + if component.is_empty() { + anyhow!("Invalid slice name: {}", slice); + } + // Append the component to the path and to the prefix. + path = format!("{}/{}{}{}", path, prefix, component, suffix); + prefix = format!("{}{}-", prefix, component); + } + Ok(Path::new(&path).to_path_buf()) + } + + // construct_cgroups_path generates a cgroups path from the one provided by the user via cgroupsPath. + // an example of the final path: "/machine.slice/docker-foo.scope" + fn construct_cgroups_path(cgroups_path: CgroupsPath) -> Result<PathBuf> { + // the root slice is under 'machine.slice'.
+ let mut slice = Path::new("/machine.slice").to_path_buf(); + // if the user provided a '.slice' (as in a branch of a tree) + // we need to "unpack it". + if !cgroups_path.parent.is_empty() { + slice = Self::expand_slice(&cgroups_path.parent)?; + } + let unit_name = Self::get_unit_name(cgroups_path); + let cgroups_path = slice.join(unit_name); + Ok(cgroups_path) + } + + /// create_unified_cgroup verifies sure that *each level* in the downward path from the root cgroup + /// down to the cgroup_path provided by the user is a valid cgroup hierarchy, + /// containing the attached controllers and that it contains the container pid. + fn create_unified_cgroup(&self, pid: Pid) -> Result<()> { + let controllers: Vec = self + .get_available_controllers(&self.root_path)? + .into_iter() + .map(|c| format!("{}{}", "+", c.to_string())) + .collect(); + + // Write the controllers to the root_path. + Self::write_controllers(&self.root_path, &controllers)?; + + let mut current_path = self.root_path.clone(); + let mut components = self.cgroups_path.components().skip(1).peekable(); + // Verify that *each level* in the downward path from the root cgroup + // down to the cgroup_path provided by the user is a valid cgroup hierarchy. + // containing the attached controllers. 
+ while let Some(component) = components.next() { + current_path = current_path.join(component); + if !current_path.exists() { + fs::create_dir(&current_path)?; + fs::metadata(&current_path)?.permissions().set_mode(0o755); + } + + // last component cannot have subtree_control enabled due to internal process constraint + // if this were set, writing to the cgroups.procs file will fail with errno 16 (device or resource busy) + if components.peek().is_some() { + Self::write_controllers(&current_path, &controllers)?; + } + } + + common::write_cgroup_file(self.full_path.join(CGROUP_PROCS), pid) + } + + fn get_available_controllers<P: AsRef<Path>>( + &self, + cgroups_path: P, + ) -> Result<Vec<ControllerType>> { + let controllers_path = self.root_path.join(cgroups_path).join(CGROUP_CONTROLLERS); + if !controllers_path.exists() { + bail!( + "cannot get available controllers. {:?} does not exist", + controllers_path + ) + } + + let mut controllers = Vec::new(); + for controller in fs::read_to_string(&controllers_path)?.split_whitespace() { + match controller { + "cpu" => controllers.push(ControllerType::Cpu), + "io" => controllers.push(ControllerType::Io), + "memory" => controllers.push(ControllerType::Memory), + "pids" => controllers.push(ControllerType::Pids), + _ => continue, + } + } + + Ok(controllers) + } + + fn write_controllers(path: &Path, controllers: &[String]) -> Result<()> { + for controller in controllers { + common::write_cgroup_file_str(path.join(CGROUP_SUBTREE_CONTROL), controller)?; + } + + Ok(()) + } +} + +impl CgroupManager for SystemDCGroupManager { + fn add_task(&self, pid: Pid) -> Result<()> { + // Don't attach any pid to the cgroup if -1 is specified as a pid + if pid.as_raw() == -1 { + return Ok(()); + } + + self.create_unified_cgroup(pid)?; + Ok(()) + } + + fn apply(&self, linux_resources: &LinuxResources) -> Result<()> { + for controller in CONTROLLER_TYPES { + match controller { + ControllerType::Cpu => Cpu::apply(linux_resources, &self.full_path)?, + ControllerType::CpuSet => 
CpuSet::apply(linux_resources, &self.full_path)?, + ControllerType::HugeTlb => HugeTlb::apply(linux_resources, &self.full_path)?, + ControllerType::Io => Io::apply(linux_resources, &self.full_path)?, + ControllerType::Memory => Memory::apply(linux_resources, &self.full_path)?, + ControllerType::Pids => Pids::apply(linux_resources, &self.full_path)?, + ControllerType::Freezer => Freezer::apply(linux_resources, &self.full_path)?, + } + } + + Ok(()) + } + + fn remove(&self) -> Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn expand_slice_works() -> Result<()> { + assert_eq!( + SystemDCGroupManager::expand_slice("test-a-b.slice")?, + PathBuf::from("/test.slice/test-a.slice/test-a-b.slice"), + ); + + Ok(()) + } + + #[test] + fn get_cgroups_path_works_with_a_complex_slice() -> Result<()> { + let cgroups_path = SystemDCGroupManager::destructure_cgroups_path(PathBuf::from( + "test-a-b.slice:docker:foo", + )) + .expect(""); + + assert_eq!( + SystemDCGroupManager::construct_cgroups_path(cgroups_path)?, + PathBuf::from("/test.slice/test-a.slice/test-a-b.slice/docker-foo.scope"), + ); + + Ok(()) + } + + #[test] + fn get_cgroups_path_works_with_a_simple_slice() -> Result<()> { + let cgroups_path = SystemDCGroupManager::destructure_cgroups_path(PathBuf::from( + "machine.slice:libpod:foo", + )) + .expect(""); + + assert_eq!( + SystemDCGroupManager::construct_cgroups_path(cgroups_path)?, + PathBuf::from("/machine.slice/libpod-foo.scope"), + ); + + Ok(()) + } + + #[test] + fn get_cgroups_path_works_with_scope() -> Result<()> { + let cgroups_path = + SystemDCGroupManager::destructure_cgroups_path(PathBuf::from(":docker:foo")).expect(""); + + assert_eq!( + SystemDCGroupManager::construct_cgroups_path(cgroups_path)?, + PathBuf::from("/machine.slice/docker-foo.scope"), + ); + + Ok(()) + } +} diff --git a/src/command/linux.rs b/src/command/linux.rs index 71282ea4c..cc055c172 100644 --- a/src/command/linux.rs +++ b/src/command/linux.rs @@ -1,8 
+1,12 @@ //! Implements Command trait for Linux systems -use std::{any::Any, path::Path}; +use std::ffi::{CStr, OsStr}; +use std::os::unix::ffi::OsStrExt; +use std::sync::Arc; +use std::{any::Any, mem, path::Path, ptr}; use anyhow::{bail, Result}; -use caps::{errors::CapsError, CapSet, CapsHashSet}; +use caps::{errors::CapsError, CapSet, Capability, CapsHashSet}; +use libc::{c_char, uid_t}; use nix::{ errno::Errno, unistd::{fchdir, pivot_root, sethostname}, @@ -20,14 +24,29 @@ use nix::{sched::unshare, sys::stat::Mode}; use oci_spec::LinuxRlimit; -use super::Command; +use super::Syscall; use crate::capabilities; /// Empty structure to implement Command trait for #[derive(Clone)] -pub struct LinuxCommand; +pub struct LinuxSyscall; -impl Command for LinuxCommand { +impl LinuxSyscall { + unsafe fn from_raw_buf<'a, T>(p: *const c_char) -> T + where + T: From<&'a OsStr>, + { + T::from(OsStr::from_bytes(CStr::from_ptr(p).to_bytes())) + } + + /// Reads data from the `c_passwd` and returns it as a `User`. 
+ unsafe fn passwd_to_user(passwd: libc::passwd) -> Arc { + let name: Arc = Self::from_raw_buf(passwd.pw_name); + name + } +} + +impl Syscall for LinuxSyscall { /// To enable dynamic typing, /// see https://doc.rust-lang.org/std/any/index.html for more information fn as_any(&self) -> &dyn Any { @@ -95,7 +114,31 @@ impl Command for LinuxCommand { /// Set capabilities for container process fn set_capability(&self, cset: CapSet, value: &CapsHashSet) -> Result<(), CapsError> { - caps::set(None, cset, value) + match cset { + // caps::set cannot set capabilities in bounding set, + // so we do it differently + CapSet::Bounding => { + // get all capabilities + let all = caps::all(); + // the difference will give capabilities + // which are to be unset + // for each such =, drop that capability + // after this, only those which are to be set will remain set + for c in all.difference(value) { + match c { + Capability::CAP_PERFMON + | Capability::CAP_CHECKPOINT_RESTORE + | Capability::CAP_BPF => { + log::warn!("{:?} is not supported.", c); + continue; + } + _ => caps::drop(None, CapSet::Bounding, *c)?, + } + } + Ok(()) + } + _ => caps::set(None, cset, value), + } } /// Sets hostname for process @@ -118,4 +161,38 @@ impl Command for LinuxCommand { } Ok(()) } + + // taken from https://crates.io/crates/users + fn get_pwuid(&self, uid: uid_t) -> Option> { + let mut passwd = unsafe { mem::zeroed::() }; + let mut buf = vec![0; 2048]; + let mut result = ptr::null_mut::(); + + loop { + let r = unsafe { + libc::getpwuid_r(uid, &mut passwd, buf.as_mut_ptr(), buf.len(), &mut result) + }; + + if r != libc::ERANGE { + break; + } + + let newsize = buf.len().checked_mul(2)?; + buf.resize(newsize, 0); + } + + if result.is_null() { + // There is no such user, or an error has occurred. + // errno gets set if there’s an error. + return None; + } + + if result != &mut passwd { + // The result of getpwuid_r should be its input passwd. 
+ return None; + } + + let user = unsafe { Self::passwd_to_user(result.read()) }; + Some(user) + } } diff --git a/src/command/mod.rs b/src/command/mod.rs index fa2fc01fe..543997e8e 100644 --- a/src/command/mod.rs +++ b/src/command/mod.rs @@ -2,9 +2,9 @@ //! This provides a uniform interface for rest of Youki //! to call syscalls required for container management -#[allow(clippy::module_inception)] -mod command; pub mod linux; +#[allow(clippy::module_inception)] +pub mod syscall; pub mod test; -pub use command::Command; +pub use syscall::Syscall; diff --git a/src/command/command.rs b/src/command/syscall.rs similarity index 70% rename from src/command/command.rs rename to src/command/syscall.rs index 4ad6f3417..f3bba8727 100644 --- a/src/command/command.rs +++ b/src/command/syscall.rs @@ -1,7 +1,7 @@ //! An interface trait so that rest of Youki can call //! necessary functions without having to worry about their //! implementation details -use std::{any::Any, path::Path}; +use std::{any::Any, ffi::OsStr, path::Path, sync::Arc}; use anyhow::Result; use caps::{errors::CapsError, CapSet, CapsHashSet}; @@ -12,9 +12,11 @@ use nix::{ use oci_spec::LinuxRlimit; +use crate::command::{linux::LinuxSyscall, test::TestHelperSyscall}; + /// This specifies various kernel/other functionalities required for /// container management -pub trait Command { +pub trait Syscall { fn as_any(&self) -> &dyn Any; fn pivot_rootfs(&self, path: &Path) -> Result<()>; fn set_ns(&self, rawfd: i32, nstype: CloneFlags) -> Result<()>; @@ -23,4 +25,13 @@ pub trait Command { fn set_capability(&self, cset: CapSet, value: &CapsHashSet) -> Result<(), CapsError>; fn set_hostname(&self, hostname: &str) -> Result<()>; fn set_rlimit(&self, rlimit: &LinuxRlimit) -> Result<()>; + fn get_pwuid(&self, uid: u32) -> Option>; +} + +pub fn create_syscall() -> Box { + if cfg!(test) { + Box::new(TestHelperSyscall::default()) + } else { + Box::new(LinuxSyscall) + } } diff --git a/src/command/test.rs b/src/command/test.rs 
index fe5540d14..eaa7c3ac0 100644 --- a/src/command/test.rs +++ b/src/command/test.rs @@ -1,21 +1,21 @@ -use std::{any::Any, cell::RefCell}; +use std::{any::Any, cell::RefCell, ffi::OsStr, sync::Arc}; use caps::{errors::CapsError, CapSet, CapsHashSet}; use nix::sched::CloneFlags; use oci_spec::LinuxRlimit; -use super::Command; +use super::Syscall; #[derive(Clone)] -pub struct TestHelperCommand { +pub struct TestHelperSyscall { set_ns_args: RefCell>, unshare_args: RefCell>, set_capability_args: RefCell>, } -impl Default for TestHelperCommand { +impl Default for TestHelperSyscall { fn default() -> Self { - TestHelperCommand { + TestHelperSyscall { set_ns_args: RefCell::new(vec![]), unshare_args: RefCell::new(vec![]), set_capability_args: RefCell::new(vec![]), @@ -23,7 +23,7 @@ impl Default for TestHelperCommand { } } -impl Command for TestHelperCommand { +impl Syscall for TestHelperSyscall { fn as_any(&self) -> &dyn Any { self } @@ -60,9 +60,13 @@ impl Command for TestHelperCommand { fn set_rlimit(&self, _rlimit: &LinuxRlimit) -> anyhow::Result<()> { todo!() } + + fn get_pwuid(&self, _: u32) -> Option> { + todo!() + } } -impl TestHelperCommand { +impl TestHelperSyscall { pub fn get_setns_args(&self) -> Vec<(i32, CloneFlags)> { self.set_ns_args.borrow_mut().clone() } diff --git a/src/container/builder.rs b/src/container/builder.rs new file mode 100644 index 000000000..cfaf6dc98 --- /dev/null +++ b/src/container/builder.rs @@ -0,0 +1,130 @@ +use crate::command::linux::LinuxSyscall; +use std::path::PathBuf; + +use super::{init_builder::InitContainerBuilder, tenant_builder::TenantContainerBuilder}; +pub struct ContainerBuilder { + pub(super) container_id: String, + + pub(super) root_path: PathBuf, + + pub(super) syscall: LinuxSyscall, + + pub(super) pid_file: Option, + + pub(super) console_socket: Option, +} + +/// Builder that can be used to configure the common properties of +/// either a init or a tenant container +/// +/// # Example +/// +/// ```no_run +/// use 
youki::container::builder::ContainerBuilder; +/// +/// ContainerBuilder::new("74f1a4cb3801".to_owned()) +/// .with_root_path("/run/containers/youki") +/// .with_pid_file("/var/run/docker.pid") +/// .with_console_socket("/var/run/docker/sock.tty") +/// .as_init("/var/run/docker/bundle") +/// .build(); +/// ``` +impl ContainerBuilder { + /// Generates the base configuration for a container which can be + /// transformed into either a init container or a tenant container + /// + /// # Example + /// + /// ```no_run + /// use youki::container::builder::ContainerBuilder; + /// + /// let builder = ContainerBuilder::new("74f1a4cb3801".to_owned()); + /// ``` + pub fn new(container_id: String) -> Self { + let root_path = PathBuf::from("/run/youki"); + + Self { + container_id, + root_path, + syscall: LinuxSyscall, + pid_file: None, + console_socket: None, + } + } + + /// Transforms this builder into a tenant builder + /// # Example + /// + /// ```no_run + /// # use youki::container::builder::ContainerBuilder; + /// + /// ContainerBuilder::new("74f1a4cb3801".to_owned()) + /// .as_tenant() + /// .with_container_command(vec!["sleep".to_owned(), "9001".to_owned()]) + /// .build(); + /// ``` + #[allow(clippy::wrong_self_convention)] + pub fn as_tenant(self) -> TenantContainerBuilder { + TenantContainerBuilder::new(self) + } + + /// Transforms this builder into an init builder + /// # Example + /// + /// ```no_run + /// # use youki::container::builder::ContainerBuilder; + /// + /// ContainerBuilder::new("74f1a4cb3801".to_owned()) + /// .as_init("/var/run/docker/bundle") + /// .with_systemd(false) + /// .build(); + /// ``` + #[allow(clippy::wrong_self_convention)] + pub fn as_init>(self, bundle: P) -> InitContainerBuilder { + InitContainerBuilder::new(self, bundle.into()) + } + + /// Sets the root path which will be used to store the container state + /// # Example + /// + /// ```no_run + /// # use youki::container::builder::ContainerBuilder; + /// + /// 
ContainerBuilder::new("74f1a4cb3801".to_owned()) + /// .with_root_path("/run/containers/youki"); + /// ``` + pub fn with_root_path>(mut self, path: P) -> Self { + self.root_path = path.into(); + self + } + + /// Sets the pid file which will be used to write the pid of the container + /// process + /// # Example + /// + /// ```no_run + /// # use youki::container::builder::ContainerBuilder; + /// + /// ContainerBuilder::new("74f1a4cb3801".to_owned()) + /// .with_pid_file("/var/run/docker.pid"); + /// ``` + pub fn with_pid_file>(mut self, path: P) -> Self { + self.pid_file = Some(path.into()); + self + } + + /// Sets the console socket, which will be used to send the file descriptor + /// of the pseudoterminal + /// # Example + /// + /// ```no_run + /// # use youki::container::builder::ContainerBuilder; + /// + /// ContainerBuilder::new("74f1a4cb3801".to_owned()) + /// .with_console_socket("/var/run/docker/sock.tty"); + /// ``` + pub fn with_console_socket>(mut self, path: P) -> Self { + self.console_socket = Some(path.into()); + self + } +} diff --git a/src/container/builder_impl.rs b/src/container/builder_impl.rs new file mode 100644 index 000000000..cf85bddc9 --- /dev/null +++ b/src/container/builder_impl.rs @@ -0,0 +1,121 @@ +use std::path::PathBuf; + +use anyhow::Result; +use nix::{ + sched, + unistd::{Gid, Uid}, +}; +use oci_spec::Spec; + +use crate::{ + cgroups, + command::{linux::LinuxSyscall, Syscall}, + namespaces::Namespaces, + notify_socket::NotifyListener, + process::{fork, setup_init_process, Process}, + rootless::Rootless, + stdio::FileDescriptor, + tty, utils, +}; + +use super::{Container, ContainerStatus}; + +pub(super) struct ContainerBuilderImpl { + pub init: bool, + pub syscall: LinuxSyscall, + pub use_systemd: bool, + pub container_id: String, + pub root_path: PathBuf, + pub container_dir: PathBuf, + pub spec: Spec, + pub rootfs: PathBuf, + pub pid_file: Option, + pub console_socket: Option, + pub rootless: Option, + pub notify_socket: 
NotifyListener, + pub container: Option, +} + +impl ContainerBuilderImpl { + pub(super) fn create(&mut self) -> Result<()> { + if let Process::Parent(_) = self.run_container()? { + std::process::exit(0); + } + + Ok(()) + } + + fn run_container(&mut self) -> Result { + prctl::set_dumpable(false).unwrap(); + + let linux = self.spec.linux.as_ref().unwrap(); + let namespaces: Namespaces = linux.namespaces.clone().into(); + + let cgroups_path = utils::get_cgroup_path(&linux.cgroups_path, &self.container_id); + let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path, self.use_systemd)?; + + // first fork, which creates process, which will later create actual container process + match fork::fork_first( + &self.pid_file, + &self.rootless, + linux, + self.container.as_ref(), + cmanager, + )? { + // In the parent process, which called run_container + Process::Parent(parent) => Ok(Process::Parent(parent)), + // in child process + Process::Child(child) => { + // set limits and namespaces to the process + for rlimit in self.spec.process.rlimits.iter() { + self.syscall.set_rlimit(rlimit)? + } + self.syscall.set_id(Uid::from_raw(0), Gid::from_raw(0))?; + + let without = sched::CloneFlags::CLONE_NEWUSER; + namespaces.apply_unshare(without)?; + + // set up tty if specified + if let Some(csocketfd) = &self.console_socket { + tty::setup_console(csocketfd)?; + } + + // set namespaces + namespaces.apply_setns()?; + + // fork second time, which will later create container + match fork::fork_init(child)? 
{ + Process::Child(_child) => unreachable!(), + // This is actually the child process after fork + Process::Init(mut init) => { + // prepare process + setup_init_process( + &self.spec, + &self.syscall, + self.rootfs.clone(), + &namespaces, + )?; + init.ready()?; + self.notify_socket.wait_for_container_start()?; + // actually run the command / program to be run in container + let args: &Vec = &self.spec.process.args; + let envs: &Vec = &self.spec.process.env; + utils::do_exec(&args[0], args, envs)?; + + if let Some(container) = &self.container { + // the command / program is done executing + container + .refresh_state()? + .update_status(ContainerStatus::Stopped) + .save()?; + } + + Ok(Process::Init(init)) + } + Process::Parent(_) => unreachable!(), + } + } + _ => unreachable!(), + } + } +} diff --git a/src/container/container.rs b/src/container/container.rs index 378d30096..d144e9497 100644 --- a/src/container/container.rs +++ b/src/container/container.rs @@ -1,10 +1,16 @@ +use std::ffi::OsString; use std::fs; use std::path::{Path, PathBuf}; use anyhow::Result; +use chrono::DateTime; use nix::unistd::Pid; + +use chrono::Utc; use procfs::process::Process; +use crate::command::syscall::create_syscall; + use crate::container::{ContainerStatus, State}; /// Structure representing the container data @@ -39,7 +45,7 @@ impl Container { pub fn status(&self) -> ContainerStatus { self.state.status } - pub fn refresh_status(&self) -> Result { + pub fn refresh_status(&mut self) -> Result { let new_status = match self.pid() { Some(pid) => { // Note that Process::new does not spawn a new process @@ -60,11 +66,19 @@ impl Container { } None => ContainerStatus::Stopped, }; - self.update_status(new_status) + Ok(self.update_status(new_status)) + } + + pub fn refresh_state(&self) -> Result { + let state = State::load(&self.root)?; + Ok(Self { + state, + root: self.root.clone(), + }) } pub fn save(&self) -> Result<()> { - log::debug!("Sava container status: {:?} in {:?}", self, 
self.root); + log::debug!("Save container status: {:?} in {:?}", self, self.root); self.state.save(&self.root) } @@ -85,24 +99,50 @@ impl Container { } pub fn set_pid(&self, pid: i32) -> Self { - Self::new( - self.state.id.as_str(), - self.state.status, - Some(pid), - self.state.bundle.as_str(), - &self.root, - ) - .expect("unexpected error") - } - - pub fn update_status(&self, status: ContainerStatus) -> Result { - Self::new( - self.state.id.as_str(), - status, - self.state.pid, - self.state.bundle.as_str(), - &self.root, - ) + let mut new_state = self.state.clone(); + new_state.pid = Some(pid); + + Self { + state: new_state, + root: self.root.clone(), + } + } + + pub fn created(&self) -> Option> { + self.state.created + } + + pub fn set_creator(mut self, uid: u32) -> Self { + self.state.creator = Some(uid); + self + } + + pub fn creator(&self) -> Option { + if let Some(uid) = self.state.creator { + let command = create_syscall(); + let user_name = command.get_pwuid(uid); + if let Some(user_name) = user_name { + return Some((&*user_name).to_owned()); + } + } + + None + } + + pub fn update_status(&self, status: ContainerStatus) -> Self { + let created = match (status, self.state.created) { + (ContainerStatus::Created, None) => Some(Utc::now()), + _ => self.state.created, + }; + + let mut new_state = self.state.clone(); + new_state.created = created; + new_state.status = status; + + Self { + state: new_state, + root: self.root.clone(), + } } pub fn load(container_root: PathBuf) -> Result { @@ -112,4 +152,8 @@ impl Container { root: container_root, }) } + + pub fn bundle(&self) -> String { + self.state.bundle.clone() + } } diff --git a/src/container/init_builder.rs b/src/container/init_builder.rs new file mode 100644 index 000000000..b6789f21e --- /dev/null +++ b/src/container/init_builder.rs @@ -0,0 +1,121 @@ +use anyhow::{bail, Context, Result}; +use nix::unistd; +use oci_spec::Spec; +use rootless::detect_rootless; +use std::{ + fs, + path::{Path, PathBuf}, +}; + 
+use crate::{notify_socket::NotifyListener, rootless, tty, utils}; + +use super::{ + builder::ContainerBuilder, builder_impl::ContainerBuilderImpl, Container, ContainerStatus, +}; + +// Builder that can be used to configure the properties of a new container +pub struct InitContainerBuilder { + base: ContainerBuilder, + bundle: PathBuf, + use_systemd: bool, +} + +impl InitContainerBuilder { + /// Generates the base configuration for a new container from which + /// configuration methods can be chained + pub(super) fn new(builder: ContainerBuilder, bundle: PathBuf) -> Self { + Self { + base: builder, + bundle, + use_systemd: true, + } + } + + /// Sets if systemd should be used for managing cgroups + pub fn with_systemd(mut self, should_use: bool) -> Self { + self.use_systemd = should_use; + self + } + + /// Creates a new container + pub fn build(self) -> Result<()> { + let container_dir = self.create_container_dir()?; + let spec = self.load_and_safeguard_spec(&container_dir)?; + + unistd::chdir(&*container_dir)?; + let container_state = self.create_container_state(&container_dir)?; + + let notify_socket: NotifyListener = NotifyListener::new(&container_dir)?; + // convert path of root file system of the container to absolute path + let rootfs = fs::canonicalize(&spec.root.path)?; + + // if socket file path is given in commandline options, + // get file descriptors of console socket + let csocketfd = if let Some(console_socket) = &self.base.console_socket { + Some(tty::setup_console_socket(&container_dir, console_socket)?) 
+ } else { + None + }; + + let rootless = detect_rootless(&spec)?; + + let mut builder_impl = ContainerBuilderImpl { + init: true, + syscall: self.base.syscall, + container_id: self.base.container_id, + root_path: self.base.root_path, + pid_file: self.base.pid_file, + console_socket: csocketfd, + use_systemd: self.use_systemd, + container_dir, + spec, + rootfs, + rootless, + notify_socket, + container: Some(container_state), + }; + + builder_impl.create()?; + Ok(()) + } + + fn create_container_dir(&self) -> Result { + let container_dir = self.base.root_path.join(&self.base.container_id); + log::debug!("container directory will be {:?}", container_dir); + + if container_dir.exists() { + bail!("container {} already exists", self.base.container_id); + } + + utils::create_dir_all(&container_dir)?; + Ok(container_dir) + } + + fn load_and_safeguard_spec(&self, container_dir: &Path) -> Result { + let source_spec_path = self.bundle.join("config.json"); + let target_spec_path = container_dir.join("config.json"); + fs::copy(&source_spec_path, &target_spec_path).with_context(|| { + format!( + "failed to copy {:?} to {:?}", + source_spec_path, target_spec_path + ) + })?; + + let mut spec = oci_spec::Spec::load(&target_spec_path)?; + unistd::chdir(&self.bundle)?; + spec.canonicalize_rootfs()?; + Ok(spec) + } + + fn create_container_state(&self, container_dir: &Path) -> Result { + let container = Container::new( + &self.base.container_id, + ContainerStatus::Creating, + None, + self.bundle.as_path().to_str().unwrap(), + &container_dir, + )?; + container.save()?; + Ok(container) + } +} diff --git a/src/container/mod.rs b/src/container/mod.rs index bf3ca1982..59873ad22 100644 --- a/src/container/mod.rs +++ b/src/container/mod.rs @@ -1,7 +1,11 @@ //! 
Container management +pub mod builder; +mod builder_impl; #[allow(clippy::module_inception)] mod container; +pub mod init_builder; mod state; +pub mod tenant_builder; pub use container::Container; pub use state::{ContainerStatus, State}; diff --git a/src/container/state.rs b/src/container/state.rs index 30964c855..49bfa4274 100644 --- a/src/container/state.rs +++ b/src/container/state.rs @@ -1,9 +1,11 @@ //! Information about status and state of the container use std::collections::HashMap; +use std::fmt::Display; use std::fs; use std::{fs::File, path::Path}; use anyhow::Result; +use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; const STATE_FILE_PATH: &str = "state.json"; @@ -40,6 +42,19 @@ impl ContainerStatus { } } +impl Display for ContainerStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let print = match *self { + Self::Creating => "Creating", + Self::Created => "Created", + Self::Running => "Running", + Self::Stopped => "Stopped", + }; + + write!(f, "{}", print) + } +} + /// Stores the state information of the container #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "camelCase")] @@ -57,6 +72,12 @@ pub struct State { pub bundle: String, // Annotations are key values associated with the container. 
pub annotations: HashMap, + // Creation time of the container + #[serde(skip_serializing_if = "Option::is_none")] + pub created: Option>, + // User that created the container + #[serde(skip_serializing_if = "Option::is_none")] + pub creator: Option, } impl State { @@ -73,6 +94,8 @@ impl State { pid, bundle: bundle.to_string(), annotations: HashMap::default(), + created: None, + creator: None, } } diff --git a/src/container/tenant_builder.rs b/src/container/tenant_builder.rs new file mode 100644 index 000000000..386c3a6f0 --- /dev/null +++ b/src/container/tenant_builder.rs @@ -0,0 +1,107 @@ +use anyhow::{bail, Result}; +use oci_spec::Spec; +use std::{ + collections::HashMap, + fs, + path::{Path, PathBuf}, +}; + +use crate::{notify_socket::NotifyListener, rootless::detect_rootless, tty}; + +use super::{builder::ContainerBuilder, builder_impl::ContainerBuilderImpl}; + +/// Builder that can be used to configure the properties of a process +/// that will join an existing container sandbox +pub struct TenantContainerBuilder { + base: ContainerBuilder, + env: HashMap, + cwd: Option, + command: Vec, +} + +impl TenantContainerBuilder { + /// Generates the base configuration for a process that will join + /// an existing container sandbox from which configuration methods + /// can be chained + pub(super) fn new(builder: ContainerBuilder) -> Self { + Self { + base: builder, + env: HashMap::new(), + cwd: None, + command: vec!["sh".to_owned()], + } + } + + /// Sets environment variables for the container + pub fn with_env(mut self, env: HashMap) -> Self { + self.env = env; + self + } + + /// Sets the working directory of the container + pub fn with_cwd>(mut self, path: P) -> Self { + self.cwd = Some(path.into()); + self + } + + /// Sets the command the container will be started with + pub fn with_container_command(mut self, command: Vec) -> Self { + self.command = command; + self + } + + /// Joins an existing container + pub fn build(self) -> Result<()> { + let container_dir = 
self.lookup_container_dir()?; + let spec = self.load_init_spec(&container_dir)?; + + let notify_socket: NotifyListener = NotifyListener::new(&container_dir)?; + // convert path of root file system of the container to absolute path + let rootfs = fs::canonicalize(&spec.root.path)?; + + // if socket file path is given in commandline options, + // get file descriptors of console socket + let csocketfd = if let Some(console_socket) = &self.base.console_socket { + Some(tty::setup_console_socket(&container_dir, console_socket)?) + } else { + None + }; + + let rootless = detect_rootless(&spec)?; + + let mut builder_impl = ContainerBuilderImpl { + init: false, + syscall: self.base.syscall, + container_id: self.base.container_id, + root_path: self.base.root_path, + pid_file: self.base.pid_file, + console_socket: csocketfd, + use_systemd: false, + container_dir, + spec, + rootfs, + rootless, + notify_socket, + container: None, + }; + + builder_impl.create()?; + Ok(()) + } + + fn lookup_container_dir(&self) -> Result { + let container_dir = self.base.root_path.join(&self.base.container_id); + if !container_dir.exists() { + bail!("container {} does not exist", self.base.container_id); + } + + Ok(container_dir) + } + + fn load_init_spec(&self, container_dir: &Path) -> Result { + let spec_path = container_dir.join("config.json"); + + let spec = oci_spec::Spec::load(spec_path)?; + Ok(spec) + } +} diff --git a/src/create.rs b/src/create.rs index 3c1b19e1d..d0f6b8471 100644 --- a/src/create.rs +++ b/src/create.rs @@ -1,24 +1,9 @@ //! 
Handles the creation of a new container -use std::fs; -use std::path::{Path, PathBuf}; -use std::process; - -use anyhow::{bail, Result}; +use anyhow::Result; use clap::Clap; -use nix::sched; -use nix::unistd; -use nix::unistd::{Gid, Uid}; +use std::path::PathBuf; -use crate::cgroups; -use crate::container::{Container, ContainerStatus}; -use crate::namespaces::Namespaces; -use crate::notify_socket::NotifyListener; -use crate::process::{fork, Process}; -use crate::rootfs; -use crate::stdio::FileDescriptor; -use crate::tty; -use crate::utils; -use crate::{capabilities, command::Command}; +use crate::container::builder::ContainerBuilder; /// This is the main structure which stores various commandline options given by /// high-level container runtime @@ -45,181 +30,22 @@ pub struct Create { // associated with it like any other process. impl Create { /// Starts a new container process - pub fn exec(&self, root_path: PathBuf, command: impl Command) -> Result<()> { - // create a directory for the container to store state etc. 
- // if already present, return error - let bundle_canonicalized = fs::canonicalize(&self.bundle) - .unwrap_or_else(|_| panic!("failed to canonicalied {:?}", &self.bundle)); - let container_dir = root_path.join(&self.container_id); - log::debug!("container directory will be {:?}", container_dir); - - if !container_dir.exists() { - fs::create_dir(&container_dir).unwrap(); - } else { - bail!("{} already exists", self.container_id) - } - - // change directory to the bundle directory, and load configuration, - // copy that to the container's directory - unistd::chdir(&self.bundle)?; - - let spec = oci_spec::Spec::load("config.json")?; - fs::copy("config.json", container_dir.join("config.json"))?; - log::debug!("spec: {:?}", spec); - - // convert path to absolute path, as relative path will be evaluated - // relative to where youki command is executed, and will be difficult to manipulate - let container_dir = fs::canonicalize(container_dir)?; - unistd::chdir(&*container_dir)?; - - log::debug!("{:?}", &container_dir); - - let container = Container::new( - &self.container_id, - ContainerStatus::Creating, - None, - bundle_canonicalized.to_str().unwrap(), - &container_dir, - )?; - container.save()?; - - let mut notify_socket: NotifyListener = NotifyListener::new(&container_dir)?; - // convert path of root file system of the container to absolute path - let rootfs = fs::canonicalize(&spec.root.path)?; - - // if socket file path is given in commandline options, - // get file descriptors of console socket - let csocketfd = if let Some(console_socket) = &self.console_socket { - Some(tty::setup_console_socket(&container_dir, console_socket)?) 
- } else { - None - }; - - let process = run_container( - self.pid_file.as_ref(), - &mut notify_socket, - rootfs, - spec, - csocketfd, - container, - command, - )?; - // the run_container forks the process, so not after return if in - // parent process, exit ; as the work of creating the container is done - if let Process::Parent(_) = process { - process::exit(0); + pub fn exec(&self, root_path: PathBuf, systemd_cgroup: bool) -> Result<()> { + let mut builder = ContainerBuilder::new(self.container_id.clone()); + if let Some(pid_file) = &self.pid_file { + builder = builder.with_pid_file(pid_file); } - // if in the child process after fork, then just return - Ok(()) - } -} -/// Fork the process and actually start the container process -fn run_container>( - pid_file: Option

, - notify_socket: &mut NotifyListener, - rootfs: PathBuf, - spec: oci_spec::Spec, - csocketfd: Option, - container: Container, - command: impl Command, -) -> Result { - // disable core dump for the process, check https://man7.org/linux/man-pages/man2/prctl.2.html for more information - prctl::set_dumpable(false).unwrap(); - // get Linux specific section of OCI spec, - // refer https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md for more information - let linux = spec.linux.as_ref().unwrap(); - let namespaces: Namespaces = linux.namespaces.clone().into(); - - let cgroups_path = utils::get_cgroup_path(&linux.cgroups_path, container.id()); - let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path)?; - - // first fork, which creates process, which will later create actual container process - match fork::fork_first( - pid_file, - namespaces - .clone_flags - .contains(sched::CloneFlags::CLONE_NEWUSER), - linux, - &container, - cmanager, - )? { - // In the parent process, which called run_container - Process::Parent(parent) => Ok(Process::Parent(parent)), - // in child process - Process::Child(child) => { - // set limits and namespaces to the process - for rlimit in spec.process.rlimits.iter() { - command.set_rlimit(rlimit)? - } - command.set_id(Uid::from_raw(0), Gid::from_raw(0))?; - - let without = sched::CloneFlags::CLONE_NEWUSER; - namespaces.apply_unshare(without)?; - - // set up tty if specified - if let Some(csocketfd) = csocketfd { - tty::setup_console(csocketfd)?; - } - - // set namespaces - namespaces.apply_setns()?; - - // fork second time, which will later create container - match fork::fork_init(child)? 
{ - Process::Child(_child) => unreachable!(), - // This is actually the child process after fork - Process::Init(mut init) => { - // setup args and env vars as in the spec - let spec_args: &Vec = &spec.process.args.clone(); - let envs: &Vec = &spec.process.env.clone(); - // prepare process - init_process(spec, command, rootfs, namespaces)?; - init.ready()?; - notify_socket.wait_for_container_start()?; - // actually run the command / program to be run in container - utils::do_exec(&spec_args[0], spec_args, envs)?; - // the command / program is done executing - container.update_status(ContainerStatus::Stopped)?.save()?; - - Ok(Process::Init(init)) - } - Process::Parent(_) => unreachable!(), - } + if let Some(console_socket) = &self.console_socket { + builder = builder.with_console_socket(console_socket); } - _ => unreachable!(), - } -} -/// setup hostname, rootfs for the container process -fn init_process( - spec: oci_spec::Spec, - command: impl Command, - rootfs: PathBuf, - namespaces: Namespaces, -) -> Result<()> { - let proc = spec.process.clone(); - - command.set_hostname(&spec.hostname.as_str())?; - if spec.process.no_new_privileges { - let _ = prctl::set_no_new_privileges(true); - } + builder + .with_root_path(root_path) + .as_init(&self.bundle) + .with_systemd(systemd_cgroup) + .build()?; - rootfs::prepare_rootfs( - &spec, - &rootfs, - namespaces - .clone_flags - .contains(sched::CloneFlags::CLONE_NEWUSER), - )?; - - // change the root of filesystem of the process to the rootfs - command.pivot_rootfs(&rootfs)?; - - command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?; - capabilities::reset_effective(&command)?; - if let Some(caps) = &proc.capabilities { - capabilities::drop_privileges(&caps, &command)?; + Ok(()) } - Ok(()) } diff --git a/src/dbus/client.rs b/src/dbus/client.rs new file mode 100644 index 000000000..b0dc4afef --- /dev/null +++ b/src/dbus/client.rs @@ -0,0 +1,33 @@ +use anyhow::Result; +use dbus::blocking::Connection; +use 
std::time::Duration; +use std::vec::Vec; + +/// Client is a wrapper providing higher level API and abstraction around dbus. +/// For more information see https://www.freedesktop.org/wiki/Software/systemd/dbus/ +pub struct Client { + conn: Connection, +} + +impl Client { + pub fn new() -> Result { + let conn = Connection::new_session()?; + Ok(Client { conn }) + } + + /// start_unit starts a specific unit under systemd. See https://www.freedesktop.org/wiki/Software/systemd/dbus + /// for more details. + pub fn start_unit(&self, unit_name: &str, _properties: Vec<&str>) -> Result<()> { + let proxy = self.conn.with_proxy( + "org.freedesktop.systemd1.Manager", + "/", + Duration::from_millis(5000), + ); + let (_job_id,): (i32,) = proxy.method_call( + "org.freedesktop.systemd1.Manager", + "StartTransientUnit", + (unit_name, "replace"), + )?; + Ok(()) + } +} diff --git a/src/dbus/mod.rs b/src/dbus/mod.rs new file mode 100644 index 000000000..e99ee79b9 --- /dev/null +++ b/src/dbus/mod.rs @@ -0,0 +1,2 @@ +mod client; +pub use client::Client; diff --git a/src/delete.rs b/src/delete.rs new file mode 100644 index 000000000..0c35b6448 --- /dev/null +++ b/src/delete.rs @@ -0,0 +1,72 @@ +use std::fs; +use std::path::PathBuf; + +use anyhow::{bail, Result}; +use clap::Clap; +use nix::sys::signal::Signal; + +use crate::cgroups; +use crate::container::{Container, ContainerStatus}; +use crate::utils; +use nix::sys::signal as nix_signal; + +#[derive(Clap, Debug)] +pub struct Delete { + container_id: String, + /// forces deletion of the container if it is still running (using SIGKILL) + #[clap(short, long)] + force: bool, +} + +impl Delete { + pub fn exec(&self, root_path: PathBuf, systemd_cgroup: bool) -> Result<()> { + log::debug!("start deleting {}", self.container_id); + // state of container is stored in a directory named as container id inside + // root directory given in commandline options + let container_root = root_path.join(&self.container_id); + if !container_root.exists() { + 
bail!("{} doesn't exist.", self.container_id) + } + // load container state from json file, and check status of the container + // it might be possible that delete is invoked on a running container. + log::debug!("load the container from {:?}", container_root); + let mut container = Container::load(container_root)?.refresh_status()?; + if container.can_kill() && self.force { + let sig = Signal::SIGKILL; + log::debug!("kill signal {} to {}", sig, container.pid().unwrap()); + nix_signal::kill(container.pid().unwrap(), sig)?; + container = container.update_status(ContainerStatus::Stopped); + container.save()?; + } + log::debug!("container status: {:?}", container.status()); + if container.can_delete() { + if container.root.exists() { + let config_absolute_path = container.root.join("config.json"); + log::debug!("load spec from {:?}", config_absolute_path); + let spec = oci_spec::Spec::load(config_absolute_path)?; + log::debug!("spec: {:?}", spec); + + // remove the directory storing container state + log::debug!("remove dir {:?}", container.root); + fs::remove_dir_all(&container.root)?; + + let cgroups_path = + utils::get_cgroup_path(&spec.linux.unwrap().cgroups_path, container.id()); + + // remove the cgroup created for the container + // check https://man7.org/linux/man-pages/man7/cgroups.7.html + // creating and removing cgroups section for more information on cgroups + let cmanager = + cgroups::common::create_cgroup_manager(cgroups_path, systemd_cgroup)?; + cmanager.remove()?; + } + std::process::exit(0) + } else { + bail!( + "{} could not be deleted because it was {:?}", + container.id(), + container.status() + ) + } + } +} diff --git a/src/info.rs b/src/info.rs new file mode 100644 index 000000000..ae585565c --- /dev/null +++ b/src/info.rs @@ -0,0 +1,130 @@ +//! 
Contains functions related to printing information about system running Youki +use std::{fs, path::Path}; + +use anyhow::Result; +use clap::Clap; +use procfs::{CpuInfo, Meminfo}; + +use crate::cgroups; + +#[derive(Clap, Debug)] +pub struct Info {} + +impl Info { + pub fn exec(&self) -> Result<()> { + print_youki(); + print_kernel(); + print_os(); + print_hardware(); + print_cgroups(); + + Ok(()) + } +} + +/// print Version of Youki +pub fn print_youki() { + println!("{:<18}{}", "Version", env!("CARGO_PKG_VERSION")); +} + +/// Print Kernel Release, Version and Architecture +pub fn print_kernel() { + let uname = nix::sys::utsname::uname(); + println!("{:<18}{}", "Kernel-Release", uname.release()); + println!("{:<18}{}", "Kernel-Version", uname.version()); + println!("{:<18}{}", "Architecture", uname.machine()); +} + +/// Prints OS Distribution information +// see https://www.freedesktop.org/software/systemd/man/os-release.html +pub fn print_os() { + if let Some(os) = try_read_os_from("/etc/os-release") { + println!("{:<18}{}", "Operating System", os); + } else if let Some(os) = try_read_os_from("/usr/lib/os-release") { + println!("{:<18}{}", "Operating System", os); + } +} + +/// Helper function to read the OS Distribution info +fn try_read_os_from>(path: P) -> Option { + let os_release = path.as_ref(); + if !os_release.exists() { + return None; + } + + if let Ok(release_content) = fs::read_to_string(path) { + let pretty = find_parameter(&release_content, "PRETTY_NAME"); + + if let Some(pretty) = pretty { + return Some(pretty.trim_matches('"').to_owned()); + } + + let name = find_parameter(&release_content, "NAME"); + let version = find_parameter(&release_content, "VERSION"); + + if let (Some(name), Some(version)) = (name, version) { + return Some(format!( + "{} {}", + name.trim_matches('"'), + version.trim_matches('"') + )); + } + } + + None +} + +/// Helper function to find keyword values in OS info string +fn find_parameter<'a>(content: &'a str, param_name: &str) 
-> Option<&'a str> { + let param_value = content + .lines() + .find(|l| l.starts_with(param_name)) + .map(|l| l.split_terminator('=').last()); + + if let Some(Some(value)) = param_value { + return Some(value); + } + + None +} + +/// Print Hardware information of system +pub fn print_hardware() { + if let Ok(cpu_info) = CpuInfo::new() { + println!("{:<18}{}", "Cores", cpu_info.num_cores()); + } + + if let Ok(mem_info) = Meminfo::new() { + println!( + "{:<18}{}", + "Total Memory", + mem_info.mem_total / u64::pow(1024, 2) + ); + } +} + +/// Print cgroups info of system +pub fn print_cgroups() { + if let Ok(cgroup_fs) = cgroups::common::get_supported_cgroup_fs() { + let cgroup_fs: Vec = cgroup_fs.into_iter().map(|c| c.to_string()).collect(); + println!("{:<18}{}", "cgroup version", cgroup_fs.join(" and ")); + } + + println!("cgroup mounts"); + if let Ok(v1_mounts) = cgroups::v1::util::list_subsystem_mount_points() { + let mut v1_mounts: Vec = v1_mounts + .iter() + .map(|kv| format!(" {:<16}{}", kv.0, kv.1.display())) + .collect(); + + v1_mounts.sort(); + for cgroup_mount in v1_mounts { + println!("{}", cgroup_mount); + } + } + + let unified = cgroups::v2::util::get_unified_mount_point(); + if let Ok(mount_point) = unified { + println!(" {:<16}{}", "unified", mount_point.display()); + } +} diff --git a/src/kill.rs b/src/kill.rs new file mode 100644 index 000000000..734d65b17 --- /dev/null +++ b/src/kill.rs @@ -0,0 +1,46 @@ +use std::{fs, path::PathBuf}; + +use anyhow::{bail, Result}; +use clap::Clap; +use nix::sys::signal as nix_signal; + +use crate::{ + container::{Container, ContainerStatus}, + signal::ToSignal, +}; + +#[derive(Clap, Debug)] +pub struct Kill { + container_id: String, + signal: String, +} + +impl Kill { + pub fn exec(&self, root_path: PathBuf) -> Result<()> { + // resolves relative paths, symbolic links etc. 
and get complete path + let root_path = fs::canonicalize(root_path)?; + // state of container is stored in a directory named as container id inside + // root directory given in commandline options + let container_root = root_path.join(&self.container_id); + if !container_root.exists() { + bail!("{} doesn't exist.", self.container_id) + } + + // load container state from json file, and check status of the container + // it might be possible that kill is invoked on an already stopped container etc. + let container = Container::load(container_root)?.refresh_status()?; + if container.can_kill() { + let sig = self.signal.to_signal()?; + log::debug!("kill signal {} to {}", sig, container.pid().unwrap()); + nix_signal::kill(container.pid().unwrap(), sig)?; + container.update_status(ContainerStatus::Stopped).save()?; + std::process::exit(0) + } else { + bail!( + "{} could not be killed because it was {:?}", + container.id(), + container.status() + ) + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 98be65394..da3c23572 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,14 +7,21 @@ pub mod cgroups; pub mod command; pub mod container; pub mod create; +pub mod dbus; +pub mod delete; +pub mod info; +pub mod kill; +pub mod list; pub mod logger; pub mod namespaces; pub mod notify_socket; pub mod pipe; pub mod process; pub mod rootfs; +pub mod rootless; pub mod signal; pub mod start; +pub mod state; pub mod stdio; pub mod tty; pub mod utils; diff --git a/src/list.rs b/src/list.rs new file mode 100644 index 000000000..e99fd0245 --- /dev/null +++ b/src/list.rs @@ -0,0 +1,67 @@ +use std::ffi::OsString; +use std::fs; +use std::io; +use std::io::Write; +use std::path::PathBuf; + +use anyhow::Result; +use chrono::{DateTime, Local}; +use clap::Clap; +use tabwriter::TabWriter; + +use crate::container::Container; + +#[derive(Clap, Debug)] +pub struct List {} + +impl List { + pub fn exec(&self, root_path: PathBuf) -> Result<()> { + let root_path = fs::canonicalize(root_path)?; + let mut 
content = String::new(); + + for container_dir in fs::read_dir(root_path)? { + let container_dir = container_dir?.path(); + let state_file = container_dir.join("state.json"); + if !state_file.exists() { + continue; + } + + let container = Container::load(container_dir)?.refresh_status()?; + let pid = if let Some(pid) = container.pid() { + pid.to_string() + } else { + "".to_owned() + }; + + let user_name = if let Some(creator) = container.creator() { + creator + } else { + OsString::new() + }; + + let created = if let Some(utc) = container.created() { + let local: DateTime = DateTime::from(utc); + local.to_rfc3339_opts(chrono::SecondsFormat::Secs, false) + } else { + "".to_owned() + }; + + content.push_str(&format!( + "{}\t{}\t{}\t{}\t{}\t{}\n", + container.id(), + pid, + container.status(), + container.bundle(), + created, + user_name.to_string_lossy() + )); + } + + let mut tab_writer = TabWriter::new(io::stdout()); + writeln!(&mut tab_writer, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tCREATOR")?; + write!(&mut tab_writer, "{}", content)?; + tab_writer.flush()?; + + Ok(()) + } +} diff --git a/src/main.rs b/src/main.rs index 04ee7102e..c3eebf2dd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,20 +3,19 @@ //! This crate provides a container runtime which can be used by a high-level container runtime to run containers. 
use std::fs; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; -use anyhow::{bail, Result}; +use anyhow::Result; use clap::Clap; -use nix::sys::signal as nix_signal; -use youki::command::linux::LinuxCommand; -use youki::container::{Container, ContainerStatus}; use youki::create; -use youki::signal; +use youki::delete; +use youki::info; +use youki::kill; +use youki::list; +use youki::rootless::should_use_rootless; use youki::start; - -use youki::cgroups; -use youki::utils; +use youki::state; /// High-level commandline option definition /// This takes global options as well as individual commands as specified in [OCI runtime-spec](https://github.com/opencontainers/runtime-spec/blob/master/runtime.md) @@ -31,27 +30,14 @@ struct Opts { log: Option, #[clap(long)] log_format: Option, + /// Enable systemd cgroup manager, rather than use the cgroupfs directly. + #[clap(short, long)] + systemd_cgroup: bool, /// command to actually manage container #[clap(subcommand)] subcmd: SubCommand, } -#[derive(Clap, Debug)] -pub struct Kill { - container_id: String, - signal: String, -} - -#[derive(Clap, Debug)] -pub struct Delete { - container_id: String, -} - -#[derive(Clap, Debug)] -pub struct StateArgs { - pub container_id: String, -} - /// Subcommands accepted by Youki, confirming with [OCI runtime-spec](https://github.com/opencontainers/runtime-spec/blob/master/runtime.md) /// Also for a short information, check [runc commandline documentation](https://github.com/opencontainers/runc/blob/master/man/runc.8.md) #[derive(Clap, Debug)] @@ -61,13 +47,15 @@ enum SubCommand { #[clap(version = "0.0.1", author = "utam0k ")] Start(start::Start), #[clap(version = "0.0.1", author = "utam0k ")] - Kill(Kill), + Kill(kill::Kill), + #[clap(version = "0.0.1", author = "utam0k ")] + Delete(delete::Delete), #[clap(version = "0.0.1", author = "utam0k ")] - Delete(Delete), + State(state::State), #[clap(version = "0.0.1", author = "utam0k ")] - State(StateArgs), + Info(info::Info), 
#[clap(version = "0.0.1", author = "utam0k ")] - Info, + List(list::List), } /// This is the entry point in the container runtime. The binary is run by a high-level container runtime, @@ -79,131 +67,22 @@ fn main() -> Result<()> { eprintln!("log init failed: {:?}", e); } - let root_path = PathBuf::from(&opts.root); + let root_path = if should_use_rootless() && opts.root.eq(&PathBuf::from("/run/youki")) { + PathBuf::from("/tmp/rootless") + } else { + PathBuf::from(&opts.root) + }; fs::create_dir_all(&root_path)?; + let systemd_cgroup = opts.systemd_cgroup; + match opts.subcmd { - SubCommand::Create(create) => create.exec(root_path, LinuxCommand), + SubCommand::Create(create) => create.exec(root_path, systemd_cgroup), SubCommand::Start(start) => start.exec(root_path), - SubCommand::Kill(kill) => { - // resolves relative paths, symbolic links etc. and get complete path - let root_path = fs::canonicalize(root_path)?; - // state of container is stored in a directory named as container id inside - // root directory given in commandline options - let container_root = root_path.join(&kill.container_id); - if !container_root.exists() { - bail!("{} doesn't exist.", kill.container_id) - } - - // load container state from json file, and check status of the container - // it might be possible that kill is invoked on a already stopped container etc. 
- let container = Container::load(container_root)?.refresh_status()?; - if container.can_kill() { - let sig = signal::from_str(kill.signal.as_str())?; - log::debug!("kill signal {} to {}", sig, container.pid().unwrap()); - nix_signal::kill(container.pid().unwrap(), sig)?; - container.update_status(ContainerStatus::Stopped)?.save()?; - std::process::exit(0) - } else { - bail!( - "{} could not be killed because it was {:?}", - container.id(), - container.status() - ) - } - } - SubCommand::Delete(delete) => { - log::debug!("start deleting {}", delete.container_id); - // state of container is stored in a directory named as container id inside - // root directory given in commandline options - let container_root = root_path.join(&delete.container_id); - if !container_root.exists() { - bail!("{} doesn't exist.", delete.container_id) - } - // load container state from json file, and check status of the container - // it might be possible that delete is invoked on a running container. - log::debug!("load the container from {:?}", container_root); - let container = Container::load(container_root)?.refresh_status()?; - if container.can_delete() { - if container.root.exists() { - nix::unistd::chdir(&PathBuf::from(&container.state.bundle))?; - let config_absolute_path = &PathBuf::from(&container.state.bundle) - .join(Path::new("config.json")) - .to_string_lossy() - .to_string(); - log::debug!("load spec from {:?}", config_absolute_path); - let spec = oci_spec::Spec::load(config_absolute_path)?; - log::debug!("spec: {:?}", spec); - - // remove the directory storing container state - log::debug!("remove dir {:?}", container.root); - fs::remove_dir_all(&container.root)?; - - let cgroups_path = - utils::get_cgroup_path(&spec.linux.unwrap().cgroups_path, container.id()); - - // remove the cgroup created for the container - // check https://man7.org/linux/man-pages/man7/cgroups.7.html - // creating and removing cgroups section for more information on cgroups - let cmanager = 
cgroups::common::create_cgroup_manager(cgroups_path)?; - cmanager.remove()?; - } - std::process::exit(0) - } else { - bail!( - "{} could not be deleted because it was {:?}", - container.id(), - container.status() - ) - } - } - SubCommand::State(state_args) => { - let root_path = fs::canonicalize(root_path)?; - let container_root = root_path.join(state_args.container_id); - let container = Container::load(container_root)?.refresh_status()?; - println!("{}", serde_json::to_string_pretty(&container.state)?); - std::process::exit(0); - } - - SubCommand::Info => { - let uname = nix::sys::utsname::uname(); - println!("{:<18}{}", "Kernel-Release", uname.release()); - println!("{:<18}{}", "Kernel-Version", uname.version()); - println!("{:<18}{}", "Architecture", uname.machine()); - - let cpu_info = procfs::CpuInfo::new()?; - println!("{:<18}{}", "Cores", cpu_info.num_cores()); - let mem_info = procfs::Meminfo::new()?; - println!( - "{:<18}{}", - "Total Memory", - mem_info.mem_total / u64::pow(1024, 2) - ); - - let cgroup_fs: Vec = cgroups::common::get_supported_cgroup_fs()? - .into_iter() - .map(|c| c.to_string()) - .collect(); - println!("{:<18}{}", "cgroup version", cgroup_fs.join(" and ")); - - println!("cgroup mounts"); - let mut cgroup_v1_mounts: Vec = - cgroups::v1::util::list_subsystem_mount_points()? 
- .iter() - .map(|kv| format!(" {:<16}{:?}", kv.0, kv.1)) - .collect(); - - cgroup_v1_mounts.sort(); - for cgroup_mount in cgroup_v1_mounts { - println!("{}", cgroup_mount); - } - - let unified = cgroups::v2::util::get_unified_mount_point(); - if let Ok(mount_point) = unified { - println!(" {:<16}{:?}", "unified", mount_point); - } - - Ok(()) - } + SubCommand::Kill(kill) => kill.exec(root_path), + SubCommand::Delete(delete) => delete.exec(root_path, systemd_cgroup), + SubCommand::State(state) => state.exec(root_path), + SubCommand::Info(info) => info.exec(), + SubCommand::List(list) => list.exec(root_path), } } diff --git a/src/namespaces.rs b/src/namespaces.rs index 4fffea80e..1f2361652 100644 --- a/src/namespaces.rs +++ b/src/namespaces.rs @@ -15,12 +15,12 @@ use nix::{ unistd::{self, Gid, Uid}, }; -use crate::command::{linux::LinuxCommand, test::TestHelperCommand, Command}; -use oci_spec::{LinuxNamespace, LinuxNamespaceType}; +use crate::command::{syscall::create_syscall, Syscall}; +use oci_spec::LinuxNamespace; pub struct Namespaces { spaces: Vec, - command: Box, + command: Box, pub clone_flags: CloneFlags, } @@ -33,11 +33,7 @@ impl From> for Namespaces { cf }, ); - let command: Box = if cfg!(test) { - Box::new(TestHelperCommand::default()) - } else { - Box::new(LinuxCommand) - }; + let command: Box = create_syscall(); Namespaces { spaces: namespaces, @@ -80,10 +76,13 @@ impl Namespaces { } } +#[cfg(test)] mod tests { + use oci_spec::LinuxNamespaceType; + use super::*; + use crate::command::test::TestHelperSyscall; - #[allow(dead_code)] fn gen_sample_linux_namespaces() -> Vec { vec![ LinuxNamespace { @@ -113,7 +112,7 @@ mod tests { fn test_namespaces_set_ns() { let sample_linux_namespaces = gen_sample_linux_namespaces(); let namespaces: Namespaces = sample_linux_namespaces.into(); - let test_command: &TestHelperCommand = namespaces.command.as_any().downcast_ref().unwrap(); + let test_command: &TestHelperSyscall = 
namespaces.command.as_any().downcast_ref().unwrap(); assert!(namespaces.apply_setns().is_ok()); let mut setns_args: Vec<_> = test_command @@ -133,7 +132,7 @@ mod tests { let namespaces: Namespaces = sample_linux_namespaces.into(); assert!(namespaces.apply_unshare(CloneFlags::CLONE_NEWIPC).is_ok()); - let test_command: &TestHelperCommand = namespaces.command.as_any().downcast_ref().unwrap(); + let test_command: &TestHelperSyscall = namespaces.command.as_any().downcast_ref().unwrap(); let mut unshare_args = test_command.get_unshare_args(); unshare_args.sort(); let mut expect = vec![CloneFlags::CLONE_NEWUSER | CloneFlags::CLONE_NEWPID]; diff --git a/src/process/child.rs b/src/process/child.rs index 2bdca4931..65db39921 100644 --- a/src/process/child.rs +++ b/src/process/child.rs @@ -1,6 +1,5 @@ use std::io::ErrorKind; use std::io::Read; -use std::io::Write; use anyhow::{bail, Result}; use mio::unix::pipe; @@ -9,6 +8,7 @@ use mio::unix::pipe::Sender; use mio::{Events, Interest, Poll, Token}; use nix::unistd::Pid; +use super::parent::ParentChannel; use super::{MAX_EVENTS, WAIT_FOR_INIT}; use crate::process::message::Message; @@ -18,7 +18,7 @@ const CHILD: Token = Token(1); /// Contains sending end of pipe for parent process, receiving end of pipe /// for the init process and poller for that pub struct ChildProcess { - sender_for_parent: Sender, + parent_channel: ParentChannel, receiver: Option, poll: Option, } @@ -29,9 +29,9 @@ pub struct ChildProcess { // a process point of view, init process is child of child process, which is child of original youki process. 
impl ChildProcess { /// create a new Child process structure - pub fn new(sender_for_parent: Sender) -> Result { + pub fn new(parent_channel: ParentChannel) -> Result { Ok(Self { - sender_for_parent, + parent_channel, receiver: None, poll: None, }) @@ -55,24 +55,17 @@ impl ChildProcess { /// Indicate that child process has forked the init process to parent process pub fn notify_parent(&mut self, init_pid: Pid) -> Result<()> { - log::debug!( - "child send to parent {:?}", - (Message::ChildReady as u8).to_be_bytes() - ); - // write ChildReady message to the pipe to parent - self.write_message_for_parent(Message::ChildReady)?; - // write pid of init process which is forked by child process to the pipe, - // Pid in nix::unistd is type alias of SessionId which itself is alias of i32 - self.sender_for_parent - .write_all(&(init_pid.as_raw()).to_be_bytes())?; + self.parent_channel.send_init_pid(init_pid)?; + Ok(()) + } + + pub fn request_identifier_mapping(&mut self) -> Result<()> { + self.parent_channel.request_identifier_mapping()?; Ok(()) } - /// writes given message to pipe for the parent - #[inline] - fn write_message_for_parent(&mut self, msg: Message) -> Result<()> { - self.sender_for_parent - .write_all(&(msg as u8).to_be_bytes())?; + pub fn wait_for_mapping_ack(&mut self) -> Result<()> { + self.parent_channel.wait_for_mapping_ack()?; Ok(()) } diff --git a/src/process/fork.rs b/src/process/fork.rs index 9689dacff..915f802cd 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -14,25 +14,23 @@ use nix::unistd; use nix::unistd::Pid; use crate::cgroups::common::CgroupManager; +use crate::container::Container; use crate::container::ContainerStatus; use crate::process::{child, init, parent, Process}; -use crate::{container::Container, pipe::Pipe}; +use crate::rootless::Rootless; /// Function to perform the first fork for in order to run the container process pub fn fork_first>( - pid_file: Option

, - is_userns: bool, + pid_file: &Option

, + rootless: &Option, linux: &oci_spec::Linux, - container: &Container, + container: Option<&Container>, cmanager: Box, ) -> Result { - // create a new pipe - let cpipe = Pipe::new()?; - // create new parent process structure - let (mut parent, sender_for_parent) = parent::ParentProcess::new()?; + let (mut parent, parent_channel) = parent::ParentProcess::new(rootless.clone())?; // create a new child process structure with sending end of parent process - let child = child::ChildProcess::new(sender_for_parent)?; + let mut child = child::ChildProcess::new(parent_channel)?; // fork the process match unsafe { unistd::fork()? } { @@ -51,27 +49,39 @@ pub fn fork_first>( // if new user is specified in specification, this will be true // and new namespace will be created, check https://man7.org/linux/man-pages/man7/user_namespaces.7.html // for more information - if is_userns { + if rootless.is_some() { + log::debug!("creating new user namespace"); sched::unshare(sched::CloneFlags::CLONE_NEWUSER)?; + + // child needs to be dumpable, otherwise the non root parent is not + // allowed to write the uid/gid maps + prctl::set_dumpable(true).unwrap(); + child.request_identifier_mapping()?; + child.wait_for_mapping_ack()?; + prctl::set_dumpable(false).unwrap(); } - cpipe.notify()?; Ok(Process::Child(child)) } // in the parent process unistd::ForkResult::Parent { child } => { - cpipe.wait()?; - // wait for child to fork init process and report back its pid - let init_pid = parent.wait_for_child_ready()?; + let init_pid = parent.wait_for_child_ready(child)?; log::debug!("init pid is {:?}", init_pid); - cmanager.apply(&linux.resources.as_ref().unwrap(), Pid::from_raw(init_pid))?; + if rootless.is_none() && linux.resources.is_some() { + cmanager.add_task(Pid::from_raw(init_pid))?; + cmanager.apply(&linux.resources.as_ref().unwrap())?; + } + + if let Some(container) = container { + // update status and pid of the container process + container + .update_status(ContainerStatus::Created) 
+ .set_creator(nix::unistd::geteuid().as_raw()) + .set_pid(init_pid) + .save()?; + } - // update status and pid of the container process - container - .update_status(ContainerStatus::Created)? - .set_pid(init_pid) - .save()?; // if file to write the pid to is specified, write pid of the child if let Some(pid_file) = pid_file { fs::write(&pid_file, format!("{}", child))?; diff --git a/src/process/init.rs b/src/process/init.rs index 42a015927..4615cc4c1 100644 --- a/src/process/init.rs +++ b/src/process/init.rs @@ -1,9 +1,15 @@ -use std::io::Write; +use std::{io::Write, path::PathBuf}; use anyhow::Result; use mio::unix::pipe::Sender; +use nix::{ + sched, + unistd::{Gid, Uid}, +}; -use crate::process::message::Message; +use crate::{ + capabilities, command::Syscall, namespaces::Namespaces, process::message::Message, rootfs, +}; /// Contains sending end for pipe for the child process pub struct InitProcess { @@ -36,3 +42,36 @@ impl InitProcess { Ok(()) } } + +/// setup hostname, rootfs for the container process +pub fn setup_init_process( + spec: &oci_spec::Spec, + command: &impl Syscall, + rootfs: PathBuf, + namespaces: &Namespaces, +) -> Result<()> { + let proc = &spec.process; + + command.set_hostname(spec.hostname.as_str())?; + if proc.no_new_privileges { + let _ = prctl::set_no_new_privileges(true); + } + + rootfs::prepare_rootfs( + &spec, + &rootfs, + namespaces + .clone_flags + .contains(sched::CloneFlags::CLONE_NEWUSER), + )?; + + // change the root of filesystem of the process to the rootfs + command.pivot_rootfs(&rootfs)?; + + command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?; + capabilities::reset_effective(command)?; + if let Some(caps) = &proc.capabilities { + capabilities::drop_privileges(&caps, command)?; + } + Ok(()) +} diff --git a/src/process/message.rs b/src/process/message.rs index fddf09ab9..386b4fb77 100644 --- a/src/process/message.rs +++ b/src/process/message.rs @@ -3,6 +3,8 @@ pub enum Message { ChildReady = 0x00, 
InitReady = 0x01, + WriteMapping = 0x02, + MappingWritten = 0x03, } impl From for Message { @@ -10,6 +12,8 @@ impl From for Message { match from { 0x00 => Message::ChildReady, 0x01 => Message::InitReady, + 0x02 => Message::WriteMapping, + 0x03 => Message::MappingWritten, _ => panic!("unknown message."), } } diff --git a/src/process/mod.rs b/src/process/mod.rs index c64fea8bf..65b1b907a 100644 --- a/src/process/mod.rs +++ b/src/process/mod.rs @@ -10,7 +10,7 @@ mod child; mod init; mod parent; -pub use init::InitProcess; +pub use init::{setup_init_process, InitProcess}; /// Used to describe type of process after fork. /// Parent and child processes mean the same thing as in a normal fork call @@ -26,3 +26,5 @@ const MAX_EVENTS: usize = 128; const WAIT_FOR_CHILD: Duration = Duration::from_secs(5); /// Time to wait when polling for message from init process const WAIT_FOR_INIT: Duration = Duration::from_millis(1000); +/// Time to wait when polling for mapping ack from parent +const WAIT_FOR_MAPPING: Duration = Duration::from_secs(3); diff --git a/src/process/parent.rs b/src/process/parent.rs index 3ff941794..bd1fe6d2f 100644 --- a/src/process/parent.rs +++ b/src/process/parent.rs @@ -1,89 +1,252 @@ use std::io::ErrorKind; use std::io::Read; +use std::io::Write; +use std::path::Path; +use std::process::Command; use super::{MAX_EVENTS, WAIT_FOR_CHILD}; use crate::process::message::Message; +use crate::process::WAIT_FOR_MAPPING; +use crate::rootless::Rootless; +use crate::utils; +use anyhow::Context; use anyhow::{bail, Result}; use mio::unix::pipe; use mio::unix::pipe::{Receiver, Sender}; use mio::{Events, Interest, Poll, Token}; +use nix::unistd::Pid; +use oci_spec::LinuxIdMapping; // Token is used to identify which socket generated an event const PARENT: Token = Token(0); /// Contains receiving end of pipe to child process and a poller for that. 
pub struct ParentProcess { - receiver: Receiver, - poll: Poll, + child_channel: ChildChannel, } // Poll is used to register and listen for various events // by registering it with an event source such as receiving end of a pipe impl ParentProcess { /// Create new Parent process structure - pub fn new() -> Result<(Self, Sender)> { - // create a new pipe - let (sender, mut receiver) = pipe::new()?; - // create a new poll, and register the receiving end of pipe to it - // This will poll for the read events, so when data is written to sending end of the pipe, - // the receiving end will be readable and poll wil notify + pub fn new(rootless: Option) -> Result<(Self, ParentChannel)> { + let (parent_channel, child_channel) = Self::setup_pipes(rootless)?; + let parent = Self { child_channel }; + + Ok((parent, parent_channel)) + } + + fn setup_pipes(rootless: Option) -> Result<(ParentChannel, ChildChannel)> { + let (send_to_parent, receive_from_child) = pipe::new()?; + let (send_to_child, receive_from_parent) = pipe::new()?; + + let parent_channel = ParentChannel::new(send_to_parent, receive_from_parent)?; + let child_channel = ChildChannel::new(send_to_child, receive_from_child, rootless)?; + + Ok((parent_channel, child_channel)) + } + + /// Waits for associated child process to send ready message + /// and return the pid of init process which is forked by child process + pub fn wait_for_child_ready(&mut self, child_pid: Pid) -> Result { + let init_pid = self.child_channel.wait_for_child_ready(child_pid)?; + Ok(init_pid) + } +} + +// Channel for communicating with the parent +pub struct ParentChannel { + sender: Sender, + receiver: Receiver, + poll: Poll, +} + +impl ParentChannel { + fn new(sender: Sender, mut receiver: Receiver) -> Result { let poll = Poll::new()?; poll.registry() .register(&mut receiver, PARENT, Interest::READABLE)?; - Ok((Self { receiver, poll }, sender)) + Ok(Self { + sender, + receiver, + poll, + }) } - /// Waits for associated child process to send 
ready message - /// and return the pid of init process which is forked by child process - pub fn wait_for_child_ready(&mut self) -> Result { - // Create collection with capacity to store up to MAX_EVENTS events + pub fn send_init_pid(&mut self, pid: Pid) -> Result<()> { + // write ChildReady message to the pipe to parent + log::debug!("[child to parent] sending init pid ({:?})", pid); + self.write_message(Message::ChildReady)?; + // write pid of init process which is forked by child process to the pipe, + // Pid in nix::unistd is type alias of SessionId which itself is alias of i32 + self.sender.write_all(&(pid.as_raw()).to_be_bytes())?; + Ok(()) + } + + // requests the parent to write the id mappings for the child process + // this needs to be done from the parent see https://man7.org/linux/man-pages/man7/user_namespaces.7.html + pub fn request_identifier_mapping(&mut self) -> Result<()> { + log::debug!("[child to parent] request identifier mapping"); + self.write_message(Message::WriteMapping)?; + Ok(()) + } + + // wait until the parent process has finished writing the id mappings + pub fn wait_for_mapping_ack(&mut self) -> Result<()> { let mut events = Events::with_capacity(MAX_EVENTS); + log::debug!("waiting for ack from parent"); - // poll the receiving end of pipe created for WAIT_FOR_CHILD duration for an event - self.poll.poll(&mut events, Some(WAIT_FOR_CHILD))?; + self.poll.poll(&mut events, Some(WAIT_FOR_MAPPING))?; for event in events.iter() { - // check if the event token in PARENT - // note that this does not assign anything to PARENT, but instead compares PARENT and event.token() - // check http://patshaughnessy.net/2018/1/18/learning-rust-if-let-vs--match for a bit more detailed explanation - if let PARENT = event.token() { - // read data from pipe + if event.token() == PARENT { let mut buf = [0; 1]; match self.receiver.read_exact(&mut buf) { - // This error simply means that there are no more incoming connections waiting to be accepted at this 
point. Err(ref e) if e.kind() == ErrorKind::WouldBlock => (), Err(e) => bail!( "Failed to receive a message from the child process. {:?}", e ), _ => (), - }; - // convert to Message wrapper + } + match Message::from(u8::from_be_bytes(buf)) { - Message::ChildReady => { - // read pid of init process forked by child, 4 bytes as the type is i32 - let mut buf = [0; 4]; - match self.receiver.read_exact(&mut buf) { - // This error simply means that there are no more incoming connections waiting to be accepted at this point. - Err(ref e) if e.kind() == ErrorKind::WouldBlock => (), - Err(e) => bail!( - "Failed to receive a message from the child process. {:?}", - e - ), - _ => (), + Message::MappingWritten => return Ok(()), + msg => bail!("receive unexpected message {:?} in child process", msg), + } + } + } + unreachable!("timed out waiting for mapping ack from parent") + } + + #[inline] + fn write_message(&mut self, msg: Message) -> Result<()> { + self.sender.write_all(&(msg as u8).to_be_bytes())?; + Ok(()) + } +} + +struct ChildChannel { + sender: Sender, + receiver: Receiver, + poll: Poll, + rootless: Option, +} + +impl ChildChannel { + fn new(sender: Sender, mut receiver: Receiver, rootless: Option) -> Result { + let poll = Poll::new()?; + poll.registry() + .register(&mut receiver, PARENT, Interest::READABLE)?; + Ok(Self { + sender, + receiver, + poll, + rootless, + }) + } + + /// Waits for associated child process to send ready message + /// and return the pid of init process which is forked by child process + pub fn wait_for_child_ready(&mut self, child_pid: Pid) -> Result { + // Create collection with capacity to store up to MAX_EVENTS events + let mut events = Events::with_capacity(MAX_EVENTS); + loop { + // poll the receiving end of pipe created for WAIT_FOR_CHILD duration for an event + self.poll.poll(&mut events, Some(WAIT_FOR_CHILD))?; + for event in events.iter() { + // check if the event token in PARENT + // note that this does not assign anything to PARENT, 
but instead compares PARENT and event.token() + // check http://patshaughnessy.net/2018/1/18/learning-rust-if-let-vs--match for a bit more detailed explanation + if let PARENT = event.token() { + // read data from pipe + let mut buf = [0; 1]; + match self.receiver.read_exact(&mut buf) { + // This error simply means that there are no more incoming connections waiting to be accepted at this point. + Err(ref e) if e.kind() == ErrorKind::WouldBlock => { + break; + } + Err(e) => bail!( + "Failed to receive a message from the child process. {:?}", + e + ), + _ => (), + }; + // convert to Message wrapper + match Message::from(u8::from_be_bytes(buf)) { + Message::ChildReady => { + log::debug!("received child ready message"); + // read pid of init process forked by child, 4 bytes as the type is i32 + let mut buf = [0; 4]; + match self.receiver.read_exact(&mut buf) { + // This error simply means that there are no more incoming connections waiting to be accepted at this point. + Err(ref e) if e.kind() == ErrorKind::WouldBlock => (), + Err(e) => bail!( + "Failed to receive a message from the child process. 
{:?}", + e + ), + _ => (), + } + return Ok(i32::from_be_bytes(buf)); } - return Ok(i32::from_be_bytes(buf)); + Message::WriteMapping => { + log::debug!("write mapping for pid {:?}", child_pid); + utils::write_file(format!("/proc/{}/setgroups", child_pid), "deny")?; + self.write_uid_mapping(child_pid)?; + self.write_gid_mapping(child_pid)?; + self.notify_mapping_written()?; + } + msg => bail!("receive unexpected message {:?} in parent process", msg), } - msg => bail!("receive unexpected message {:?} in parent process", msg), + } else { + // as the poll is registered with only parent token + unreachable!() } - } else { - // as the poll is registered with only parent token - unreachable!() } } - // should not reach here, as there should be a ready event from child within WAIT_FOR_CHILD duration - unreachable!( - "No message received from child process within {} seconds", - WAIT_FOR_CHILD.as_secs() - ); } + + fn notify_mapping_written(&mut self) -> Result<()> { + self.sender + .write_all(&(Message::MappingWritten as u8).to_be_bytes())?; + Ok(()) + } + + fn write_uid_mapping(&self, target_pid: Pid) -> Result<()> { + let rootless = self.rootless.as_ref().unwrap(); + write_id_mapping( + &format!("/proc/{}/uid_map", target_pid), + &rootless.uid_mappings, + rootless.newuidmap.as_deref(), + ) + } + + fn write_gid_mapping(&self, target_pid: Pid) -> Result<()> { + let rootless = self.rootless.as_ref().unwrap(); + write_id_mapping( + &format!("/proc/{}/gid_map", target_pid), + &rootless.gid_mappings, + rootless.newgidmap.as_deref(), + ) + } +} + +fn write_id_mapping( + map_file: &str, + mappings: &[LinuxIdMapping], + map_binary: Option<&Path>, +) -> Result<()> { + let mappings: Vec = mappings + .iter() + .map(|m| format!("{} {} {}", m.container_id, m.host_id, m.size)) + .collect(); + if mappings.len() == 1 { + utils::write_file(map_file, mappings.first().unwrap())?; + } else { + Command::new(map_binary.unwrap()) + .args(mappings) + .output() + .with_context(|| format!("failed 
to execute {:?}", map_binary))?; + } + + Ok(()) } diff --git a/src/rootless.rs b/src/rootless.rs new file mode 100644 index 000000000..3841260da --- /dev/null +++ b/src/rootless.rs @@ -0,0 +1,139 @@ +use std::{env, path::PathBuf}; + +use anyhow::{bail, Result}; +use nix::sched::CloneFlags; +use oci_spec::{Linux, LinuxIdMapping, Mount, Spec}; + +use crate::namespaces::Namespaces; + +#[derive(Debug, Clone)] +pub struct Rootless { + /// Location of the newuidmap binary + pub newuidmap: Option, + /// Location of the newgidmap binary + pub newgidmap: Option, + /// Mappings for user ids + pub uid_mappings: Vec, + /// Mappings for group ids + pub gid_mappings: Vec, +} + +impl From<&Linux> for Rootless { + fn from(linux: &Linux) -> Self { + Self { + newuidmap: None, + newgidmap: None, + uid_mappings: linux.uid_mappings.clone(), + gid_mappings: linux.gid_mappings.clone(), + } + } +} + +pub fn detect_rootless(spec: &Spec) -> Result> { + let linux = spec.linux.as_ref().unwrap(); + + let rootless = if should_use_rootless() { + log::debug!("rootless container should be created"); + log::warn!( + "resource constraints and multi id mapping is unimplemented for rootless containers" + ); + validate(spec)?; + let mut rootless = Rootless::from(linux); + if let Some((uid_binary, gid_binary)) = lookup_map_binaries(linux)? 
{ + rootless.newuidmap = Some(uid_binary); + rootless.newgidmap = Some(gid_binary); + } + Some(rootless) + } else { + None + }; + + Ok(rootless) +} + +/// Checks if rootless mode should be used +pub fn should_use_rootless() -> bool { + if !nix::unistd::geteuid().is_root() { + return true; + } + + if let Ok("true") = std::env::var("YOUKI_USE_ROOTLESS").as_deref() { + return true; + } + + false +} + +/// Validates that the spec contains the required information for +/// running in rootless mode +pub fn validate(spec: &Spec) -> Result<()> { + let linux = spec.linux.as_ref().unwrap(); + + if linux.uid_mappings.is_empty() { + bail!("rootless containers require at least one uid mapping"); + } + + if linux.gid_mappings.is_empty() { + bail!("rootless containers require at least one gid mapping") + } + + let namespaces: Namespaces = linux.namespaces.clone().into(); + if !namespaces.clone_flags.contains(CloneFlags::CLONE_NEWUSER) { + bail!("rootless containers require the specification of a user namespace"); + } + + validate_mounts(&spec.mounts, &linux.uid_mappings, &linux.gid_mappings)?; + + Ok(()) +} + +fn validate_mounts( + mounts: &[Mount], + uid_mappings: &[LinuxIdMapping], + gid_mappings: &[LinuxIdMapping], +) -> Result<()> { + for mount in mounts { + for opt in &mount.options { + if opt.starts_with("uid=") && !is_id_mapped(&opt[4..], uid_mappings)? { + bail!("Mount {:?} specifies option {} which is not mapped inside the rootless container", mount, opt); + } + + if opt.starts_with("gid=") && !is_id_mapped(&opt[4..], gid_mappings)? 
{ + bail!("Mount {:?} specifies option {} which is not mapped inside the rootless container", mount, opt); + } + } + } + + Ok(()) +} + +fn is_id_mapped(id: &str, mappings: &[LinuxIdMapping]) -> Result { + let id = id.parse::()?; + Ok(mappings + .iter() + .any(|m| id >= m.container_id && id <= m.container_id + m.size)) +} + +/// Looks up the location of the newuidmap and newgidmap binaries which +/// are required to write multiple user/group mappings +pub fn lookup_map_binaries(spec: &Linux) -> Result> { + if spec.uid_mappings.len() == 1 && spec.uid_mappings.len() == 1 { + return Ok(None); + } + + let uidmap = lookup_map_binary("newuidmap")?; + let gidmap = lookup_map_binary("newgidmap")?; + + match (uidmap, gidmap) { + (Some(newuidmap), Some(newgidmap)) => Ok(Some((newuidmap, newgidmap))), + _ => bail!("newuidmap/newgidmap binaries could not be found in path. This is required if multiple id mappings are specified"), + } +} + +fn lookup_map_binary(binary: &str) -> Result> { + let paths = env::var("PATH")?; + Ok(paths + .split_terminator(':') + .find(|p| PathBuf::from(p).join(binary).exists()) + .map(PathBuf::from)) +} diff --git a/src/signal.rs b/src/signal.rs index 39ddbb26d..30b151f13 100644 --- a/src/signal.rs +++ b/src/signal.rs @@ -3,40 +3,99 @@ use anyhow::{bail, Result}; use nix::sys::signal::Signal; -pub fn from_str(signal: &str) -> Result { - use Signal::*; - Ok(match signal.to_ascii_uppercase().as_str() { - "1" | "HUP" | "SIGHUP" => Signal::SIGHUP, - "2" | "INT" | "SIGINT" => Signal::SIGINT, - "3" | "QUIT" | "SIGQUIT" => Signal::SIGQUIT, - "4" | "ILL" | "SIGILL" => Signal::SIGILL, - "5" | "BUS" | "SIGBUS" => Signal::SIGBUS, - "6" | "ABRT" | "IOT" | "SIGABRT" | "SIGIOT" => Signal::SIGABRT, - "7" | "TRAP" | "SIGTRAP" => Signal::SIGTRAP, - "8" | "FPE" | "SIGFPE" => Signal::SIGFPE, - "9" | "KILL" | "SIGKILL" => Signal::SIGKILL, - "10" | "USR1" | "SIGUSR1" => Signal::SIGUSR1, - "11" | "SEGV" | "SIGSEGV" => SIGSEGV, - "12" | "USR2" | "SIGUSR2" => SIGUSR2, - "13" 
| "PIPE" | "SIGPIPE" => SIGPIPE, - "14" | "ALRM" | "SIGALRM" => SIGALRM, - "15" | "TERM" | "SIGTERM" => SIGTERM, - "16" | "STKFLT" | "SIGSTKFLT" => SIGSTKFLT, - "17" | "CHLD" | "SIGCHLD" => SIGCHLD, - "18" | "CONT" | "SIGCONT" => SIGCONT, - "19" | "STOP" | "SIGSTOP" => SIGSTOP, - "20" | "TSTP" | "SIGTSTP" => SIGTSTP, - "21" | "TTIN" | "SIGTTIN" => SIGTTIN, - "22" | "TTOU" | "SIGTTOU" => SIGTTOU, - "23" | "URG" | "SIGURG" => SIGURG, - "24" | "XCPU" | "SIGXCPU" => SIGXCPU, - "25" | "XFSZ" | "SIGXFSZ" => SIGXFSZ, - "26" | "VTALRM" | "SIGVTALRM" => SIGVTALRM, - "27" | "PROF" | "SIGPROF" => SIGPROF, - "28" | "WINCH" | "SIGWINCH" => SIGWINCH, - "29" | "IO" | "SIGIO" => SIGIO, - "30" | "PWR" | "SIGPWR" => SIGPWR, - "31" | "SYS" | "SIGSYS" => SIGSYS, - _ => bail! {"{} is not a valid signal", signal}, - }) +pub trait ToSignal { + fn to_signal(&self) -> Result; +} + +impl ToSignal for String { + fn to_signal(&self) -> Result { + use Signal::*; + Ok(match self.to_ascii_uppercase().as_str() { + "1" | "HUP" | "SIGHUP" => SIGHUP, + "2" | "INT" | "SIGINT" => SIGINT, + "3" | "QUIT" | "SIGQUIT" => SIGQUIT, + "4" | "ILL" | "SIGILL" => SIGILL, + "5" | "BUS" | "SIGBUS" => SIGBUS, + "6" | "ABRT" | "IOT" | "SIGABRT" | "SIGIOT" => SIGABRT, + "7" | "TRAP" | "SIGTRAP" => SIGTRAP, + "8" | "FPE" | "SIGFPE" => SIGFPE, + "9" | "KILL" | "SIGKILL" => SIGKILL, + "10" | "USR1" | "SIGUSR1" => SIGUSR1, + "11" | "SEGV" | "SIGSEGV" => SIGSEGV, + "12" | "USR2" | "SIGUSR2" => SIGUSR2, + "13" | "PIPE" | "SIGPIPE" => SIGPIPE, + "14" | "ALRM" | "SIGALRM" => SIGALRM, + "15" | "TERM" | "SIGTERM" => SIGTERM, + "16" | "STKFLT" | "SIGSTKFLT" => SIGSTKFLT, + "17" | "CHLD" | "SIGCHLD" => SIGCHLD, + "18" | "CONT" | "SIGCONT" => SIGCONT, + "19" | "STOP" | "SIGSTOP" => SIGSTOP, + "20" | "TSTP" | "SIGTSTP" => SIGTSTP, + "21" | "TTIN" | "SIGTTIN" => SIGTTIN, + "22" | "TTOU" | "SIGTTOU" => SIGTTOU, + "23" | "URG" | "SIGURG" => SIGURG, + "24" | "XCPU" | "SIGXCPU" => SIGXCPU, + "25" | "XFSZ" | "SIGXFSZ" => SIGXFSZ, + 
"26" | "VTALRM" | "SIGVTALRM" => SIGVTALRM, + "27" | "PROF" | "SIGPROF" => SIGPROF, + "28" | "WINCH" | "SIGWINCH" => SIGWINCH, + "29" | "IO" | "SIGIO" => SIGIO, + "30" | "PWR" | "SIGPWR" => SIGPWR, + "31" | "SYS" | "SIGSYS" => SIGSYS, + _ => bail! {"{} is not a valid signal", self}, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use nix::sys::signal::Signal::*; + use std::collections::HashMap; + + #[test] + fn test_conversion_from_string() { + let mut test_sets = HashMap::new(); + test_sets.insert(SIGHUP, vec!["1", "HUP", "SIGHUP"]); + test_sets.insert(SIGINT, vec!["2", "INT", "SIGINT"]); + test_sets.insert(SIGQUIT, vec!["3", "QUIT", "SIGQUIT"]); + test_sets.insert(SIGILL, vec!["4", "ILL", "SIGILL"]); + test_sets.insert(SIGBUS, vec!["5", "BUS", "SIGBUS"]); + test_sets.insert(SIGABRT, vec!["6", "ABRT", "IOT", "SIGABRT", "SIGIOT"]); + test_sets.insert(SIGTRAP, vec!["7", "TRAP", "SIGTRAP"]); + test_sets.insert(SIGFPE, vec!["8", "FPE", "SIGFPE"]); + test_sets.insert(SIGKILL, vec!["9", "KILL", "SIGKILL"]); + test_sets.insert(SIGUSR1, vec!["10", "USR1", "SIGUSR1"]); + test_sets.insert(SIGSEGV, vec!["11", "SEGV", "SIGSEGV"]); + test_sets.insert(SIGUSR2, vec!["12", "USR2", "SIGUSR2"]); + test_sets.insert(SIGPIPE, vec!["13", "PIPE", "SIGPIPE"]); + test_sets.insert(SIGALRM, vec!["14", "ALRM", "SIGALRM"]); + test_sets.insert(SIGTERM, vec!["15", "TERM", "SIGTERM"]); + test_sets.insert(SIGSTKFLT, vec!["16", "STKFLT", "SIGSTKFLT"]); + test_sets.insert(SIGCHLD, vec!["17", "CHLD", "SIGCHLD"]); + test_sets.insert(SIGCONT, vec!["18", "CONT", "SIGCONT"]); + test_sets.insert(SIGSTOP, vec!["19", "STOP", "SIGSTOP"]); + test_sets.insert(SIGTSTP, vec!["20", "TSTP", "SIGTSTP"]); + test_sets.insert(SIGTTIN, vec!["21", "TTIN", "SIGTTIN"]); + test_sets.insert(SIGTTOU, vec!["22", "TTOU", "SIGTTOU"]); + test_sets.insert(SIGURG, vec!["23", "URG", "SIGURG"]); + test_sets.insert(SIGXCPU, vec!["24", "XCPU", "SIGXCPU"]); + test_sets.insert(SIGXFSZ, vec!["25", "XFSZ", "SIGXFSZ"]); + 
test_sets.insert(SIGVTALRM, vec!["26", "VTALRM", "SIGVTALRM"]); + test_sets.insert(SIGPROF, vec!["27", "PROF", "SIGPROF"]); + test_sets.insert(SIGWINCH, vec!["28", "WINCH", "SIGWINCH"]); + test_sets.insert(SIGIO, vec!["29", "IO", "SIGIO"]); + test_sets.insert(SIGPWR, vec!["30", "PWR", "SIGPWR"]); + test_sets.insert(SIGSYS, vec!["31", "SYS", "SIGSYS"]); + for (signal, strings) in test_sets { + for s in strings { + assert_eq!(signal, s.to_string().to_signal().unwrap()); + } + } + } + + #[test] + fn test_conversion_from_string_should_be_failed() { + assert!("invalid".to_string().to_signal().is_err()) + } } diff --git a/src/start.rs b/src/start.rs index bad12a7b4..d37f95cbc 100644 --- a/src/start.rs +++ b/src/start.rs @@ -36,7 +36,7 @@ impl Start { let mut notify_socket = NotifySocket::new(&container.root)?; notify_socket.notify_container_start()?; - container.update_status(ContainerStatus::Running)?.save()?; + container.update_status(ContainerStatus::Running).save()?; Ok(()) } } diff --git a/src/state.rs b/src/state.rs new file mode 100644 index 000000000..7be62193c --- /dev/null +++ b/src/state.rs @@ -0,0 +1,22 @@ +use std::fs; +use std::path::PathBuf; + +use anyhow::Result; +use clap::Clap; + +use crate::container::Container; + +#[derive(Clap, Debug)] +pub struct State { + pub container_id: String, +} + +impl State { + pub fn exec(&self, root_path: PathBuf) -> Result<()> { + let root_path = fs::canonicalize(root_path)?; + let container_root = root_path.join(&self.container_id); + let container = Container::load(container_root)?.refresh_status()?; + println!("{}", serde_json::to_string_pretty(&container.state)?); + std::process::exit(0); + } +} diff --git a/src/tty.rs b/src/tty.rs index b6bf4be34..a375e6ca1 100644 --- a/src/tty.rs +++ b/src/tty.rs @@ -43,7 +43,7 @@ pub fn setup_console_socket( Ok(csocketfd.into()) } -pub fn setup_console(console_fd: FileDescriptor) -> Result<()> { +pub fn setup_console(console_fd: &FileDescriptor) -> Result<()> { // You can also 
access pty master, but it is better to use the API. // ref. https://github.com/containerd/containerd/blob/261c107ffc4ff681bc73988f64e3f60c32233b37/vendor/github.com/containerd/go-runc/console.go#L139-L154 let openpty_result = nix::pty::openpty(None, None)?; @@ -68,3 +68,76 @@ pub fn setup_console(console_fd: FileDescriptor) -> Result<()> { close(console_fd.as_raw_fd())?; Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + + use std::env; + use std::fs::{self, File}; + use std::os::unix::net::UnixListener; + use std::path::PathBuf; + + use serial_test::serial; + + use crate::utils::{create_temp_dir, TempDir}; + + fn setup(testname: &str) -> Result<(TempDir, PathBuf, PathBuf)> { + let testdir = create_temp_dir(testname)?; + let rundir_path = Path::join(&testdir, "run"); + let _ = fs::create_dir(&rundir_path)?; + let socket_path = Path::new(&rundir_path).join("socket"); + let _ = File::create(&socket_path); + env::set_current_dir(&testdir)?; + Ok((testdir, rundir_path, socket_path)) + } + + #[test] + #[serial] + fn test_setup_console_socket() { + let init = setup("test_setup_console_socket"); + assert!(init.is_ok()); + let (testdir, rundir_path, socket_path) = init.unwrap(); + let lis = UnixListener::bind(Path::join(&testdir, "console-socket")); + assert!(lis.is_ok()); + let fd = setup_console_socket(&&rundir_path, &socket_path); + assert!(fd.is_ok()); + assert_ne!(fd.unwrap().as_raw_fd(), -1); + } + + #[test] + #[serial] + fn test_setup_console_socket_empty() { + let init = setup("test_setup_console_socket_empty"); + assert!(init.is_ok()); + let (_testdir, rundir_path, socket_path) = init.unwrap(); + let fd = setup_console_socket(&rundir_path, &socket_path); + assert!(fd.is_ok()); + assert_eq!(fd.unwrap().as_raw_fd(), -1); + } + + #[test] + #[serial] + fn test_setup_console_socket_invalid() { + let init = setup("test_setup_console_socket_invalid"); + assert!(init.is_ok()); + let (testdir, rundir_path, socket_path) = init.unwrap(); + let _socket = 
File::create(Path::join(&testdir, "console-socket")); + assert!(_socket.is_ok()); + let fd = setup_console_socket(&rundir_path, &socket_path); + assert!(fd.is_err()); + } + + #[test] + #[serial] + fn test_setup_console() { + let init = setup("test_setup_console"); + assert!(init.is_ok()); + let (testdir, rundir_path, socket_path) = init.unwrap(); + let lis = UnixListener::bind(Path::join(&testdir, "console-socket")); + assert!(lis.is_ok()); + let fd = setup_console_socket(&&rundir_path, &socket_path); + let status = setup_console(&fd.unwrap()); + assert!(status.is_ok()); + } +} diff --git a/src/utils.rs b/src/utils.rs index 178b3d054..d5f0729fb 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -3,9 +3,11 @@ use std::env; use std::ffi::CString; use std::fs; +use std::ops::Deref; use std::path::{Path, PathBuf}; use std::time::Duration; +use anyhow::Context; use anyhow::{bail, Result}; use nix::unistd; @@ -88,6 +90,67 @@ pub fn delete_with_retry>(path: P) -> Result<()> { bail!("could not delete {:?}", path) } +pub fn write_file, C: AsRef<[u8]>>(path: P, contents: C) -> Result<()> { + let path = path.as_ref(); + fs::write(path, contents).with_context(|| format!("failed to write to {:?}", path))?; + Ok(()) +} + +pub fn create_dir_all>(path: P) -> Result<()> { + let path = path.as_ref(); + fs::create_dir_all(path).with_context(|| format!("failed to create directory {:?}", path)) +} + +pub struct TempDir { + path: Option, +} + +impl TempDir { + pub fn new>(path: P) -> Result { + let p = path.into(); + std::fs::create_dir_all(&p)?; + Ok(Self { path: Some(p) }) + } + + pub fn path(&self) -> &Path { + self.path + .as_ref() + .expect("temp dir has already been removed") + } + + pub fn remove(&mut self) { + if let Some(p) = &self.path { + let _ = fs::remove_dir_all(p); + self.path = None; + } + } +} + +impl Drop for TempDir { + fn drop(&mut self) { + self.remove(); + } +} + +impl AsRef for TempDir { + fn as_ref(&self) -> &Path { + self.path() + } +} + +impl Deref for TempDir { + 
type Target = Path; + + fn deref(&self) -> &Self::Target { + self.path() + } +} + +pub fn create_temp_dir(test_name: &str) -> Result { + let dir = TempDir::new(std::env::temp_dir().join(test_name))?; + Ok(dir) +} + #[cfg(test)] mod tests { use super::*; @@ -104,12 +167,9 @@ mod tests { #[test] fn test_join_absolute_path_error() { - assert_eq!( - PathBuf::from("sample/a/") - .join_absolute_path(&PathBuf::from("b/c")) - .is_err(), - true - ); + assert!(PathBuf::from("sample/a/") + .join_absolute_path(&PathBuf::from("b/c")) + .is_err(),); } #[test]