From d92a0addde9936e1a93e0a28aa372233750e391e Mon Sep 17 00:00:00 2001 From: vmagro Date: Mon, 6 Jan 2025 08:17:00 -0800 Subject: [PATCH] [antlir2][unshare_userns] rewrite constrained syscall part in C MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: The complicated part of `unshare_userns` involves two `fork()`s with logic in between the `fork()` and `exec()` in the child process. This comes with a bunch of rules, the most annoying being no safe way to allocate memory. Previously, I tried to "enforce" this by making the Rust crate `#![no_std]`, but in fbcode, `std` is always unconditionally added as a buck2 dependency, so there was nothing preventing `std` from being used (and thus, lots of things that can do dynamic memory allocations). `std` usage was accidentally introduced in D67157487. To prevent this from happening again, to properly enforce these constraints, and to fix the OSS build, I am rewriting the post-`fork()` logic in C. This is a direct translation of the Rust code, which was already very C-like due to the low-level syscalls being done. Test Plan: ``` ❯ buck2 test fbcode//antlir/antlir2/features/user/tests: fbcode//antlir/antlir2/features/install/tests: Buck UI: https://www.internalfb.com/buck2/db4a7f57-d8a5-41d5-861c-4508eb80410a Test UI: https://www.internalfb.com/intern/testinfra/testrun/5910974770074096 Tests finished: Pass 39. Fail 0. Fatal 0. Skip 0. ❯ buck2 test -c antlir2.rootless=1 fbcode//antlir/antlir2/features/user/tests: fbcode//antlir/antlir2/features/install/tests: Buck UI: https://www.internalfb.com/buck2/ad2d4e89-d6ab-435c-a0a5-519560f52a3b Test UI: https://www.internalfb.com/intern/testinfra/testrun/3659174958492970 Tests finished: Pass 39. Fail 0. Fatal 0. Skip 0. 
Build failure 0 ``` Differential Revision: D67803947 --- .../antlir2_rootless/unshare_userns/BUCK | 19 +- .../unshare_userns/src/lib.rs | 157 ++++------------ .../unshare_userns/unshare_userns.c | 173 ++++++++++++++++++ 3 files changed, 226 insertions(+), 123 deletions(-) create mode 100644 antlir/antlir2/antlir2_rootless/unshare_userns/unshare_userns.c diff --git a/antlir/antlir2/antlir2_rootless/unshare_userns/BUCK b/antlir/antlir2/antlir2_rootless/unshare_userns/BUCK index 30ea006b4c6..e0bea37e915 100644 --- a/antlir/antlir2/antlir2_rootless/unshare_userns/BUCK +++ b/antlir/antlir2/antlir2_rootless/unshare_userns/BUCK @@ -1,8 +1,10 @@ -load("//antlir/bzl:build_defs.bzl", "rust_library") +load("//antlir/bzl:build_defs.bzl", "cpp_library", "rust_library") oncall("antlir") +# @rust-guess-deps-ignore does not understand conditional fb_deps rust_library( + # @autodeps-skip name = "unshare_userns", srcs = glob(["src/**/*.rs"]), compatible_with = [ @@ -13,7 +15,18 @@ rust_library( ], visibility = ["//antlir/antlir2/antlir2_rootless:"], deps = [ - "close-err", - "nix", + ":unshare_userns_c", # @autodeps2-fixme-manual ], ) + +cpp_library( + # @autodeps-skip + name = "unshare_userns_c", + srcs = ["unshare_userns.c"], + compiler_flags = [ + # _GNU_SOURCE is required for unshare(), but it may not be set by our + # OSS toolchain + "-D_GNU_SOURCE", + ], + visibility = [":unshare_userns"], +) diff --git a/antlir/antlir2/antlir2_rootless/unshare_userns/src/lib.rs b/antlir/antlir2/antlir2_rootless/unshare_userns/src/lib.rs index 119370ae101..9382e1cb2c3 100644 --- a/antlir/antlir2/antlir2_rootless/unshare_userns/src/lib.rs +++ b/antlir/antlir2/antlir2_rootless/unshare_userns/src/lib.rs @@ -7,70 +7,13 @@ //! This is a helper library for unsharing the current process into a new, //! unprivileged user namespace. -//! This is a little bit of a tricky dance that requires a few unsafe `fork()`s -//! and pipe based communication to accomplish the following flow: -//! -//! 
┌────────────┐ ┌───────┐ ┌───────┐ -//! │Main Process│ │Child 1│ │Child 2│ -//! └─────┬──────┘ └───┬───┘ └───┬───┘ -//! │ │ │ -//! │ fork() │ │ -//! │──────────────>│ │ -//! │ │ │ -//! │"I've unshared"│ │ -//! │──────────────>│ │ -//! │ │ │ -//! │ │ fork() │ -//! │ │──────────────>│ -//! │ │ │ -//! │ │exec(newgidmap)│ -//! │ │<──────────────│ -//! │ │ │ -//! │ exec(newuidmap) │ -//! │<──────────────────────────────│ -//! ┌─────┴──────┐ ┌───┴───┐ ┌───┴───┐ -//! │Main Process│ │Child 1│ │Child 2│ -//! └────────────┘ └───────┘ └───────┘ -//! -//! 1. Main Process starts in the initial user namespace. It forks Child 1 (also -//! in the initial user namespace). -//! -//! 2. Main Process unshares itself into a new user namespace. At this point, -//! the new user namespace has no IDs mapped into it. -//! -//! 3. Main Process closes the write end of the pipe it gave to Child 1 to -//! indicate that Main Process has created the new user namespace. -//! -//! 4. Child 1 forks Child 2 (also in the initial user namespace). -//! -//! 5. Child 2 execs /usr/bin/newgidmap to map GIDs into Main Process's new user -//! namespace. -//! -//! 6. Child 1 execs /usr/bin/newuidmap to map UIDs into Main Process's new user -//! namespace. -//! -//! 7. Main Process gets a 0 return code from Child 1 and continues its -//! execution. Main Process's user namespace now has a full range of UIDs and -//! GIDs mapped into it. +//! See the C implementation for more details about how exactly it works, but +//! the useful end result is that the process that calls this function will end +//! up in a new user namespace with a full range of UIDs and GIDs mapped into +//! it. -// This does a few `fork()`s with logic afterwards so we have to be careful not -// to accidentally do any dynamic memory allocation. An easy way to accomplish -// that is just using no_std. 
-#![no_std] - -use core::ffi::CStr; -use std::io; -use std::os::fd::AsRawFd; - -use close_err::Closable; -use nix::errno::Errno; -use nix::sched::unshare; -use nix::sched::CloneFlags; -use nix::sys::wait::waitpid; -use nix::sys::wait::WaitStatus; -use nix::unistd::fork; -use nix::unistd::pipe; -use nix::unistd::ForkResult; +use std::ffi::CStr; +use std::io::Error; #[derive(Copy, Clone)] pub struct Map<'a> { @@ -79,7 +22,22 @@ pub struct Map<'a> { pub len: &'a CStr, } -pub fn unshare_userns(pid_cstr: &CStr, uid_map: &Map, gid_map: &Map) -> io::Result<()> { +mod c { + use std::os::raw::c_char; + extern "C" { + pub(crate) fn unshare_userns( + pid_cstr: *const c_char, + uid_map_outside_root: *const c_char, + uid_map_outside_sub_start: *const c_char, + uid_map_len: *const c_char, + gid_map_outside_root: *const c_char, + gid_map_outside_sub_start: *const c_char, + gid_map_len: *const c_char, + ) -> i32; + } +} + +pub fn unshare_userns(pid_cstr: &CStr, uid_map: &Map, gid_map: &Map) -> std::io::Result<()> { // TODO(T181212521): do the same check in OSS #[cfg(facebook)] if memory::is_using_jemalloc() @@ -90,61 +48,20 @@ pub fn unshare_userns(pid_cstr: &CStr, uid_map: &Map, gid_map: &Map) -> io::Resu please check your binary's `malloc_conf` or set the binary target's `allocator` attribute to \"malloc\"." ); } - let (read, write) = pipe()?; - match unsafe { fork() }? 
{ - ForkResult::Parent { child } => { - unshare(CloneFlags::CLONE_NEWUSER)?; - read.close()?; - write.close()?; - let status = waitpid(child, None)?; - if status != WaitStatus::Exited(child, 0) { - return Err(io::Error::from(Errno::EIO)); - } - } - ForkResult::Child => { - write.close()?; - nix::unistd::read(read.as_raw_fd(), &mut [0u8])?; - - match unsafe { fork() } { - Ok(ForkResult::Parent { child }) => { - let status = waitpid(child, None)?; - if status != WaitStatus::Exited(child, 0) { - return Err(io::Error::from(Errno::EIO)); - } - Ok(()) - } - Ok(ForkResult::Child) => nix::unistd::execv( - c"/usr/bin/newgidmap", - &[ - c"newgidmap", - pid_cstr, - c"0", - gid_map.outside_root, - c"1", - c"1", - gid_map.outside_sub_start, - gid_map.len, - ], - ) - .map(|_| ()), - Err(e) => Err(e), - }?; - nix::unistd::execv( - c"/usr/bin/newuidmap", - &[ - c"newuidmap", - pid_cstr, - c"0", - uid_map.outside_root, - c"1", - c"1", - uid_map.outside_sub_start, - uid_map.len, - ], - ) - .expect("failed to exec newuidmap"); - unreachable!("we just exec-ed") - } + let res = unsafe { + c::unshare_userns( + pid_cstr.as_ptr(), + uid_map.outside_root.as_ptr(), + uid_map.outside_sub_start.as_ptr(), + uid_map.len.as_ptr(), + gid_map.outside_root.as_ptr(), + gid_map.outside_sub_start.as_ptr(), + gid_map.len.as_ptr(), + ) + }; + match res { + 0 => Ok(()), + -1 => Err(Error::last_os_error()), + _ => Err(Error::from_raw_os_error(res)), } - Ok(()) } diff --git a/antlir/antlir2/antlir2_rootless/unshare_userns/unshare_userns.c b/antlir/antlir2/antlir2_rootless/unshare_userns/unshare_userns.c new file mode 100644 index 00000000000..a49d955604f --- /dev/null +++ b/antlir/antlir2/antlir2_rootless/unshare_userns/unshare_userns.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// This is a helper library for unsharing the current process into a new, +// unprivileged user namespace. +// This is a little bit of a tricky dance that requires a few unsafe `fork()`s +// and pipe based communication to accomplish the following flow: +// +// ┌────────────┐ ┌───────┐ ┌───────┐ +// │Main Process│ │Child 1│ │Child 2│ +// └─────┬──────┘ └───┬───┘ └───┬───┘ +// │ │ │ +// │ fork() │ │ +// │──────────────>│ │ +// │ │ │ +// │"I've unshared"│ │ +// │──────────────>│ │ +// │ │ │ +// │ │ fork() │ +// │ │──────────────>│ +// │ │ │ +// │ │exec(newgidmap)│ +// │ │<──────────────│ +// │ │ │ +// │ exec(newuidmap) │ +// │<──────────────────────────────│ +// ┌─────┴──────┐ ┌───┴───┐ ┌───┴───┐ +// │Main Process│ │Child 1│ │Child 2│ +// └────────────┘ └───────┘ └───────┘ +// +// 1. Main Process starts in the initial user namespace. It forks Child 1 (also +// in the initial user namespace). +// +// 2. Main Process unshares itself into a new user namespace. At this point, +// the new user namespace has no IDs mapped into it. +// +// 3. Main Process closes the write end of the pipe it gave to Child 1 to +// indicate that Main Process has created the new user namespace. +// +// 4. Child 1 forks Child 2 (also in the initial user namespace). +// +// 5. Child 2 execs /usr/bin/newgidmap to map GIDs into Main Process's new user +// namespace. +// +// 6. Child 1 execs /usr/bin/newuidmap to map UIDs into Main Process's new user +// namespace. +// +// 7. Main Process gets a 0 return code from Child 1 and continues its +// execution. Main Process's user namespace now has a full range of UIDs and +// GIDs mapped into it. + +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/wait.h> +#include <unistd.h> + +// WARNING!!!!! +// This does a few `fork()`s with logic afterwards so we have to be careful not +// to accidentally do any dynamic memory allocation, which is not allowed +// between `fork()` and `exec()`. 
+// Moves the calling process into a new user namespace and has two forked
+// helper processes run /usr/bin/newgidmap and /usr/bin/newuidmap to map IDs
+// into it.
+//
+// All arguments are NUL-terminated strings that are forwarded untouched as
+// argv entries:
+//   pid_str                      first argument of both helper binaries
+//                                (presumably the caller's own pid; this is
+//                                not verified here)
+//   {uid,gid}_map_outside_root   used in the "0 <outside_root> 1" triple
+//   {uid,gid}_map_outside_sub_start, {uid,gid}_map_len
+//                                used in the "1 <outside_sub_start> <len>"
+//                                triple
+//
+// Returns:
+//    0   Child 1 exited with status 0.
+//   -1   pipe(), fork(), unshare(), close() or waitpid() failed in the
+//        calling process; errno is the one left by the failing call.
+//   any other value
+//        the raw waitpid() status of Child 1, which did not exit cleanly
+//        with 0. NOTE(review): this is a wait status, not an errno value.
+int unshare_userns(
+    char* pid_str,
+    char* uid_map_outside_root,
+    char* uid_map_outside_sub_start,
+    char* uid_map_len,
+    char* gid_map_outside_root,
+    char* gid_map_outside_sub_start,
+    char* gid_map_len) {
+  int pipefd[2];
+  if (pipe(pipefd) == -1) {
+    return -1;
+  }
+
+  int child1 = fork();
+  switch (child1) {
+    case -1:
+      close(pipefd[0]);
+      close(pipefd[1]);
+      return -1;
+    case 0: {
+      // In the child process, wait for the parent process to indicate that it
+      // has unshared into a new user namespace, then setup the id mappings
+      // using the new{ug}idmap binaries.
+      //
+      // Every exit path below uses _exit() instead of exit(): this is a
+      // forked copy of the caller, so it must not run the caller's atexit
+      // handlers or flush stdio buffers inherited from it.
+
+      // close our copy of the write end, otherwise the read() below could
+      // never observe EOF
+      if (close(pipefd[1]) == -1) {
+        _exit(EXIT_FAILURE);
+      }
+      // this read() returns 0 (EOF) as soon as the parent process closes its
+      // write end of the pipe. A failed read() is not that signal, so bail
+      // out instead of mapping IDs at an arbitrary point in time.
+      char buf;
+      if (read(pipefd[0], &buf, 1) == -1) {
+        _exit(EXIT_FAILURE);
+      }
+      close(pipefd[0]);
+
+      int child2 = fork();
+      switch (child2) {
+        case -1:
+          _exit(EXIT_FAILURE);
+        case 0: {
+          // do newgidmap first
+          char* gid_args[] = {
+              "newgidmap",
+              pid_str,
+              "0",
+              gid_map_outside_root,
+              "1",
+              "1",
+              gid_map_outside_sub_start,
+              gid_map_len,
+              NULL};
+          // execv() only ever returns on failure
+          execv("/usr/bin/newgidmap", gid_args);
+          // NOTE(review): perror() goes through stdio and is not
+          // async-signal-safe; kept for its diagnostic value.
+          perror("exec newgidmap");
+          _exit(EXIT_FAILURE);
+        }
+        default: {
+          // wait for the newgidmap to finish
+          int gid_status = 0;
+          if (waitpid(child2, &gid_status, 0) == -1) {
+            _exit(EXIT_FAILURE);
+          }
+          if (!WIFEXITED(gid_status) || (WEXITSTATUS(gid_status) != 0)) {
+            _exit(EXIT_FAILURE);
+          }
+        }
+      }
+
+      // now the newgidmap is done, do newuidmap
+      char* uid_args[] = {
+          "newuidmap",
+          pid_str,
+          "0",
+          uid_map_outside_root,
+          "1",
+          "1",
+          uid_map_outside_sub_start,
+          uid_map_len,
+          NULL};
+      execv("/usr/bin/newuidmap", uid_args);
+      perror("exec newuidmap");
+      _exit(EXIT_FAILURE);
+    }
+
+    default: {
+      // In the parent process, we must unshare the user namespace FIRST and
+      // only then close the write end of the pipe. Child 1's read() returns
+      // as soon as the last write end is closed, so closing it before
+      // unshare() would let the id-mapping helpers race against the creation
+      // of the namespace they are supposed to populate.
+      if (unshare(CLONE_NEWUSER) == -1) {
+        // NOTE(review): closing the pipe still releases Child 1, which will
+        // run the helpers (expected to fail) and is never reaped on this
+        // path.
+        close(pipefd[1]);
+        close(pipefd[0]);
+        return -1;
+      }
+
+      // closing the write end is the "I've unshared" signal for Child 1
+      int close_write = close(pipefd[1]);
+      int close_read = close(pipefd[0]);
+      if (close_write == -1 || close_read == -1) {
+        return -1;
+      }
+
+      // Child 1 exiting with 0 means both mappings have been written.
+      int status = 0;
+      if (waitpid(child1, &status, 0) == -1) {
+        // report the failure to the caller rather than terminating the
+        // caller's entire process
+        return -1;
+      }
+      if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
+        return status;
+      }
+      return 0;
+    }
+  }
+}