Skip to content

Commit

Permalink
[antlir2][unshare_userns] rewrite constrained syscall part in C
Browse files Browse the repository at this point in the history
Summary:
The complicated part of `unshare_userns` involves two `fork()`s with logic in
between the `fork()` and `exec()` in the child process. This comes with a bunch
of rules, the most annoying being no safe way to allocate memory.

Previously, I tried to "enforce" this by making the Rust crate `#![no_std]`,
but in fbcode, `std` is always unconditionally added as a buck2 dependency, so
there was nothing preventing `std` from being used (and thus, lots of things
that can do dynamic memory allocations). `std` usage was accidentally
introduced in D67157487.

To prevent this from happening again, to properly enforce these constraints,
and to fix the OSS build, I am rewriting the post-`fork()` logic in C. This is
a direct translation of the Rust code, which was already very C-like due to the
low-level syscalls being done.

Test Plan:
```
❯ buck2 test fbcode//antlir/antlir2/features/user/tests: fbcode//antlir/antlir2/features/install/tests:
Buck UI: https://www.internalfb.com/buck2/db4a7f57-d8a5-41d5-861c-4508eb80410a
Test UI: https://www.internalfb.com/intern/testinfra/testrun/5910974770074096
Tests finished: Pass 39. Fail 0. Fatal 0. Skip 0. Build failure 0

❯ buck2 test -c antlir2.rootless=1 fbcode//antlir/antlir2/features/user/tests: fbcode//antlir/antlir2/features/install/tests:
Buck UI: https://www.internalfb.com/buck2/ad2d4e89-d6ab-435c-a0a5-519560f52a3b
Test UI: https://www.internalfb.com/intern/testinfra/testrun/3659174958492970
Tests finished: Pass 39. Fail 0. Fatal 0. Skip 0. Build failure 0
```

Differential Revision: D67803947
  • Loading branch information
vmagro authored and facebook-github-bot committed Jan 6, 2025
1 parent 46f1dbc commit d92a0ad
Show file tree
Hide file tree
Showing 3 changed files with 226 additions and 123 deletions.
19 changes: 16 additions & 3 deletions antlir/antlir2/antlir2_rootless/unshare_userns/BUCK
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
load("//antlir/bzl:build_defs.bzl", "rust_library")
load("//antlir/bzl:build_defs.bzl", "cpp_library", "rust_library")

oncall("antlir")

# @rust-guess-deps-ignore does not understand conditional fb_deps
rust_library(
# @autodeps-skip
name = "unshare_userns",
srcs = glob(["src/**/*.rs"]),
compatible_with = [
Expand All @@ -13,7 +15,18 @@ rust_library(
],
visibility = ["//antlir/antlir2/antlir2_rootless:"],
deps = [
"close-err",
"nix",
":unshare_userns_c", # @autodeps2-fixme-manual
],
)

# C implementation of the post-`fork()` logic (`unshare_userns.c`). It is a
# dependency of the `:unshare_userns` Rust library and is visible only to that
# target, so nothing else can link against the raw C entry point directly.
cpp_library(
# @autodeps-skip
name = "unshare_userns_c",
srcs = ["unshare_userns.c"],
compiler_flags = [
# _GNU_SOURCE is required for unshare(), but it may not be set by our
# OSS toolchain
"-D_GNU_SOURCE",
],
visibility = [":unshare_userns"],
)
157 changes: 37 additions & 120 deletions antlir/antlir2/antlir2_rootless/unshare_userns/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,70 +7,13 @@

//! This is a helper library for unsharing the current process into a new,
//! unprivileged user namespace.
//! This is a little bit of a tricky dance that requires a few unsafe `fork()`s
//! and pipe based communication to accomplish the following flow:
//!
//! ┌────────────┐ ┌───────┐ ┌───────┐
//! │Main Process│ │Child 1│ │Child 2│
//! └─────┬──────┘ └───┬───┘ └───┬───┘
//! │ │ │
//! │ fork() │ │
//! │──────────────>│ │
//! │ │ │
//! │"I've unshared"│ │
//! │──────────────>│ │
//! │ │ │
//! │ │ fork() │
//! │ │──────────────>│
//! │ │ │
//! │ │exec(newgidmap)│
//! │ │<──────────────│
//! │ │ │
//! │ exec(newuidmap) │
//! │<──────────────────────────────│
//! ┌─────┴──────┐ ┌───┴───┐ ┌───┴───┐
//! │Main Process│ │Child 1│ │Child 2│
//! └────────────┘ └───────┘ └───────┘
//!
//! 1. Main Process starts in the initial user namespace. It forks Child 1 (also
//! in the initial user namespace).
//!
//! 2. Main Process unshares itself into a new user namespace. At this point,
//! the new user namespace has no IDs mapped into it.
//!
//! 3. Main Process closes the write end of the pipe it gave to Child 1 to
//! indicate that Main Process has created the new user namespace.
//!
//! 4. Child 1 forks Child 2 (also in the initial user namespace).
//!
//! 5. Child 2 execs /usr/bin/newgidmap to map GIDs into Main Process's new user
//! namespace.
//!
//! 6. Child 1 execs /usr/bin/newuidmap to map UIDs into Main Process's new user
//! namespace.
//!
//! 7. Main Process gets a 0 return code from Child 1 and continues its
//! execution. Main Process's user namespace now has a full range of UIDs and
//! GIDs mapped into it.
//! See the C implementation for more details about how exactly it works, but
//! the useful end result is that the process that calls this function will end
//! up in a new user namespace with a full range of UIDs and GIDs mapped into
//! it.
// This does a few `fork()`s with logic afterwards so we have to be careful not
// to accidentally do any dynamic memory allocation. An easy way to accomplish
// that is just using no_std.
#![no_std]

use core::ffi::CStr;
use std::io;
use std::os::fd::AsRawFd;

use close_err::Closable;
use nix::errno::Errno;
use nix::sched::unshare;
use nix::sched::CloneFlags;
use nix::sys::wait::waitpid;
use nix::sys::wait::WaitStatus;
use nix::unistd::fork;
use nix::unistd::pipe;
use nix::unistd::ForkResult;
use std::ffi::CStr;
use std::io::Error;

#[derive(Copy, Clone)]
pub struct Map<'a> {
Expand All @@ -79,7 +22,22 @@ pub struct Map<'a> {
pub len: &'a CStr,
}

pub fn unshare_userns(pid_cstr: &CStr, uid_map: &Map, gid_map: &Map) -> io::Result<()> {
/// Raw FFI declarations for the C implementation in `unshare_userns.c`.
mod c {
    use core::ffi::c_char;

    /// Pointer to a borrowed, NUL-terminated C string handed across the FFI
    /// boundary.
    type CStrPtr = *const c_char;

    extern "C" {
        /// Moves the calling process into a new user namespace and maps the
        /// given UID/GID ranges into it.
        ///
        /// Every argument must point to a valid NUL-terminated string that
        /// stays alive for the duration of the call.
        ///
        /// The safe wrapper in this crate interprets the return value as: `0`
        /// for success, `-1` for "consult `errno`", and any other value as a
        /// raw OS error code.
        pub(crate) fn unshare_userns(
            pid_cstr: CStrPtr,
            uid_map_outside_root: CStrPtr,
            uid_map_outside_sub_start: CStrPtr,
            uid_map_len: CStrPtr,
            gid_map_outside_root: CStrPtr,
            gid_map_outside_sub_start: CStrPtr,
            gid_map_len: CStrPtr,
        ) -> i32;
    }
}

pub fn unshare_userns(pid_cstr: &CStr, uid_map: &Map, gid_map: &Map) -> std::io::Result<()> {
// TODO(T181212521): do the same check in OSS
#[cfg(facebook)]
if memory::is_using_jemalloc()
Expand All @@ -90,61 +48,20 @@ pub fn unshare_userns(pid_cstr: &CStr, uid_map: &Map, gid_map: &Map) -> io::Resu
please check your binary's `malloc_conf` or set the binary target's `allocator` attribute to \"malloc\"."
);
}
let (read, write) = pipe()?;
match unsafe { fork() }? {
ForkResult::Parent { child } => {
unshare(CloneFlags::CLONE_NEWUSER)?;
read.close()?;
write.close()?;
let status = waitpid(child, None)?;
if status != WaitStatus::Exited(child, 0) {
return Err(io::Error::from(Errno::EIO));
}
}
ForkResult::Child => {
write.close()?;
nix::unistd::read(read.as_raw_fd(), &mut [0u8])?;

match unsafe { fork() } {
Ok(ForkResult::Parent { child }) => {
let status = waitpid(child, None)?;
if status != WaitStatus::Exited(child, 0) {
return Err(io::Error::from(Errno::EIO));
}
Ok(())
}
Ok(ForkResult::Child) => nix::unistd::execv(
c"/usr/bin/newgidmap",
&[
c"newgidmap",
pid_cstr,
c"0",
gid_map.outside_root,
c"1",
c"1",
gid_map.outside_sub_start,
gid_map.len,
],
)
.map(|_| ()),
Err(e) => Err(e),
}?;
nix::unistd::execv(
c"/usr/bin/newuidmap",
&[
c"newuidmap",
pid_cstr,
c"0",
uid_map.outside_root,
c"1",
c"1",
uid_map.outside_sub_start,
uid_map.len,
],
)
.expect("failed to exec newuidmap");
unreachable!("we just exec-ed")
}
let res = unsafe {
c::unshare_userns(
pid_cstr.as_ptr(),
uid_map.outside_root.as_ptr(),
uid_map.outside_sub_start.as_ptr(),
uid_map.len.as_ptr(),
gid_map.outside_root.as_ptr(),
gid_map.outside_sub_start.as_ptr(),
gid_map.len.as_ptr(),
)
};
match res {
0 => Ok(()),
-1 => Err(Error::last_os_error()),
_ => Err(Error::from_raw_os_error(res)),
}
Ok(())
}
173 changes: 173 additions & 0 deletions antlir/antlir2/antlir2_rootless/unshare_userns/unshare_userns.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

// This is a helper library for unsharing the current process into a new,
// unprivileged user namespace.
// This is a little bit of a tricky dance that requires a few unsafe `fork()`s
// and pipe based communication to accomplish the following flow:
//
// ┌────────────┐ ┌───────┐ ┌───────┐
// │Main Process│ │Child 1│ │Child 2│
// └─────┬──────┘ └───┬───┘ └───┬───┘
// │ │ │
// │ fork() │ │
// │──────────────>│ │
// │ │ │
// │"I've unshared"│ │
// │──────────────>│ │
// │ │ │
// │ │ fork() │
// │ │──────────────>│
// │ │ │
// │ │exec(newgidmap)│
// │ │<──────────────│
// │ │ │
// │ exec(newuidmap) │
// │<──────────────────────────────│
// ┌─────┴──────┐ ┌───┴───┐ ┌───┴───┐
// │Main Process│ │Child 1│ │Child 2│
// └────────────┘ └───────┘ └───────┘
//
// 1. Main Process starts in the initial user namespace. It forks Child 1 (also
// in the initial user namespace).
//
// 2. Main Process unshares itself into a new user namespace. At this point,
// the new user namespace has no IDs mapped into it.
//
// 3. Main Process closes the write end of the pipe it gave to Child 1 to
// indicate that Main Process has created the new user namespace.
//
// 4. Child 1 forks Child 2 (also in the initial user namespace).
//
// 5. Child 2 execs /usr/bin/newgidmap to map GIDs into Main Process's new user
// namespace.
//
// 6. Child 1 execs /usr/bin/newuidmap to map UIDs into Main Process's new user
// namespace.
//
// 7. Main Process gets a 0 return code from Child 1 and continues its
// execution. Main Process's user namespace now has a full range of UIDs and
// GIDs mapped into it.

#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

// WARNING!!!!!
// This does a few `fork()`s with logic afterwards so we have to be careful not
// to accidentally do any dynamic memory allocation, which is not allowed
// between `fork()` and `exec()`. Code that runs in a forked child must stick to
// async-signal-safe calls: `_exit()` instead of `exit()` (which would run the
// parent's atexit handlers and flush its inherited stdio buffers a second
// time) and `write()` instead of `perror()` (stdio is allowed to allocate).

// Allocation-free replacement for perror() for use in forked children: writes
// `prefix` followed by the decimal value of `err` and a newline to stderr.
// Best effort only; write failures are ignored because the child is about to
// exit with a failure status anyway.
static void child_report_errno(const char* prefix, size_t prefix_len, int err) {
  // 10 digits for a 32-bit unsigned value plus the trailing newline fit easily.
  char digits[16];
  size_t pos = sizeof(digits);
  unsigned int value = err < 0 ? 0u : (unsigned int)err;

  digits[--pos] = '\n';
  do {
    digits[--pos] = (char)('0' + (value % 10u));
    value /= 10u;
  } while (value != 0u);

  ssize_t ignored = write(STDERR_FILENO, prefix, prefix_len);
  ignored = write(STDERR_FILENO, digits + pos, sizeof(digits) - pos);
  (void)ignored;
}

// waitpid() that retries when interrupted by a signal. Returns 0 with *status
// filled in, or -1 with errno set.
static int wait_for_child(pid_t pid, int* status) {
  for (;;) {
    if (waitpid(pid, status, 0) != -1) {
      return 0;
    }
    if (errno != EINTR) {
      return -1;
    }
  }
}

// Unshare the calling process into a new user namespace and map UIDs/GIDs into
// it with /usr/bin/newuidmap and /usr/bin/newgidmap.
//
// All arguments are NUL-terminated strings that are passed through verbatim as
// argv entries: `pid_str` is the target pid argument for both helpers, and the
// `*_outside_root`, `*_outside_sub_start` and `*_len` values fill in the
// mapping arguments ("0 <outside_root> 1" and "1 <outside_sub_start> <len>").
//
// Returns:
//    0    on success
//   -1    if a syscall in the calling process failed; errno describes it
//   EIO   if the helper child did not exit cleanly with status 0 (the caller
//         treats any other non-zero value as a raw OS error code)
int unshare_userns(
    char* pid_str,
    char* uid_map_outside_root,
    char* uid_map_outside_sub_start,
    char* uid_map_len,
    char* gid_map_outside_root,
    char* gid_map_outside_sub_start,
    char* gid_map_len) {
  int pipefd[2];
  if (pipe(pipefd) == -1) {
    return -1;
  }

  pid_t child1 = fork();
  if (child1 == -1) {
    // close() may overwrite errno; keep the fork() error for the caller
    int saved_errno = errno;
    close(pipefd[0]);
    close(pipefd[1]);
    errno = saved_errno;
    return -1;
  }

  if (child1 == 0) {
    // In the child process, wait for the parent process to indicate that it
    // has unshared into a new user namespace, then setup the id mappings
    // using the new{ug}idmap binaries

    // close our copy of the write end, otherwise the read() below would never
    // see EOF
    if (close(pipefd[1]) == -1) {
      _exit(EXIT_FAILURE);
    }
    // this read() will complete as soon as the parent process closes its
    // write end of the pipe. Retry on EINTR so that a stray signal cannot let
    // us run ahead of the parent's unshare().
    char buf;
    ssize_t nread;
    do {
      nread = read(pipefd[0], &buf, 1);
    } while (nread == -1 && errno == EINTR);
    close(pipefd[0]);
    if (nread == -1) {
      _exit(EXIT_FAILURE);
    }

    pid_t child2 = fork();
    if (child2 == -1) {
      _exit(EXIT_FAILURE);
    }
    if (child2 == 0) {
      // do newgidmap first
      char* args[] = {
          "newgidmap",
          pid_str,
          "0",
          gid_map_outside_root,
          "1",
          "1",
          gid_map_outside_sub_start,
          gid_map_len,
          NULL};
      execv("/usr/bin/newgidmap", args);
      // execv() only returns on failure
      static const char msg[] = "exec newgidmap: errno ";
      child_report_errno(msg, sizeof(msg) - 1, errno);
      _exit(EXIT_FAILURE);
    }

    // wait for the newgidmap to finish
    int status = 0;
    if (wait_for_child(child2, &status) == -1) {
      _exit(EXIT_FAILURE);
    }
    if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
      _exit(EXIT_FAILURE);
    }

    // now the newgidmap is done, do newuidmap
    char* args[] = {
        "newuidmap",
        pid_str,
        "0",
        uid_map_outside_root,
        "1",
        "1",
        uid_map_outside_sub_start,
        uid_map_len,
        NULL};
    execv("/usr/bin/newuidmap", args);
    // execv() only returns on failure
    static const char msg[] = "exec newuidmap: errno ";
    child_report_errno(msg, sizeof(msg) - 1, errno);
    _exit(EXIT_FAILURE);
  }

  // In the parent process, we must unshare the user namespace FIRST and only
  // then signal the child process by closing our ends of the pipe. Closing the
  // write end any earlier would let the child run new{ug}idmap against a
  // process that has not created its namespace yet.
  if (unshare(CLONE_NEWUSER) == -1) {
    int saved_errno = errno;
    close(pipefd[1]);
    close(pipefd[0]);
    // Closing the pipe releases the child; reap it so it does not linger as a
    // zombie. Its exit status is irrelevant, we already know we failed.
    int ignored_status = 0;
    (void)wait_for_child(child1, &ignored_status);
    errno = saved_errno;
    return -1;
  }

  // Results of close() are deliberately ignored: on Linux the descriptor is
  // released even when close() reports an error, so the child is signalled
  // either way and we still have to wait for it below.
  close(pipefd[1]);
  close(pipefd[0]);

  // The child exiting signals that the namespace mapping is complete
  int status = 0;
  if (wait_for_child(child1, &status) == -1) {
    return -1;
  }
  if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
    // Report a real error code rather than the raw wait status, which the
    // caller would otherwise misinterpret as an errno value.
    return EIO;
  }
  return 0;
}

0 comments on commit d92a0ad

Please sign in to comment.