From af40f6186632d8d299e2ae9d0b87875e79ff9c7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Borys=20Pop=C5=82awski?= Date: Mon, 28 Nov 2022 13:18:18 +0100 Subject: [PATCH] [LibOS] Clean up `select` and `poll` implementations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Borys Popławski --- libos/include/libos_fs.h | 10 +- libos/include/libos_fs_mem.h | 2 +- libos/include/libos_table.h | 2 +- libos/src/fs/libos_fs_mem.c | 12 +- libos/src/fs/libos_fs_pseudo.c | 16 +- libos/src/fs/libos_fs_util.c | 15 +- libos/src/sys/libos_poll.c | 443 ++++++++++++++++++--------------- libos/test/ltp/ltp.cfg | 7 +- libos/test/regression/poll.c | 2 +- 9 files changed, 264 insertions(+), 245 deletions(-) diff --git a/libos/include/libos_fs.h b/libos/include/libos_fs.h index 760bb6d570..01e2d92762 100644 --- a/libos/include/libos_fs.h +++ b/libos/include/libos_fs.h @@ -21,10 +21,6 @@ struct libos_handle; -#define FS_POLL_RD 0x01 -#define FS_POLL_WR 0x02 -#define FS_POLL_ER 0x04 - /* Describes mount parameters. Passed to `mount_fs`, and to the `mount` callback. */ struct libos_mount_params { /* Filesystem type (corresponds to `name` field of `libos_fs` */ @@ -182,8 +178,8 @@ struct libos_fs_ops { int (*checkout)(struct libos_handle* hdl); int (*checkin)(struct libos_handle* hdl); - /* poll a single handle */ - int (*poll)(struct libos_handle* hdl, int poll_type); + /* Poll a single handle. Must not block. */ + int (*poll)(struct libos_handle* hdl, int in_events, int* out_events); /* checkpoint/migrate the file system */ ssize_t (*checkpoint)(void** checkpoint, void* mount_data); @@ -950,7 +946,7 @@ int generic_readdir(struct libos_dentry* dent, readdir_callback_t callback, void int generic_inode_stat(struct libos_dentry* dent, struct stat* buf); int generic_inode_hstat(struct libos_handle* hdl, struct stat* buf); file_off_t generic_inode_seek(struct libos_handle* hdl, file_off_t offset, int origin); -int generic_inode_poll(struct libos_handle* hdl, int poll_type); +int generic_inode_poll(struct libos_handle* hdl, int events, int* out_events); int generic_emulated_mmap(struct libos_handle* hdl, void* addr, size_t size, int prot, int flags, uint64_t offset); diff --git a/libos/include/libos_fs_mem.h b/libos/include/libos_fs_mem.h index 4e76a45cc0..9ce9e0e189 100644 --- a/libos/include/libos_fs_mem.h +++ b/libos/include/libos_fs_mem.h @@ -30,4 +30,4 @@ ssize_t mem_file_read(struct libos_mem_file* mem, file_off_t pos_start, void* bu ssize_t mem_file_write(struct libos_mem_file* mem, file_off_t pos_start, const void* buf, size_t size); int mem_file_truncate(struct libos_mem_file* mem, file_off_t size); -int mem_file_poll(struct libos_mem_file* mem, file_off_t pos, int poll_type); +int mem_file_poll(struct libos_mem_file* mem, file_off_t pos, int events, int* out_events); diff --git a/libos/include/libos_table.h b/libos/include/libos_table.h index ca6e065ef5..a06fd65911 100644 --- a/libos/include/libos_table.h +++ b/libos/include/libos_table.h @@ -182,7 +182,7 @@ long libos_syscall_fchmodat(int dfd, const char* filename, mode_t mode); long libos_syscall_fchownat(int dfd, const char* filename, uid_t user, gid_t group, int flags); long libos_syscall_faccessat(int dfd, const char* filename, mode_t mode); long libos_syscall_pselect6(int nfds, fd_set* readfds, fd_set* writefds, fd_set* exceptfds, - const struct __kernel_timespec* tsp, void* sigmask_argpack); + struct __kernel_timespec* tsp, void* sigmask_argpack); long libos_syscall_ppoll(struct pollfd* fds, unsigned int nfds, struct timespec* tsp, const __sigset_t* sigmask_ptr, size_t sigsetsize); long libos_syscall_set_robust_list(struct robust_list_head* head, size_t len); diff --git a/libos/src/fs/libos_fs_mem.c b/libos/src/fs/libos_fs_mem.c index f4ff92de1a..2df7bddada 100644 --- a/libos/src/fs/libos_fs_mem.c +++ b/libos/src/fs/libos_fs_mem.c @@ -112,11 +112,9 @@ int mem_file_truncate(struct libos_mem_file* mem, file_off_t size) { return 0; } -int mem_file_poll(struct libos_mem_file* mem, file_off_t pos, int poll_type) { - int ret = 0; - if ((poll_type & FS_POLL_RD) && (pos < mem->size)) - ret |= FS_POLL_RD; - if (poll_type & FS_POLL_WR) - ret |= FS_POLL_WR; - return ret; +int mem_file_poll(struct libos_mem_file* mem, file_off_t pos, int events, int* out_events) { + *out_events = events & (POLLOUT | POLLWRNORM); + if ((events & (POLLIN | POLLRDNORM)) && (pos < mem->size)) + *out_events |= events & (POLLIN | POLLRDNORM); + return 0; } diff --git a/libos/src/fs/libos_fs_pseudo.c b/libos/src/fs/libos_fs_pseudo.c index 0243858398..997f61527a 100644 --- a/libos/src/fs/libos_fs_pseudo.c +++ b/libos/src/fs/libos_fs_pseudo.c @@ -468,26 +468,26 @@ static int pseudo_close(struct libos_handle* hdl) { } } -static int pseudo_poll(struct libos_handle* hdl, int poll_type) { +static int pseudo_poll(struct libos_handle* hdl, int events, int* out_events) { struct pseudo_node* node = hdl->inode->data; switch (node->type) { case PSEUDO_STR: { assert(hdl->type == TYPE_STR); lock(&hdl->pos_lock); lock(&hdl->lock); - int ret = mem_file_poll(&hdl->info.str.mem, hdl->pos, poll_type); + int ret = mem_file_poll(&hdl->info.str.mem, hdl->pos, events, out_events); unlock(&hdl->lock); unlock(&hdl->pos_lock); return ret; } case PSEUDO_DEV: { - int ret = 0; - if ((poll_type & FS_POLL_RD) && node->dev.dev_ops.read) - ret |= FS_POLL_RD; - if ((poll_type & FS_POLL_WR) && node->dev.dev_ops.write) - ret |= FS_POLL_WR; - return ret; + *out_events = 0; + if ((events & POLLIN) && node->dev.dev_ops.read) + *out_events |= POLLIN; + if ((events & POLLOUT) && node->dev.dev_ops.write) + *out_events |= POLLOUT; + return 0; } default: diff --git a/libos/src/fs/libos_fs_util.c b/libos/src/fs/libos_fs_util.c index 0e8cfd9a55..0a2f047ec3 100644 --- a/libos/src/fs/libos_fs_util.c +++ b/libos/src/fs/libos_fs_util.c @@ -116,27 +116,16 @@ file_off_t generic_inode_seek(struct libos_handle* hdl, file_off_t offset, int o return ret; } -int generic_inode_poll(struct libos_handle* hdl, int poll_type) { +int generic_inode_poll(struct libos_handle* hdl, int events, int* out_events) { int ret; - lock(&hdl->pos_lock); - lock(&hdl->inode->lock); - if (hdl->inode->type == S_IFREG) { ret = 0; - if (poll_type & FS_POLL_WR) - ret |= FS_POLL_WR; - /* TODO: The `hdl->pos < hdl->inode->size` condition is wrong, the `poll` syscall treats - * end-of-file as readable. Check if removing this condition doesn't break anything - * in our `poll` implementation. */ - if ((poll_type & FS_POLL_RD) && hdl->pos < hdl->inode->size) - ret |= FS_POLL_RD; + *out_events = events & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM); } else { ret = -EAGAIN; } - unlock(&hdl->inode->lock); - unlock(&hdl->pos_lock); return ret; } diff --git a/libos/src/sys/libos_poll.c b/libos/src/sys/libos_poll.c index d528e47949..a10341b893 100644 --- a/libos/src/sys/libos_poll.c +++ b/libos/src/sys/libos_poll.c @@ -1,12 +1,13 @@ /* SPDX-License-Identifier: LGPL-3.0-or-later */ -/* Copyright (C) 2014 Stony Brook University */ +/* Copyright (C) 2014 Stony Brook University + * Copyright (C) 2022 Intel Corporation + * Borys Popławski + */ /* * Implementation of system calls "poll", "ppoll", "select" and "pselect6". */ -#include - #include "libos_fs.h" #include "libos_handle.h" #include "libos_internal.h" @@ -41,283 +42,309 @@ typedef unsigned long __fd_mask; #define __FD_CLR(d, set) ((void)(__FDS_BITS(set)[__FD_ELT(d)] &= ~__FD_MASK(d))) #define __FD_ISSET(d, set) ((__FDS_BITS(set)[__FD_ELT(d)] & __FD_MASK(d)) != 0) -static long _libos_syscall_poll(struct pollfd* fds, nfds_t nfds, uint64_t* timeout_us) { - if ((uint64_t)nfds > get_rlimit_cur(RLIMIT_NOFILE)) - return -EINVAL; - - struct libos_handle_map* map = get_cur_thread()->handle_map; - - /* nfds is the upper limit for actual number of handles */ - PAL_HANDLE* pals = malloc(nfds * sizeof(PAL_HANDLE)); - if (!pals) - return -ENOMEM; - - /* for bookkeeping, need to have a mapping FD -> {libos handle, index-in-pals} */ - struct fds_mapping_t { - struct libos_handle* hdl; /* NULL if no mapping (handle is not used in polling) */ - nfds_t idx; /* index from fds array to pals array */ - }; - struct fds_mapping_t* fds_mapping = malloc(nfds * sizeof(struct fds_mapping_t)); - if (!fds_mapping) { - free(pals); - return -ENOMEM; - } - - /* allocate one memory region to hold two pal_wait_flags_t arrays: events and revents */ - pal_wait_flags_t* pal_events = malloc(nfds * sizeof(*pal_events) * 2); - if (!pal_events) { - free(pals); - free(fds_mapping); +#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR) +#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) +#define POLLEX_SET (POLLPRI) + +static long do_poll(struct pollfd* fds, size_t fds_len, uint64_t* timeout_us) { + struct libos_handle** libos_handles = calloc(fds_len, sizeof(*libos_handles)); + PAL_HANDLE* pal_handles = calloc(fds_len, sizeof(*pal_handles)); + /* Double the amount of PAL events - one part are input events, the other - output. */ + pal_wait_flags_t* pal_events = calloc(2 * fds_len, sizeof(*pal_events)); + if (!libos_handles || !pal_handles || !pal_events) { + free(libos_handles); + free(pal_handles); + free(pal_events); return -ENOMEM; } - pal_wait_flags_t* ret_events = pal_events + nfds; - nfds_t pal_cnt = 0; - nfds_t nrevents = 0; + long ret; + size_t ret_events_count = 0; + struct libos_handle_map* map = get_cur_thread()->handle_map; lock(&map->lock); - /* collect PAL handles that correspond to user-supplied FDs (only those that can be polled) */ - for (nfds_t i = 0; i < nfds; i++) { - fds[i].revents = 0; - fds_mapping[i].hdl = NULL; - + for (size_t i = 0; i < fds_len; i++) { if (fds[i].fd < 0) { - /* FD is negative, must be ignored */ + /* Negative file descriptors are ignored. */ + fds[i].revents = 0; continue; } - struct libos_handle* hdl = __get_fd_handle(fds[i].fd, NULL, map); - if (!hdl || !hdl->fs || !hdl->fs->fs_ops) { - /* The corresponding handle doesn't exist or doesn't provide FS-like semantics; do not - * include it in handles-to-poll array but notify user about invalid request. */ + struct libos_handle* handle = __get_fd_handle(fds[i].fd, NULL, map); + if (!handle) { fds[i].revents = POLLNVAL; - nrevents++; + ret_events_count++; continue; } - if (hdl->type == TYPE_CHROOT || hdl->type == TYPE_DEV || hdl->type == TYPE_STR || - hdl->type == TYPE_TMPFS) { - /* Files, devs and strings are special cases: their poll is emulated at LibOS level; do - * not include them in handles-to-poll array but instead use handle-specific - * callback. - * - * TODO: we probably should use the poll() callback in all cases. */ - int libos_events = 0; - if ((fds[i].events & (POLLIN | POLLRDNORM)) && (hdl->acc_mode & MAY_READ)) - libos_events |= FS_POLL_RD; - if ((fds[i].events & (POLLOUT | POLLWRNORM)) && (hdl->acc_mode & MAY_WRITE)) - libos_events |= FS_POLL_WR; - - int libos_revents = hdl->fs->fs_ops->poll(hdl, libos_events); + int events = fds[i].events; + if (!(handle->acc_mode & MAY_READ)) { + events &= ~(POLLIN | POLLRDNORM); + } + if (!(handle->acc_mode & MAY_WRITE)) { + events &= ~(POLLOUT | POLLWRNORM); + } - fds[i].revents = 0; - if (libos_revents & FS_POLL_RD) - fds[i].revents |= fds[i].events & (POLLIN | POLLRDNORM); - if (libos_revents & FS_POLL_WR) - fds[i].revents |= fds[i].events & (POLLOUT | POLLWRNORM); + if (handle->fs && handle->fs->fs_ops && handle->fs->fs_ops->poll) { + ret = handle->fs->fs_ops->poll(handle, events, &events); + /* + * FIXME: remove this hack. + * Initial 0,1,2 fds in Gramine are represented by "/dev/tty" (whatever that means) + * and have `generic_inode_poll` set as poll callback, which returns `-EAGAIN` on + * non-regular-file handles. In such case we let PAL do the actuall polling. + */ + if (ret == -EAGAIN && handle->uri && !strcmp(handle->uri, "dev:tty")) { + goto dev_tty_hack; + } + + if (ret < 0) { + unlock(&map->lock); + goto out; + } + + fds[i].revents = events; + if (events) { + ret_events_count++; + } - if (fds[i].revents) - nrevents++; continue; + + dev_tty_hack:; } PAL_HANDLE pal_handle; - if (hdl->type == TYPE_SOCK) { - pal_handle = __atomic_load_n(&hdl->info.sock.pal_handle, __ATOMIC_ACQUIRE); + if (handle->type == TYPE_SOCK) { + pal_handle = __atomic_load_n(&handle->info.sock.pal_handle, __ATOMIC_ACQUIRE); + if (!pal_handle) { + /* UNIX sockets that are still not connected have no `pal_handle`. */ + fds[i].revents = POLLHUP; + ret_events_count++; + continue; + } } else { - pal_handle = hdl->pal_handle; - } - - if (!pal_handle) { - fds[i].revents = POLLNVAL; - nrevents++; - continue; + pal_handle = handle->pal_handle; + if (!pal_handle) { + fds[i].revents = POLLNVAL; + ret_events_count++; + continue; + } } - pal_wait_flags_t allowed_events = 0; - if ((fds[i].events & (POLLIN | POLLRDNORM)) && (hdl->acc_mode & MAY_READ)) - allowed_events |= PAL_WAIT_READ; - if ((fds[i].events & (POLLOUT | POLLWRNORM)) && (hdl->acc_mode & MAY_WRITE)) - allowed_events |= PAL_WAIT_WRITE; - - if ((fds[i].events & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) && !allowed_events) { - /* If user requested read/write events but they are not allowed on this handle, ignore - * this handle (but note that user may only be interested in errors, and this is a valid - * request). */ - continue; - } + if (events & (POLLIN | POLLRDNORM)) + pal_events[i] |= PAL_WAIT_READ; + if (events & (POLLOUT | POLLWRNORM)) + pal_events[i] |= PAL_WAIT_WRITE; - get_handle(hdl); - fds_mapping[i].hdl = hdl; - fds_mapping[i].idx = pal_cnt; - pals[pal_cnt] = pal_handle; - pal_events[pal_cnt] = allowed_events; - ret_events[pal_cnt] = 0; - pal_cnt++; + libos_handles[i] = handle; + get_handle(handle); + pal_handles[i] = pal_handle; } unlock(&map->lock); - bool polled = false; - long error = 0; - if (pal_cnt) { - error = PalStreamsWaitEvents(pal_cnt, pals, pal_events, ret_events, timeout_us); - polled = error == 0; - error = pal_to_unix_errno(error); + uint64_t tmp_timeout_us = 0; + if (ret_events_count) { + /* If we already have events to return, we should not sleep below. */ + timeout_us = &tmp_timeout_us; } - for (nfds_t i = 0; i < nfds; i++) { - if (!fds_mapping[i].hdl) - continue; + pal_wait_flags_t* ret_events = pal_events + fds_len; + ret = PalStreamsWaitEvents(fds_len, pal_handles, pal_events, ret_events, timeout_us); + if (ret < 0) { + ret = pal_to_unix_errno(ret); + if (ret == -EAGAIN) { + /* Timeout - return number of already seen events, which might be 0. */ + ret = ret_events_count; + } + goto out; + } - /* update fds.revents, but only if something was actually polled */ - if (polled) { - fds[i].revents = 0; - if (ret_events[fds_mapping[i].idx] & PAL_WAIT_ERROR) - fds[i].revents |= POLLERR | POLLHUP; - if (ret_events[fds_mapping[i].idx] & PAL_WAIT_READ) - fds[i].revents |= fds[i].events & (POLLIN | POLLRDNORM); - if (ret_events[fds_mapping[i].idx] & PAL_WAIT_WRITE) - fds[i].revents |= fds[i].events & (POLLOUT | POLLWRNORM); - - if (fds[i].revents) - nrevents++; + for (size_t i = 0; i < fds_len; i++) { + if (!libos_handles[i]) { + continue; } - put_handle(fds_mapping[i].hdl); + fds[i].revents = 0; + if (ret_events[i] & PAL_WAIT_ERROR) + fds[i].revents |= POLLERR | POLLHUP; + if (ret_events[i] & PAL_WAIT_READ) + fds[i].revents |= fds[i].events & (POLLIN | POLLRDNORM); + if (ret_events[i] & PAL_WAIT_WRITE) + fds[i].revents |= fds[i].events & (POLLOUT | POLLWRNORM); + + if (fds[i].revents) + ret_events_count++; } - free(pals); + ret = ret_events_count; + +out: + for (size_t i = 0; i < fds_len; i++) { + if (libos_handles[i]) { + put_handle(libos_handles[i]); + } + } + free(libos_handles); + free(pal_handles); free(pal_events); - free(fds_mapping); - if (error == -EAGAIN) { - /* `poll` returns 0 on timeout. */ - error = 0; - } else if (error == -EINTR) { + if (ret == -EINTR) { /* `poll`, `ppoll`, `select` and `pselect` are not restarted after being interrupted by * a signal handler. */ - error = -ERESTARTNOHAND; + ret = -ERESTARTNOHAND; } - return nrevents ? (long)nrevents : error; + return ret; } long libos_syscall_poll(struct pollfd* fds, unsigned int nfds, int timeout_ms) { + if (nfds > get_rlimit_cur(RLIMIT_NOFILE) || nfds > INT_MAX) + return -EINVAL; + if (!is_user_memory_writable(fds, nfds * sizeof(*fds))) return -EFAULT; uint64_t timeout_us = (unsigned int)timeout_ms * TIME_US_IN_MS; - return _libos_syscall_poll(fds, nfds, timeout_ms < 0 ? NULL : &timeout_us); + return do_poll(fds, nfds, timeout_ms < 0 ? NULL : &timeout_us); } long libos_syscall_ppoll(struct pollfd* fds, unsigned int nfds, struct timespec* tsp, const __sigset_t* sigmask_ptr, size_t sigsetsize) { + if (nfds > get_rlimit_cur(RLIMIT_NOFILE) || nfds > INT_MAX) + return -EINVAL; + if (!is_user_memory_writable(fds, nfds * sizeof(*fds))) { return -EFAULT; } - int ret = set_user_sigmask(sigmask_ptr, sigsetsize); + long ret = set_user_sigmask(sigmask_ptr, sigsetsize); if (ret < 0) { return ret; } - uint64_t timeout_us = tsp ? tsp->tv_sec * TIME_US_IN_S + tsp->tv_nsec / TIME_NS_IN_US : 0; - return _libos_syscall_poll(fds, nfds, tsp ? &timeout_us : NULL); -} - -long libos_syscall_select(int nfds, fd_set* readfds, fd_set* writefds, fd_set* errorfds, - struct __kernel_timeval* tsv) { - if (tsv && (tsv->tv_sec < 0 || tsv->tv_usec < 0)) - return -EINVAL; - - if (nfds < 0 || (uint64_t)nfds > get_rlimit_cur(RLIMIT_NOFILE)) - return -EINVAL; + uint64_t timeout_us = 0; + if (tsp) { + if (!is_user_memory_readable(tsp, sizeof(*tsp))) { + return -EFAULT; + } + if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || (unsigned long)tsp->tv_nsec >= TIME_NS_IN_S) { + return -EINVAL; + } + timeout_us = tsp->tv_sec * TIME_US_IN_S + tsp->tv_nsec / TIME_NS_IN_US; + } - if (!nfds) { - if (!tsv) - return libos_syscall_pause(); + ret = do_poll(fds, nfds, tsp ? &timeout_us : NULL); - /* special case of select(0, ..., tsv) used for sleep */ - return do_nanosleep(tsv->tv_sec * TIME_US_IN_S + tsv->tv_usec, NULL); + if (tsp) { + if (is_user_memory_writable_no_skip(tsp, sizeof(*tsp))) { + tsp->tv_sec = timeout_us / TIME_US_IN_S; + tsp->tv_nsec = (timeout_us % TIME_US_IN_S) * TIME_NS_IN_US; + } } + return ret; +} - if (nfds < __NFDBITS) { - /* interesting corner case: Linux always checks at least 64 first FDs */ - nfds = __NFDBITS; +static long do_select(int nfds, fd_set* read_set, fd_set* write_set, fd_set* except_set, + uint64_t* timeout_us) { + if (nfds < 0 || (uint64_t)nfds > get_rlimit_cur(RLIMIT_NOFILE) || nfds > INT_MAX) + return -EINVAL; + + size_t total_fds = 0; + for (size_t i = 0; i < (size_t)nfds; i++) { + if ((read_set && __FD_ISSET(i, read_set)) || (write_set && __FD_ISSET(i, write_set)) + || (except_set && __FD_ISSET(i, except_set))) { + total_fds++; + } } - /* nfds is the upper limit for actual number of fds for poll */ - struct pollfd* fds_poll = malloc(nfds * sizeof(struct pollfd)); - if (!fds_poll) + struct pollfd* poll_fds = malloc(total_fds * sizeof(*poll_fds)); + if (!poll_fds) return -ENOMEM; - /* populate array of pollfd's based on user-supplied readfds & writefds */ - nfds_t nfds_poll = 0; - for (int fd = 0; fd < nfds; fd++) { + long ret; + size_t poll_fds_idx = 0; + for (size_t i = 0; i < (size_t)nfds; i++) { short events = 0; - if (readfds && __FD_ISSET(fd, readfds)) - events |= POLLIN; - if (writefds && __FD_ISSET(fd, writefds)) - events |= POLLOUT; + if (read_set && __FD_ISSET(i, read_set)) { + events |= POLLIN_SET; + } + if (write_set && __FD_ISSET(i, write_set)) { + events |= POLLOUT_SET; + } + if (except_set && __FD_ISSET(i, except_set)) { + events |= POLLEX_SET; + } if (!events) continue; - fds_poll[nfds_poll].fd = fd; - fds_poll[nfds_poll].events = events; - fds_poll[nfds_poll].revents = 0; - nfds_poll++; - } - - /* select()/pselect() return -EBADF if invalid FD was given by user in readfds/writefds; - * note that poll()/ppoll() don't have this error code, so we return this code only here */ - struct libos_handle_map* map = get_cur_thread()->handle_map; - lock(&map->lock); - for (nfds_t i = 0; i < nfds_poll; i++) { - struct libos_handle* hdl = __get_fd_handle(fds_poll[i].fd, NULL, map); - if (!hdl || !hdl->fs || !hdl->fs->fs_ops) { - /* the corresponding handle doesn't exist or doesn't provide FS-like semantics */ - free(fds_poll); - unlock(&map->lock); - return -EBADF; + if (poll_fds_idx == total_fds) { + log_error("User app is buggy and changed `select` fds sets concurrently!"); + ret = -EAGAIN; + goto out; } + + poll_fds[poll_fds_idx] = (struct pollfd){ + .fd = i, + .events = events, + }; + poll_fds_idx++; } - unlock(&map->lock); - uint64_t timeout_us = tsv ? tsv->tv_sec * TIME_US_IN_S + tsv->tv_usec : 0; - long ret = _libos_syscall_poll(fds_poll, nfds_poll, tsv ? &timeout_us : NULL); + if (poll_fds_idx != total_fds) { + log_error("User app is buggy and changed `select` fds sets concurrently!"); + ret = -EAGAIN; + goto out; + } + ret = do_poll(poll_fds, total_fds, timeout_us); if (ret < 0) { - free(fds_poll); - return ret; + goto out; } - /* modify readfds, writefds, and errorfds in-place with returned events */ - if (readfds) - __FD_ZERO(readfds); - if (writefds) - __FD_ZERO(writefds); - if (errorfds) - __FD_ZERO(errorfds); - - ret = 0; - for (nfds_t i = 0; i < nfds_poll; i++) { - if (readfds && (fds_poll[i].revents & POLLIN)) { - __FD_SET(fds_poll[i].fd, readfds); - ret++; + /* `select` modifies read_set, write_set and except_set in-place. */ + for (size_t i = 0; i < total_fds; i++) { + if (poll_fds[i].revents & POLLNVAL) { + /* `select` returns error on invalid fds, but also fills sets. */ + ret = -EBADF; + continue; + } + + if (read_set && !(poll_fds[i].revents & POLLIN_SET)) { + __FD_CLR(poll_fds[i].fd, read_set); } - if (writefds && (fds_poll[i].revents & POLLOUT)) { - __FD_SET(fds_poll[i].fd, writefds); - ret++; + if (write_set && !(poll_fds[i].revents & POLLOUT_SET)) { + __FD_CLR(poll_fds[i].fd, write_set); + } + if (except_set && !(poll_fds[i].revents & POLLEX_SET)) { + __FD_CLR(poll_fds[i].fd, except_set); + } + } + +out: + free(poll_fds); + return ret; +} + +long libos_syscall_select(int nfds, fd_set* read_set, fd_set* write_set, fd_set* except_set, + struct __kernel_timeval* tv) { + uint64_t timeout_us = 0; + if (tv) { + if (!is_user_memory_readable(tv, sizeof(*tv))) { + return -EFAULT; } - if (errorfds && (fds_poll[i].revents & POLLERR)) { - __FD_SET(fds_poll[i].fd, errorfds); - ret++; + if (tv->tv_sec < 0 || tv->tv_usec < 0 || (unsigned long)tv->tv_usec >= TIME_US_IN_S) { + return -EINVAL; } + timeout_us = tv->tv_sec * TIME_US_IN_S + tv->tv_usec; } - free(fds_poll); + long ret = do_select(nfds, read_set, write_set, except_set, tv ? &timeout_us : NULL); + + if (tv && is_user_memory_writable_no_skip(tv, sizeof(*tv))) { + tv->tv_sec = timeout_us / TIME_US_IN_S; + tv->tv_usec = timeout_us % TIME_US_IN_S; + } return ret; } @@ -326,8 +353,8 @@ struct sigset_argpack { size_t size; }; -long libos_syscall_pselect6(int nfds, fd_set* readfds, fd_set* writefds, fd_set* errorfds, - const struct __kernel_timespec* tsp, void* _sigmask_argpack) { +long libos_syscall_pselect6(int nfds, fd_set* read_set, fd_set* write_set, fd_set* except_set, + struct __kernel_timespec* tsp, void* _sigmask_argpack) { struct sigset_argpack* sigmask_argpack = _sigmask_argpack; if (sigmask_argpack) { if (!is_user_memory_readable(sigmask_argpack, sizeof(*sigmask_argpack))) { @@ -339,12 +366,22 @@ long libos_syscall_pselect6(int nfds, fd_set* readfds, fd_set* writefds, fd_set* } } + uint64_t timeout_us = 0; if (tsp) { - struct __kernel_timeval tsv; - tsv.tv_sec = tsp->tv_sec; - tsv.tv_usec = tsp->tv_nsec / 1000; - return libos_syscall_select(nfds, readfds, writefds, errorfds, &tsv); + if (!is_user_memory_readable(tsp, sizeof(*tsp))) { + return -EFAULT; + } + if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || (unsigned long)tsp->tv_nsec >= TIME_NS_IN_S) { + return -EINVAL; + } + timeout_us = tsp->tv_sec * TIME_US_IN_S + tsp->tv_nsec / TIME_NS_IN_US; } - return libos_syscall_select(nfds, readfds, writefds, errorfds, NULL); + long ret = do_select(nfds, read_set, write_set, except_set, tsp ? &timeout_us : NULL); + + if (tsp && is_user_memory_writable_no_skip(tsp, sizeof(*tsp))) { + tsp->tv_sec = timeout_us / TIME_US_IN_S; + tsp->tv_nsec = (timeout_us % TIME_US_IN_S) * TIME_NS_IN_US; + } + return ret; } diff --git a/libos/test/ltp/ltp.cfg b/libos/test/ltp/ltp.cfg index 28c833e223..7efd278e73 100644 --- a/libos/test/ltp/ltp.cfg +++ b/libos/test/ltp/ltp.cfg @@ -1360,6 +1360,9 @@ skip = yes [personality01] skip = yes +[personality02] +skip = yes + # no pidfd_open() [pidfd_open*] skip = yes @@ -1432,10 +1435,6 @@ skip = yes [posix_fadvise04_64] skip = yes -# some tests fail with incorrect revents and errnos, need further investigation -[ppoll01] -skip = yes - [prctl01] skip = yes diff --git a/libos/test/regression/poll.c b/libos/test/regression/poll.c index 0e8722f5ba..e9f576a75b 100644 --- a/libos/test/regression/poll.c +++ b/libos/test/regression/poll.c @@ -109,7 +109,7 @@ int main(int argc, char** argv) { test_file(argv[0], O_RDONLY, POLLIN, POLLIN, 0, 0); /* Host file (empty) */ - test_file("tmp/host_file", O_RDWR | O_CREAT | O_TRUNC, POLLIN, 0, POLLOUT, POLLOUT); + test_file("tmp/host_file", O_RDWR | O_CREAT | O_TRUNC, POLLIN, POLLIN, POLLOUT, POLLOUT); printf("TEST OK\n"); return 0;