diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt index 8390549235304f..7ae111fd02e72c 100644 --- a/Documentation/admin-guide/devices.txt +++ b/Documentation/admin-guide/devices.txt @@ -376,8 +376,9 @@ 240 = /dev/userio Serio driver testing device 241 = /dev/vhost-vsock Host kernel driver for virtio vsock 242 = /dev/rfkill Turning off radio transmissions (rfkill) + 243 = /dev/winesync Wine synchronization primitive device - 243-254 Reserved for local use + 244-254 Reserved for local use 255 Reserved for MISC_DYNAMIC_MINOR 11 char Raw keyboard device (Linux/SPARC only) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8d2f9ed3f1076e..4da131bf03d2ba 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4310,6 +4310,15 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. + pcie_acs_override = + [PCIE] Override missing PCIe ACS support for: + downstream + All downstream ports - full ACS capabilities + multifunction + All multifunction devices - multifunction ACS subset + id:nnnn:nnnn + Specific device - full ACS capabilities + Specified as vid:did (vendor/device ID) in hex noioapicquirk [APIC] Disable all boot interrupt quirks. Safety option to keep boot IRQs enabled. This should never be necessary. diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index 72a65db0c49889..ce5d0df572ebd1 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -32,6 +32,7 @@ place where this information is gathered. sysfs-platform_profile vduse futex2 + winesync .. only:: subproject and html diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 4ea5b837399ad1..825e95da7e95eb 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -375,6 +375,8 @@ Code Seq# Include File Comments 0xF6 all LTTng Linux Trace Toolkit Next Generation +0xF7 00-0F uapi/linux/winesync.h Wine synchronization primitives + 0xF8 all arch/x86/include/uapi/asm/amd_hsmp.h AMD HSMP EPYC system management interface driver 0xFD all linux/dm-ioctl.h diff --git a/Documentation/userspace-api/winesync.rst b/Documentation/userspace-api/winesync.rst new file mode 100644 index 00000000000000..f0110d2744c709 --- /dev/null +++ b/Documentation/userspace-api/winesync.rst @@ -0,0 +1,444 @@ +===================================== +Wine synchronization primitive driver +===================================== + +This page documents the user-space API for the winesync driver. + +winesync is a support driver for emulation of NT synchronization +primitives by the Wine project or other NT emulators. It exists +because implementation in user-space, using existing tools, cannot +simultaneously satisfy performance, correctness, and security +constraints. It is implemented entirely in software, and does not +drive any hardware device. + +This interface is meant as a compatibility tool only, and should not +be used for general synchronization. Instead use generic, versatile +interfaces such as futex(2) and poll(2). + +Synchronization primitives +========================== + +The winesync driver exposes three types of synchronization primitives: +semaphores, mutexes, and events. 
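+
+The ioctls that operate on these objects are documented in detail
+below. Purely as orientation, a hypothetical user-space sequence that
+creates one object of each type through the character device might
+look like the following (using the ``<linux/winesync.h>`` UAPI header
+added by this patch; error handling is omitted)::
+
+	#include <fcntl.h>
+	#include <sys/ioctl.h>
+	#include <linux/winesync.h>
+
+	struct winesync_sem_args sem_args = { .count = 0, .max = 2 };
+	struct winesync_mutex_args mutex_args = { .owner = 0, .count = 0 };
+	struct winesync_event_args event_args = { .manual = 1, .signaled = 0 };
+	int fd = open("/dev/winesync", O_RDWR | O_CLOEXEC);
+
+	ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args);
+	ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args);
+	ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args);
+
+	/* sem_args.sem, mutex_args.mutex and event_args.event now hold the
+	 * identifiers of the newly created objects. */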
+
+A semaphore holds a single volatile 32-bit counter, and a static
+32-bit integer denoting the maximum value. It is considered signaled
+when the counter is nonzero. The counter is decremented by one when a
+wait is satisfied. Both the initial and maximum count are established
+when the semaphore is created.
+
+A mutex holds a volatile 32-bit recursion count, and a volatile 32-bit
+identifier denoting its owner. A mutex is considered signaled when its
+owner is zero (indicating that it is not owned). The recursion count
+is incremented when a wait is satisfied, and ownership is set to the
+given identifier.
+
+A mutex also holds an internal flag denoting whether its previous
+owner has died; such a mutex is said to be inconsistent. Owner death
+is not tracked automatically based on thread death, but rather must be
+communicated using ``WINESYNC_IOC_KILL_OWNER``. An inconsistent mutex
+is inherently considered unowned.
+
+Except for the "unowned" semantics of zero, the actual value of the
+owner identifier is not interpreted by the winesync driver at all. The
+intended use is to store a thread identifier; however, the winesync
+driver does not actually validate that a calling thread provides
+consistent or unique identifiers.
+
+An event holds a volatile boolean state denoting whether it is
+signaled or not. There are two types of events, auto-reset and
+manual-reset. An auto-reset event is designaled when a wait is
+satisfied; a manual-reset event is not. The event type is specified
+when the event is created.
+
+Unless specified otherwise, all operations on an object are atomic and
+totally ordered with respect to other operations on the same object.
+
+Objects are represented by unsigned 32-bit integers.
+
+Char device
+===========
+
+The winesync driver creates a single char device /dev/winesync. Each
+file description opened on the device represents a unique namespace.
+That is, objects created on one open file description are shared
+across all its individual descriptors, but are not shared with other
+open() calls on the same device. The same file description may be
+shared across multiple processes.
+
+ioctl reference
+===============
+
+All operations on the device are done through ioctls. There are four
+structures used in ioctl calls::
+
+	struct winesync_sem_args {
+		__u32 sem;
+		__u32 count;
+		__u32 max;
+	};
+
+	struct winesync_mutex_args {
+		__u32 mutex;
+		__u32 owner;
+		__u32 count;
+	};
+
+	struct winesync_event_args {
+		__u32 event;
+		__u32 signaled;
+		__u32 manual;
+	};
+
+	struct winesync_wait_args {
+		__u64 timeout;
+		__u64 objs;
+		__u32 count;
+		__u32 owner;
+		__u32 index;
+		__u32 alert;
+	};
+
+Depending on the ioctl, members of the structure may be used as input,
+output, or not at all. All ioctls return 0 on success.
+
+The ioctls are as follows:
+
+.. c:macro:: WINESYNC_IOC_CREATE_SEM
+
+  Create a semaphore object. Takes a pointer to struct
+  :c:type:`winesync_sem_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``sem``
+       - On output, contains the identifier of the created semaphore.
+     * - ``count``
+       - Initial count of the semaphore.
+     * - ``max``
+       - Maximum count of the semaphore.
+
+  Fails with ``EINVAL`` if ``count`` is greater than ``max``.
+
+.. c:macro:: WINESYNC_IOC_CREATE_MUTEX
+
+  Create a mutex object. Takes a pointer to struct
+  :c:type:`winesync_mutex_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``mutex``
+       - On output, contains the identifier of the created mutex.
+     * - ``count``
+       - Initial recursion count of the mutex.
+ * - ``owner`` + - Initial owner of the mutex. + + If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is + zero and ``count`` is nonzero, the function fails with ``EINVAL``. + +.. c:macro:: WINESYNC_IOC_CREATE_EVENT + + Create an event object. Takes a pointer to struct + :c:type:`winesync_event_args`, which is used as follows: + + .. list-table:: + + * - ``event`` + - On output, contains the identifier of the created event. + * - ``signaled`` + - If nonzero, the event is initially signaled, otherwise + nonsignaled. + * - ``manual`` + - If nonzero, the event is a manual-reset event, otherwise + auto-reset. + +.. c:macro:: WINESYNC_IOC_DELETE + + Delete an object of any type. Takes an input-only pointer to a + 32-bit integer denoting the object to delete. + + Wait ioctls currently in progress are not interrupted, and behave as + if the object remains valid. + +.. c:macro:: WINESYNC_IOC_PUT_SEM + + Post to a semaphore object. Takes a pointer to struct + :c:type:`winesync_sem_args`, which is used as follows: + + .. list-table:: + + * - ``sem`` + - Semaphore object to post to. + * - ``count`` + - Count to add to the semaphore. On output, contains the + previous count of the semaphore. + * - ``max`` + - Not used. + + If adding ``count`` to the semaphore's current count would raise the + latter past the semaphore's maximum count, the ioctl fails with + ``EOVERFLOW`` and the semaphore is not affected. If raising the + semaphore's count causes it to become signaled, eligible threads + waiting on this semaphore will be woken and the semaphore's count + decremented appropriately. + +.. c:macro:: WINESYNC_IOC_PUT_MUTEX + + Release a mutex object. Takes a pointer to struct + :c:type:`winesync_mutex_args`, which is used as follows: + + .. list-table:: + + * - ``mutex`` + - Mutex object to release. + * - ``owner`` + - Mutex owner identifier. + * - ``count`` + - On output, contains the previous recursion count. + + If ``owner`` is zero, the ioctl fails with ``EINVAL``. If ``owner`` + is not the current owner of the mutex, the ioctl fails with + ``EPERM``. + + The mutex's count will be decremented by one. If decrementing the + mutex's count causes it to become zero, the mutex is marked as + unowned and signaled, and eligible threads waiting on it will be + woken as appropriate. + +.. c:macro:: WINESYNC_IOC_SET_EVENT + + Signal an event object. Takes a pointer to struct + :c:type:`winesync_event_args`, which is used as follows: + + .. list-table:: + + * - ``event`` + - Event object to set. + * - ``signaled`` + - On output, contains the previous state of the event. + * - ``manual`` + - Unused. + + Eligible threads will be woken, and auto-reset events will be + designaled appropriately. + +.. c:macro:: WINESYNC_IOC_RESET_EVENT + + Designal an event object. Takes a pointer to struct + :c:type:`winesync_event_args`, which is used as follows: + + .. list-table:: + + * - ``event`` + - Event object to reset. + * - ``signaled`` + - On output, contains the previous state of the event. + * - ``manual`` + - Unused. + +.. c:macro:: WINESYNC_IOC_PULSE_EVENT + + Wake threads waiting on an event object without leaving it in a + signaled state. Takes a pointer to struct + :c:type:`winesync_event_args`, which is used as follows: + + .. list-table:: + + * - ``event`` + - Event object to pulse. + * - ``signaled`` + - On output, contains the previous state of the event. + * - ``manual`` + - Unused. + + A pulse operation can be thought of as a set followed by a reset, + performed as a single atomic operation. 
If two threads are waiting + on an auto-reset event which is pulsed, only one will be woken. If + two threads are waiting a manual-reset event which is pulsed, both + will be woken. However, in both cases, the event will be unsignaled + afterwards, and a simultaneous read operation will always report the + event as unsignaled. + +.. c:macro:: WINESYNC_IOC_READ_SEM + + Read the current state of a semaphore object. Takes a pointer to + struct :c:type:`winesync_sem_args`, which is used as follows: + + .. list-table:: + + * - ``sem`` + - Semaphore object to read. + * - ``count`` + - On output, contains the current count of the semaphore. + * - ``max`` + - On output, contains the maximum count of the semaphore. + +.. c:macro:: WINESYNC_IOC_READ_MUTEX + + Read the current state of a mutex object. Takes a pointer to struct + :c:type:`winesync_mutex_args`, which is used as follows: + + .. list-table:: + + * - ``mutex`` + - Mutex object to read. + * - ``owner`` + - On output, contains the current owner of the mutex, or zero + if the mutex is not currently owned. + * - ``count`` + - On output, contains the current recursion count of the mutex. + + If the mutex is marked as inconsistent, the function fails with + ``EOWNERDEAD``. In this case, ``count`` and ``owner`` are set to + zero. + +.. c:macro:: WINESYNC_IOC_READ_EVENT + + Read the current state of an event object. Takes a pointer to struct + :c:type:`winesync_event_args`, which is used as follows: + + .. list-table:: + + * - ``event`` + - Event object. + * - ``signaled`` + - On output, contains the current state of the event. + * - ``manual`` + - On output, contains 1 if the event is a manual-reset event, + and 0 otherwise. + +.. c:macro:: WINESYNC_IOC_KILL_OWNER + + Mark any mutexes owned by the given owner as unowned and + inconsistent. Takes an input-only pointer to a 32-bit integer + denoting the owner. If the owner is zero, the ioctl fails with + ``EINVAL``. + + For each mutex currently owned by the given owner, eligible threads + waiting on said mutex will be woken as appropriate (and such waits + will fail with ``EOWNERDEAD``, as described below). + + The operation as a whole is not atomic; however, the modification of + each mutex is atomic and totally ordered with respect to other + operations on the same mutex. + +.. c:macro:: WINESYNC_IOC_WAIT_ANY + + Poll on any of a list of objects, atomically acquiring at most one. + Takes a pointer to struct :c:type:`winesync_wait_args`, which is + used as follows: + + .. list-table:: + + * - ``timeout`` + - Optional pointer to a 64-bit struct :c:type:`timespec` + (specified as an integer so that the structure has the same + size regardless of architecture). The timeout is specified in + absolute format, as measured against the MONOTONIC clock. If + the timeout is equal to or earlier than the current time, the + function returns immediately without sleeping. If ``timeout`` + is zero, i.e. NULL, the function will sleep until an object + is signaled, and will not fail with ``ETIMEDOUT``. + * - ``objs`` + - Pointer to an array of ``count`` 32-bit object identifiers + (specified as an integer so that the structure has the same + size regardless of architecture). If any identifier is + invalid, the function fails with ``EINVAL``. + * - ``count`` + - Number of object identifiers specified in the ``objs`` array. + * - ``owner`` + - Mutex owner identifier. If any object in ``objs`` is a mutex, + the ioctl will attempt to acquire that mutex on behalf of + ``owner``. 
If ``owner`` is zero, the ioctl fails with + ``EINVAL``. + * - ``index`` + - On success, contains the index (into ``objs``) of the object + which was signaled. If ``alert`` was signaled instead, + this contains ``count``. + * - ``alert`` + - Optional event object identifier. If nonzero, this specifies + an "alert" event object which, if signaled, will terminate + the wait. If nonzero, the identifier must point to a valid + event. + + This function attempts to acquire one of the given objects. If + unable to do so, it sleeps until an object becomes signaled, + subsequently acquiring it, or the timeout expires. In the latter + case the ioctl fails with ``ETIMEDOUT``. The function only acquires + one object, even if multiple objects are signaled. + + A semaphore is considered to be signaled if its count is nonzero, + and is acquired by decrementing its count by one. A mutex is + considered to be signaled if it is unowned or if its owner matches + the ``owner`` argument, and is acquired by incrementing its + recursion count by one and setting its owner to the ``owner`` + argument. An auto-reset event is acquired by designaling it; a + manual-reset event is not affected by acquisition. + + Acquisition is atomic and totally ordered with respect to other + operations on the same object. If two wait operations (with + different ``owner`` identifiers) are queued on the same mutex, only + one is signaled. If two wait operations are queued on the same + semaphore, and a value of one is posted to it, only one is signaled. + The order in which threads are signaled is not specified. + + If an inconsistent mutex is acquired, the ioctl fails with + ``EOWNERDEAD``. Although this is a failure return, the function may + otherwise be considered successful. The mutex is marked as owned by + the given owner (with a recursion count of 1) and as no longer + inconsistent, and ``index`` is still set to the index of the mutex. + + The ``alert`` argument is an "extra" event which can terminate the + wait, independently of all other objects. If members of ``objs`` and + ``alert`` are both simultaneously signaled, a member of ``objs`` + will always be given priority and acquired first. Aside from this, + for "any" waits, there is no difference between passing an event as + this parameter, and passing it as an additional object at the end of + the ``objs`` array. For "all" waits, there is an additional + difference, as described below. + + It is valid to pass the same object more than once, including by + passing the same event in the ``objs`` array and in ``alert``. If a + wakeup occurs due to that object being signaled, ``index`` is set to + the lowest index corresponding to that object. + + The function may fail with ``EINTR`` if a signal is received. + +.. c:macro:: WINESYNC_IOC_WAIT_ALL + + Poll on a list of objects, atomically acquiring all of them. Takes a + pointer to struct :c:type:`winesync_wait_args`, which is used + identically to ``WINESYNC_IOC_WAIT_ANY``, except that ``index`` is + always filled with zero on success if not woken via alert. + + This function attempts to simultaneously acquire all of the given + objects. If unable to do so, it sleeps until all objects become + simultaneously signaled, subsequently acquiring them, or the timeout + expires. In the latter case the ioctl fails with ``ETIMEDOUT`` and + no objects are modified. + + Objects may become signaled and subsequently designaled (through + acquisition by other threads) while this thread is sleeping. 
Only + once all objects are simultaneously signaled does the ioctl acquire + them and return. The entire acquisition is atomic and totally + ordered with respect to other operations on any of the given + objects. + + If an inconsistent mutex is acquired, the ioctl fails with + ``EOWNERDEAD``. Similarly to ``WINESYNC_IOC_WAIT_ANY``, all objects + are nevertheless marked as acquired. Note that if multiple mutex + objects are specified, there is no way to know which were marked as + inconsistent. + + As with "any" waits, the ``alert`` argument is an "extra" event + which can terminate the wait. Critically, however, an "all" wait + will succeed if all members in ``objs`` are signaled, *or* if + ``alert`` is signaled. In the latter case ``index`` will be set to + ``count``. As with "any" waits, if both conditions are filled, the + former takes priority, and objects in ``objs`` will be acquired. + + Unlike ``WINESYNC_IOC_WAIT_ANY``, it is not valid to pass the same + object more than once, nor is it valid to pass the same object in + ``objs`` and in ``alert`` If this is attempted, the function fails + with ``EINVAL``. diff --git a/MAINTAINERS b/MAINTAINERS index f09415b2b3c5cf..418ef9f6563b0e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23240,6 +23240,15 @@ M: David Härdeman S: Maintained F: drivers/media/rc/winbond-cir.c +WINESYNC SYNCHRONIZATION PRIMITIVE DRIVER +M: Zebediah Figura +L: wine-devel@winehq.org +S: Supported +F: Documentation/userspace-api/winesync.rst +F: drivers/misc/winesync.c +F: include/uapi/linux/winesync.h +F: tools/testing/selftests/drivers/winesync/ + WINSYSTEMS EBC-C384 WATCHDOG DRIVER L: linux-watchdog@vger.kernel.org S: Orphan diff --git a/block/elevator.c b/block/elevator.c index 5ff093cb3cf8f5..200eb60c8e8b52 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -574,9 +574,19 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) if (q->nr_hw_queues != 1 && !blk_mq_is_shared_tags(q->tag_set->flags)) +#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_MQ_IOSCHED_KYBER) + return elevator_find_get(q, "kyber"); +#elif defined(CONFIG_ZEN_INTERACTIVE) + return elevator_find_get(q, "mq-deadline"); +#else return NULL; +#endif +#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ) + return elevator_find_get(q, "bfq"); +#else return elevator_find_get(q, "mq-deadline"); +#endif } /* diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index c52d19d67557f5..03c8da688a8cdb 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -18,10 +18,16 @@ #include "cpufreq_ondemand.h" /* On-demand governor macros */ +#if defined(CONFIG_ZEN_INTERACTIVE) +#define DEF_FREQUENCY_UP_THRESHOLD (55) +#define MICRO_FREQUENCY_UP_THRESHOLD (60) +#define DEF_SAMPLING_DOWN_FACTOR (5) +#else #define DEF_FREQUENCY_UP_THRESHOLD (80) +#define MICRO_FREQUENCY_UP_THRESHOLD (95) #define DEF_SAMPLING_DOWN_FACTOR (1) +#endif #define MAX_SAMPLING_DOWN_FACTOR (100000) -#define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) #define MIN_FREQUENCY_UP_THRESHOLD (1) #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 95f90699d2b17b..2b10fe29d2c8d9 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -46,6 +46,7 @@ struct evdev_client { struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; + struct rcu_head rcu; enum input_clock_type clk_type; bool revoked; unsigned long *evmasks[EV_CNT]; 
@@ -377,13 +378,22 @@ static void evdev_attach_client(struct evdev *evdev, spin_unlock(&evdev->client_lock); } +static void evdev_reclaim_client(struct rcu_head *rp) +{ + struct evdev_client *client = container_of(rp, struct evdev_client, rcu); + unsigned int i; + for (i = 0; i < EV_CNT; ++i) + bitmap_free(client->evmasks[i]); + kvfree(client); +} + static void evdev_detach_client(struct evdev *evdev, struct evdev_client *client) { spin_lock(&evdev->client_lock); list_del_rcu(&client->node); spin_unlock(&evdev->client_lock); - synchronize_rcu(); + call_rcu(&client->rcu, evdev_reclaim_client); } static int evdev_open_device(struct evdev *evdev) @@ -436,7 +446,6 @@ static int evdev_release(struct inode *inode, struct file *file) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; - unsigned int i; mutex_lock(&evdev->mutex); @@ -448,11 +457,6 @@ static int evdev_release(struct inode *inode, struct file *file) evdev_detach_client(evdev, client); - for (i = 0; i < EV_CNT; ++i) - bitmap_free(client->evmasks[i]); - - kvfree(client); - evdev_close_device(evdev); return 0; @@ -495,7 +499,6 @@ static int evdev_open(struct inode *inode, struct file *file) err_free_client: evdev_detach_client(evdev, client); - kvfree(client); return error; } diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index cadd4a820c0336..128d498750c355 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -562,6 +562,17 @@ config TPS6594_PFSM This driver can also be built as a module. If so, the module will be called tps6594-pfsm. +config WINESYNC + tristate "Synchronization primitives for Wine" + help + This module provides kernel support for synchronization primitives + used by Wine. It is not a hardware driver. + + To compile this driver as a module, choose M here: the + module will be called winesync. + + If unsure, say N. + source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" source "drivers/misc/cb710/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index f2a4d1ff65d46a..e7824ea71db493 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_PVPANIC) += pvpanic/ obj-$(CONFIG_UACCE) += uacce/ obj-$(CONFIG_XILINX_SDFEC) += xilinx_sdfec.o obj-$(CONFIG_HISI_HIKEY_USB) += hisi_hikey_usb.o +obj-$(CONFIG_WINESYNC) += winesync.o obj-$(CONFIG_HI6421V600_IRQ) += hi6421v600-irq.o obj-$(CONFIG_OPEN_DICE) += open-dice.o obj-$(CONFIG_GP_PCI1XXXX) += mchp_pci1xxxx/ diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c new file mode 100644 index 00000000000000..64b379d846dbee --- /dev/null +++ b/drivers/misc/winesync.c @@ -0,0 +1,1213 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * winesync.c - Kernel driver for Wine synchronization primitives + * + * Copyright (C) 2021 Zebediah Figura + */ + +#include +#include +#include +#include +#include +#include + +#define WINESYNC_NAME "winesync" + +enum winesync_type { + WINESYNC_TYPE_SEM, + WINESYNC_TYPE_MUTEX, + WINESYNC_TYPE_EVENT, +}; + +struct winesync_obj { + struct rcu_head rhead; + struct kref refcount; + spinlock_t lock; + + /* + * any_waiters is protected by the object lock, but all_waiters is + * protected by the device wait_all_lock. + */ + struct list_head any_waiters; + struct list_head all_waiters; + + /* + * Hint describing how many tasks are queued on this object in a + * wait-all operation. + * + * Any time we do a wake, we may need to wake "all" waiters as well as + * "any" waiters. 
In order to atomically wake "all" waiters, we must + * lock all of the objects, and that means grabbing the wait_all_lock + * below (and, due to lock ordering rules, before locking this object). + * However, wait-all is a rare operation, and grabbing the wait-all + * lock for every wake would create unnecessary contention. Therefore we + * first check whether all_hint is zero, and, if it is, we skip trying + * to wake "all" waiters. + * + * This hint isn't protected by any lock. It might change during the + * course of a wake, but there's no meaningful race there; it's only a + * hint. + * + * Since wait requests must originate from user-space threads, we're + * limited here by PID_MAX_LIMIT, so there's no risk of saturation. + */ + atomic_t all_hint; + + enum winesync_type type; + + /* The following fields are protected by the object lock. */ + union { + struct { + __u32 count; + __u32 max; + } sem; + struct { + __u32 count; + __u32 owner; + bool ownerdead; + } mutex; + struct { + bool manual; + bool signaled; + } event; + } u; +}; + +struct winesync_q_entry { + struct list_head node; + struct winesync_q *q; + struct winesync_obj *obj; + __u32 index; +}; + +struct winesync_q { + struct task_struct *task; + __u32 owner; + + /* + * Protected via atomic_cmpxchg(). Only the thread that wins the + * compare-and-swap may actually change object states and wake this + * task. + */ + atomic_t signaled; + + bool all; + bool ownerdead; + __u32 count; + struct winesync_q_entry entries[]; +}; + +struct winesync_device { + /* + * Wait-all operations must atomically grab all objects, and be totally + * ordered with respect to each other and wait-any operations. If one + * thread is trying to acquire several objects, another thread cannot + * touch the object at the same time. + * + * We achieve this by grabbing multiple object locks at the same time. + * However, this creates a lock ordering problem. To solve that problem, + * wait_all_lock is taken first whenever multiple objects must be locked + * at the same time. 
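+	 *
+	 * For example, a waker that may need to wake "all" waiters (such as
+	 * winesync_put_sem() when all_hint is nonzero) takes wait_all_lock
+	 * before the per-object lock, and try_wake_all() only acquires the
+	 * remaining objects' locks while wait_all_lock is already held.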
+ */ + spinlock_t wait_all_lock; + + struct xarray objects; +}; + +static struct winesync_obj *get_obj(struct winesync_device *dev, __u32 id) +{ + struct winesync_obj *obj; + + rcu_read_lock(); + obj = xa_load(&dev->objects, id); + if (obj && !kref_get_unless_zero(&obj->refcount)) + obj = NULL; + rcu_read_unlock(); + + return obj; +} + +static void destroy_obj(struct kref *ref) +{ + struct winesync_obj *obj = container_of(ref, struct winesync_obj, refcount); + + kfree_rcu(obj, rhead); +} + +static void put_obj(struct winesync_obj *obj) +{ + kref_put(&obj->refcount, destroy_obj); +} + +static struct winesync_obj *get_obj_typed(struct winesync_device *dev, __u32 id, + enum winesync_type type) +{ + struct winesync_obj *obj = get_obj(dev, id); + + if (obj && obj->type != type) { + put_obj(obj); + return NULL; + } + return obj; +} + +static int winesync_char_open(struct inode *inode, struct file *file) +{ + struct winesync_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + spin_lock_init(&dev->wait_all_lock); + + xa_init_flags(&dev->objects, XA_FLAGS_ALLOC); + + file->private_data = dev; + return nonseekable_open(inode, file); +} + +static int winesync_char_release(struct inode *inode, struct file *file) +{ + struct winesync_device *dev = file->private_data; + struct winesync_obj *obj; + unsigned long id; + + xa_for_each(&dev->objects, id, obj) + put_obj(obj); + + xa_destroy(&dev->objects); + + kfree(dev); + + return 0; +} + +static void init_obj(struct winesync_obj *obj) +{ + kref_init(&obj->refcount); + atomic_set(&obj->all_hint, 0); + spin_lock_init(&obj->lock); + INIT_LIST_HEAD(&obj->any_waiters); + INIT_LIST_HEAD(&obj->all_waiters); +} + +static bool is_signaled(struct winesync_obj *obj, __u32 owner) +{ + lockdep_assert_held(&obj->lock); + + switch (obj->type) { + case WINESYNC_TYPE_SEM: + return !!obj->u.sem.count; + case WINESYNC_TYPE_MUTEX: + if (obj->u.mutex.owner && obj->u.mutex.owner != owner) + return false; + return obj->u.mutex.count < UINT_MAX; + case WINESYNC_TYPE_EVENT: + return obj->u.event.signaled; + } + + WARN(1, "bad object type %#x\n", obj->type); + return false; +} + +/* + * "locked_obj" is an optional pointer to an object which is already locked and + * should not be locked again. This is necessary so that changing an object's + * state and waking it can be a single atomic operation. 
+ */ +static void try_wake_all(struct winesync_device *dev, struct winesync_q *q, + struct winesync_obj *locked_obj) +{ + __u32 count = q->count; + bool can_wake = true; + __u32 i; + + lockdep_assert_held(&dev->wait_all_lock); + if (locked_obj) + lockdep_assert_held(&locked_obj->lock); + + for (i = 0; i < count; i++) { + if (q->entries[i].obj != locked_obj) + spin_lock(&q->entries[i].obj->lock); + } + + for (i = 0; i < count; i++) { + if (!is_signaled(q->entries[i].obj, q->owner)) { + can_wake = false; + break; + } + } + + if (can_wake && atomic_cmpxchg(&q->signaled, -1, 0) == -1) { + for (i = 0; i < count; i++) { + struct winesync_obj *obj = q->entries[i].obj; + + switch (obj->type) { + case WINESYNC_TYPE_SEM: + obj->u.sem.count--; + break; + case WINESYNC_TYPE_MUTEX: + if (obj->u.mutex.ownerdead) + q->ownerdead = true; + obj->u.mutex.ownerdead = false; + obj->u.mutex.count++; + obj->u.mutex.owner = q->owner; + break; + case WINESYNC_TYPE_EVENT: + if (!obj->u.event.manual) + obj->u.event.signaled = false; + break; + } + } + wake_up_process(q->task); + } + + for (i = 0; i < count; i++) { + if (q->entries[i].obj != locked_obj) + spin_unlock(&q->entries[i].obj->lock); + } +} + +static void try_wake_all_obj(struct winesync_device *dev, + struct winesync_obj *obj) +{ + struct winesync_q_entry *entry; + + lockdep_assert_held(&dev->wait_all_lock); + lockdep_assert_held(&obj->lock); + + list_for_each_entry(entry, &obj->all_waiters, node) + try_wake_all(dev, entry->q, obj); +} + +static void try_wake_any_sem(struct winesync_obj *sem) +{ + struct winesync_q_entry *entry; + + lockdep_assert_held(&sem->lock); + + list_for_each_entry(entry, &sem->any_waiters, node) { + struct winesync_q *q = entry->q; + + if (!sem->u.sem.count) + break; + + if (atomic_cmpxchg(&q->signaled, -1, entry->index) == -1) { + sem->u.sem.count--; + wake_up_process(q->task); + } + } +} + +static void try_wake_any_mutex(struct winesync_obj *mutex) +{ + struct winesync_q_entry *entry; + + lockdep_assert_held(&mutex->lock); + + list_for_each_entry(entry, &mutex->any_waiters, node) { + struct winesync_q *q = entry->q; + + if (mutex->u.mutex.count == UINT_MAX) + break; + if (mutex->u.mutex.owner && mutex->u.mutex.owner != q->owner) + continue; + + if (atomic_cmpxchg(&q->signaled, -1, entry->index) == -1) { + if (mutex->u.mutex.ownerdead) + q->ownerdead = true; + mutex->u.mutex.ownerdead = false; + mutex->u.mutex.count++; + mutex->u.mutex.owner = q->owner; + wake_up_process(q->task); + } + } +} + +static void try_wake_any_event(struct winesync_obj *event) +{ + struct winesync_q_entry *entry; + + lockdep_assert_held(&event->lock); + + list_for_each_entry(entry, &event->any_waiters, node) { + struct winesync_q *q = entry->q; + + if (!event->u.event.signaled) + break; + + if (atomic_cmpxchg(&q->signaled, -1, entry->index) == -1) { + if (!event->u.event.manual) + event->u.event.signaled = false; + wake_up_process(q->task); + } + } +} + +static int winesync_create_sem(struct winesync_device *dev, void __user *argp) +{ + struct winesync_sem_args __user *user_args = argp; + struct winesync_sem_args args; + struct winesync_obj *sem; + __u32 id; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + if (args.count > args.max) + return -EINVAL; + + sem = kzalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return -ENOMEM; + + init_obj(sem); + sem->type = WINESYNC_TYPE_SEM; + sem->u.sem.count = args.count; + sem->u.sem.max = args.max; + + ret = xa_alloc(&dev->objects, &id, sem, xa_limit_32b, GFP_KERNEL); + if (ret < 0) 
{ + kfree(sem); + return ret; + } + + return put_user(id, &user_args->sem); +} + +static int winesync_create_mutex(struct winesync_device *dev, void __user *argp) +{ + struct winesync_mutex_args __user *user_args = argp; + struct winesync_mutex_args args; + struct winesync_obj *mutex; + __u32 id; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + if (!args.owner != !args.count) + return -EINVAL; + + mutex = kzalloc(sizeof(*mutex), GFP_KERNEL); + if (!mutex) + return -ENOMEM; + + init_obj(mutex); + mutex->type = WINESYNC_TYPE_MUTEX; + mutex->u.mutex.count = args.count; + mutex->u.mutex.owner = args.owner; + + ret = xa_alloc(&dev->objects, &id, mutex, xa_limit_32b, GFP_KERNEL); + if (ret < 0) { + kfree(mutex); + return ret; + } + + return put_user(id, &user_args->mutex); +} + +static int winesync_create_event(struct winesync_device *dev, void __user *argp) +{ + struct winesync_event_args __user *user_args = argp; + struct winesync_event_args args; + struct winesync_obj *event; + __u32 id; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + init_obj(event); + event->type = WINESYNC_TYPE_EVENT; + event->u.event.manual = args.manual; + event->u.event.signaled = args.signaled; + + ret = xa_alloc(&dev->objects, &id, event, xa_limit_32b, GFP_KERNEL); + if (ret < 0) { + kfree(event); + return ret; + } + + return put_user(id, &user_args->event); +} + +static int winesync_delete(struct winesync_device *dev, void __user *argp) +{ + struct winesync_obj *obj; + __u32 id; + + if (get_user(id, (__u32 __user *)argp)) + return -EFAULT; + + obj = xa_erase(&dev->objects, id); + if (!obj) + return -EINVAL; + + put_obj(obj); + return 0; +} + +/* + * Actually change the semaphore state, returning -EOVERFLOW if it is made + * invalid. + */ +static int put_sem_state(struct winesync_obj *sem, __u32 count) +{ + lockdep_assert_held(&sem->lock); + + if (sem->u.sem.count + count < sem->u.sem.count || + sem->u.sem.count + count > sem->u.sem.max) + return -EOVERFLOW; + + sem->u.sem.count += count; + return 0; +} + +static int winesync_put_sem(struct winesync_device *dev, void __user *argp) +{ + struct winesync_sem_args __user *user_args = argp; + struct winesync_sem_args args; + struct winesync_obj *sem; + __u32 prev_count; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + sem = get_obj_typed(dev, args.sem, WINESYNC_TYPE_SEM); + if (!sem) + return -EINVAL; + + if (atomic_read(&sem->all_hint) > 0) { + spin_lock(&dev->wait_all_lock); + spin_lock(&sem->lock); + + prev_count = sem->u.sem.count; + ret = put_sem_state(sem, args.count); + if (!ret) { + try_wake_all_obj(dev, sem); + try_wake_any_sem(sem); + } + + spin_unlock(&sem->lock); + spin_unlock(&dev->wait_all_lock); + } else { + spin_lock(&sem->lock); + + prev_count = sem->u.sem.count; + ret = put_sem_state(sem, args.count); + if (!ret) + try_wake_any_sem(sem); + + spin_unlock(&sem->lock); + } + + put_obj(sem); + + if (!ret && put_user(prev_count, &user_args->count)) + ret = -EFAULT; + + return ret; +} + +/* + * Actually change the mutex state, returning -EPERM if not the owner. 
+ */ +static int put_mutex_state(struct winesync_obj *mutex, + const struct winesync_mutex_args *args) +{ + lockdep_assert_held(&mutex->lock); + + if (mutex->u.mutex.owner != args->owner) + return -EPERM; + + if (!--mutex->u.mutex.count) + mutex->u.mutex.owner = 0; + return 0; +} + +static int winesync_put_mutex(struct winesync_device *dev, void __user *argp) +{ + struct winesync_mutex_args __user *user_args = argp; + struct winesync_mutex_args args; + struct winesync_obj *mutex; + __u32 prev_count; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + if (!args.owner) + return -EINVAL; + + mutex = get_obj_typed(dev, args.mutex, WINESYNC_TYPE_MUTEX); + if (!mutex) + return -EINVAL; + + if (atomic_read(&mutex->all_hint) > 0) { + spin_lock(&dev->wait_all_lock); + spin_lock(&mutex->lock); + + prev_count = mutex->u.mutex.count; + ret = put_mutex_state(mutex, &args); + if (!ret) { + try_wake_all_obj(dev, mutex); + try_wake_any_mutex(mutex); + } + + spin_unlock(&mutex->lock); + spin_unlock(&dev->wait_all_lock); + } else { + spin_lock(&mutex->lock); + + prev_count = mutex->u.mutex.count; + ret = put_mutex_state(mutex, &args); + if (!ret) + try_wake_any_mutex(mutex); + + spin_unlock(&mutex->lock); + } + + put_obj(mutex); + + if (!ret && put_user(prev_count, &user_args->count)) + ret = -EFAULT; + + return ret; +} + +static int winesync_read_sem(struct winesync_device *dev, void __user *argp) +{ + struct winesync_sem_args __user *user_args = argp; + struct winesync_sem_args args; + struct winesync_obj *sem; + __u32 id; + + if (get_user(id, &user_args->sem)) + return -EFAULT; + + sem = get_obj_typed(dev, id, WINESYNC_TYPE_SEM); + if (!sem) + return -EINVAL; + + args.sem = id; + spin_lock(&sem->lock); + args.count = sem->u.sem.count; + args.max = sem->u.sem.max; + spin_unlock(&sem->lock); + + put_obj(sem); + + if (copy_to_user(user_args, &args, sizeof(args))) + return -EFAULT; + return 0; +} + +static int winesync_read_mutex(struct winesync_device *dev, void __user *argp) +{ + struct winesync_mutex_args __user *user_args = argp; + struct winesync_mutex_args args; + struct winesync_obj *mutex; + __u32 id; + int ret; + + if (get_user(id, &user_args->mutex)) + return -EFAULT; + + mutex = get_obj_typed(dev, id, WINESYNC_TYPE_MUTEX); + if (!mutex) + return -EINVAL; + + args.mutex = id; + spin_lock(&mutex->lock); + args.count = mutex->u.mutex.count; + args.owner = mutex->u.mutex.owner; + ret = mutex->u.mutex.ownerdead ? -EOWNERDEAD : 0; + spin_unlock(&mutex->lock); + + put_obj(mutex); + + if (copy_to_user(user_args, &args, sizeof(args))) + return -EFAULT; + return ret; +} + +static int winesync_read_event(struct winesync_device *dev, void __user *argp) +{ + struct winesync_event_args __user *user_args = argp; + struct winesync_event_args args; + struct winesync_obj *event; + __u32 id; + int ret; + + if (get_user(id, &user_args->event)) + return -EFAULT; + + event = get_obj_typed(dev, id, WINESYNC_TYPE_EVENT); + if (!event) + return -EINVAL; + + args.event = id; + spin_lock(&event->lock); + args.manual = event->u.event.manual; + args.signaled = event->u.event.signaled; + spin_unlock(&event->lock); + + put_obj(event); + + if (copy_to_user(user_args, &args, sizeof(args))) + return -EFAULT; + return ret; +} + +/* + * Actually change the mutex state to mark its owner as dead. 
+ */ +static void put_mutex_ownerdead_state(struct winesync_obj *mutex) +{ + lockdep_assert_held(&mutex->lock); + + mutex->u.mutex.ownerdead = true; + mutex->u.mutex.owner = 0; + mutex->u.mutex.count = 0; +} + +static int winesync_kill_owner(struct winesync_device *dev, void __user *argp) +{ + struct winesync_obj *obj; + unsigned long id; + __u32 owner; + + if (get_user(owner, (__u32 __user *)argp)) + return -EFAULT; + if (!owner) + return -EINVAL; + + rcu_read_lock(); + + xa_for_each(&dev->objects, id, obj) { + if (!kref_get_unless_zero(&obj->refcount)) + continue; + + if (obj->type != WINESYNC_TYPE_MUTEX) { + put_obj(obj); + continue; + } + + if (atomic_read(&obj->all_hint) > 0) { + spin_lock(&dev->wait_all_lock); + spin_lock(&obj->lock); + + if (obj->u.mutex.owner == owner) { + put_mutex_ownerdead_state(obj); + try_wake_all_obj(dev, obj); + try_wake_any_mutex(obj); + } + + spin_unlock(&obj->lock); + spin_unlock(&dev->wait_all_lock); + } else { + spin_lock(&obj->lock); + + if (obj->u.mutex.owner == owner) { + put_mutex_ownerdead_state(obj); + try_wake_any_mutex(obj); + } + + spin_unlock(&obj->lock); + } + + put_obj(obj); + } + + rcu_read_unlock(); + + return 0; +} + +static int winesync_set_event(struct winesync_device *dev, void __user *argp, + bool pulse) +{ + struct winesync_event_args __user *user_args = argp; + struct winesync_event_args args; + struct winesync_obj *event; + bool prev_state; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + event = get_obj_typed(dev, args.event, WINESYNC_TYPE_EVENT); + if (!event) + return -EINVAL; + + if (atomic_read(&event->all_hint) > 0) { + spin_lock(&dev->wait_all_lock); + spin_lock(&event->lock); + + prev_state = event->u.event.signaled; + event->u.event.signaled = true; + try_wake_all_obj(dev, event); + try_wake_any_event(event); + if (pulse) + event->u.event.signaled = false; + + spin_unlock(&event->lock); + spin_unlock(&dev->wait_all_lock); + } else { + spin_lock(&event->lock); + + prev_state = event->u.event.signaled; + event->u.event.signaled = true; + try_wake_any_event(event); + if (pulse) + event->u.event.signaled = false; + + spin_unlock(&event->lock); + } + + put_obj(event); + + if (put_user(prev_state, &user_args->signaled)) + return -EFAULT; + + return 0; +} + +static int winesync_reset_event(struct winesync_device *dev, void __user *argp) +{ + struct winesync_event_args __user *user_args = argp; + struct winesync_event_args args; + struct winesync_obj *event; + bool prev_state; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + event = get_obj_typed(dev, args.event, WINESYNC_TYPE_EVENT); + if (!event) + return -EINVAL; + + spin_lock(&event->lock); + + prev_state = event->u.event.signaled; + event->u.event.signaled = false; + + spin_unlock(&event->lock); + + put_obj(event); + + if (put_user(prev_state, &user_args->signaled)) + return -EFAULT; + + return 0; +} + +static int winesync_schedule(const struct winesync_q *q, ktime_t *timeout) +{ + int ret = 0; + + do { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + set_current_state(TASK_INTERRUPTIBLE); + if (atomic_read(&q->signaled) != -1) { + ret = 0; + break; + } + ret = schedule_hrtimeout(timeout, HRTIMER_MODE_ABS); + } while (ret < 0); + __set_current_state(TASK_RUNNING); + + return ret; +} + +/* + * Allocate and initialize the winesync_q structure, but do not queue us yet. + * Also, calculate the relative timeout. 
+ */ +static int setup_wait(struct winesync_device *dev, + const struct winesync_wait_args *args, bool all, + ktime_t *ret_timeout, struct winesync_q **ret_q) +{ + const __u32 count = args->count; + struct winesync_q *q; + ktime_t timeout = 0; + __u32 total_count; + __u32 *ids; + __u32 i, j; + + if (!args->owner) + return -EINVAL; + + if (args->timeout) { + struct timespec64 to; + + if (get_timespec64(&to, u64_to_user_ptr(args->timeout))) + return -EFAULT; + if (!timespec64_valid(&to)) + return -EINVAL; + + timeout = timespec64_to_ns(&to); + } + + total_count = count; + if (args->alert) + total_count++; + + ids = kmalloc_array(total_count, sizeof(*ids), GFP_KERNEL); + if (!ids) + return -ENOMEM; + if (copy_from_user(ids, u64_to_user_ptr(args->objs), + array_size(count, sizeof(*ids)))) { + kfree(ids); + return -EFAULT; + } + if (args->alert) + ids[count] = args->alert; + + q = kmalloc(struct_size(q, entries, total_count), GFP_KERNEL); + if (!q) { + kfree(ids); + return -ENOMEM; + } + q->task = current; + q->owner = args->owner; + atomic_set(&q->signaled, -1); + q->all = all; + q->ownerdead = false; + q->count = count; + + for (i = 0; i < total_count; i++) { + struct winesync_q_entry *entry = &q->entries[i]; + struct winesync_obj *obj = get_obj(dev, ids[i]); + + if (!obj) + goto err; + + if (all) { + /* Check that the objects are all distinct. */ + for (j = 0; j < i; j++) { + if (obj == q->entries[j].obj) { + put_obj(obj); + goto err; + } + } + } + + entry->obj = obj; + entry->q = q; + entry->index = i; + } + + kfree(ids); + + *ret_q = q; + *ret_timeout = timeout; + return 0; + +err: + for (j = 0; j < i; j++) + put_obj(q->entries[j].obj); + kfree(ids); + kfree(q); + return -EINVAL; +} + +static void try_wake_any_obj(struct winesync_obj *obj) +{ + switch (obj->type) { + case WINESYNC_TYPE_SEM: + try_wake_any_sem(obj); + break; + case WINESYNC_TYPE_MUTEX: + try_wake_any_mutex(obj); + break; + case WINESYNC_TYPE_EVENT: + try_wake_any_event(obj); + break; + } +} + +static int winesync_wait_any(struct winesync_device *dev, void __user *argp) +{ + struct winesync_wait_args args; + struct winesync_q *q; + __u32 i, total_count; + ktime_t timeout; + int signaled; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + ret = setup_wait(dev, &args, false, &timeout, &q); + if (ret < 0) + return ret; + + total_count = args.count; + if (args.alert) + total_count++; + + /* queue ourselves */ + + for (i = 0; i < total_count; i++) { + struct winesync_q_entry *entry = &q->entries[i]; + struct winesync_obj *obj = entry->obj; + + spin_lock(&obj->lock); + list_add_tail(&entry->node, &obj->any_waiters); + spin_unlock(&obj->lock); + } + + /* + * Check if we are already signaled. + * + * Note that the API requires that normal objects are checked before + * the alert event. Hence we queue the alert event last, and check + * objects in order. + */ + + for (i = 0; i < total_count; i++) { + struct winesync_obj *obj = q->entries[i].obj; + + if (atomic_read(&q->signaled) != -1) + break; + + spin_lock(&obj->lock); + try_wake_any_obj(obj); + spin_unlock(&obj->lock); + } + + /* sleep */ + + ret = winesync_schedule(q, args.timeout ? 
&timeout : NULL); + + /* and finally, unqueue */ + + for (i = 0; i < total_count; i++) { + struct winesync_q_entry *entry = &q->entries[i]; + struct winesync_obj *obj = entry->obj; + + spin_lock(&obj->lock); + list_del(&entry->node); + spin_unlock(&obj->lock); + + put_obj(obj); + } + + signaled = atomic_read(&q->signaled); + if (signaled != -1) { + struct winesync_wait_args __user *user_args = argp; + + /* even if we caught a signal, we need to communicate success */ + ret = q->ownerdead ? -EOWNERDEAD : 0; + + if (put_user(signaled, &user_args->index)) + ret = -EFAULT; + } else if (!ret) { + ret = -ETIMEDOUT; + } + + kfree(q); + return ret; +} + +static int winesync_wait_all(struct winesync_device *dev, void __user *argp) +{ + struct winesync_wait_args args; + struct winesync_q *q; + ktime_t timeout; + int signaled; + __u32 i; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + ret = setup_wait(dev, &args, true, &timeout, &q); + if (ret < 0) + return ret; + + /* queue ourselves */ + + spin_lock(&dev->wait_all_lock); + + for (i = 0; i < args.count; i++) { + struct winesync_q_entry *entry = &q->entries[i]; + struct winesync_obj *obj = entry->obj; + + atomic_inc(&obj->all_hint); + + /* + * obj->all_waiters is protected by dev->wait_all_lock rather + * than obj->lock, so there is no need to acquire it here. + */ + list_add_tail(&entry->node, &obj->all_waiters); + } + if (args.alert) { + struct winesync_q_entry *entry = &q->entries[args.count]; + struct winesync_obj *obj = entry->obj; + + spin_lock(&obj->lock); + list_add_tail(&entry->node, &obj->any_waiters); + spin_unlock(&obj->lock); + } + + /* check if we are already signaled */ + + try_wake_all(dev, q, NULL); + + spin_unlock(&dev->wait_all_lock); + + /* + * Check if the alert event is signaled, making sure to do so only + * after checking if the other objects are signaled. + */ + + if (args.alert) { + struct winesync_obj *obj = q->entries[args.count].obj; + + if (atomic_read(&q->signaled) == -1) { + spin_lock(&obj->lock); + try_wake_any_obj(obj); + spin_unlock(&obj->lock); + } + } + + /* sleep */ + + ret = winesync_schedule(q, args.timeout ? &timeout : NULL); + + /* and finally, unqueue */ + + spin_lock(&dev->wait_all_lock); + + for (i = 0; i < args.count; i++) { + struct winesync_q_entry *entry = &q->entries[i]; + struct winesync_obj *obj = entry->obj; + + /* + * obj->all_waiters is protected by dev->wait_all_lock rather + * than obj->lock, so there is no need to acquire it here. + */ + list_del(&entry->node); + + atomic_dec(&obj->all_hint); + + put_obj(obj); + } + if (args.alert) { + struct winesync_q_entry *entry = &q->entries[args.count]; + struct winesync_obj *obj = entry->obj; + + spin_lock(&obj->lock); + list_del(&entry->node); + spin_unlock(&obj->lock); + + put_obj(obj); + } + + spin_unlock(&dev->wait_all_lock); + + signaled = atomic_read(&q->signaled); + if (signaled != -1) { + struct winesync_wait_args __user *user_args = argp; + + /* even if we caught a signal, we need to communicate success */ + ret = q->ownerdead ? 
-EOWNERDEAD : 0; + + if (put_user(signaled, &user_args->index)) + ret = -EFAULT; + } else if (!ret) { + ret = -ETIMEDOUT; + } + + kfree(q); + return ret; +} + +static long winesync_char_ioctl(struct file *file, unsigned int cmd, + unsigned long parm) +{ + struct winesync_device *dev = file->private_data; + void __user *argp = (void __user *)parm; + + switch (cmd) { + case WINESYNC_IOC_CREATE_EVENT: + return winesync_create_event(dev, argp); + case WINESYNC_IOC_CREATE_MUTEX: + return winesync_create_mutex(dev, argp); + case WINESYNC_IOC_CREATE_SEM: + return winesync_create_sem(dev, argp); + case WINESYNC_IOC_DELETE: + return winesync_delete(dev, argp); + case WINESYNC_IOC_KILL_OWNER: + return winesync_kill_owner(dev, argp); + case WINESYNC_IOC_PULSE_EVENT: + return winesync_set_event(dev, argp, true); + case WINESYNC_IOC_PUT_MUTEX: + return winesync_put_mutex(dev, argp); + case WINESYNC_IOC_PUT_SEM: + return winesync_put_sem(dev, argp); + case WINESYNC_IOC_READ_EVENT: + return winesync_read_event(dev, argp); + case WINESYNC_IOC_READ_MUTEX: + return winesync_read_mutex(dev, argp); + case WINESYNC_IOC_READ_SEM: + return winesync_read_sem(dev, argp); + case WINESYNC_IOC_RESET_EVENT: + return winesync_reset_event(dev, argp); + case WINESYNC_IOC_SET_EVENT: + return winesync_set_event(dev, argp, false); + case WINESYNC_IOC_WAIT_ALL: + return winesync_wait_all(dev, argp); + case WINESYNC_IOC_WAIT_ANY: + return winesync_wait_any(dev, argp); + default: + return -ENOSYS; + } +} + +static const struct file_operations winesync_fops = { + .owner = THIS_MODULE, + .open = winesync_char_open, + .release = winesync_char_release, + .unlocked_ioctl = winesync_char_ioctl, + .compat_ioctl = winesync_char_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice winesync_misc = { + .minor = WINESYNC_MINOR, + .name = WINESYNC_NAME, + .fops = &winesync_fops, +}; + +static int __init winesync_init(void) +{ + return misc_register(&winesync_misc); +} + +static void __exit winesync_exit(void) +{ + misc_deregister(&winesync_misc); +} + +module_init(winesync_init); +module_exit(winesync_exit); + +MODULE_AUTHOR("Zebediah Figura"); +MODULE_DESCRIPTION("Kernel driver for Wine synchronization primitives"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("devname:" WINESYNC_NAME); +MODULE_ALIAS_MISCDEV(WINESYNC_MINOR); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index ec4277d7835b23..6b84a6f7ee6e45 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3488,6 +3488,107 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65f8, quirk_intel_mc_errata); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65f9, quirk_intel_mc_errata); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65fa, quirk_intel_mc_errata); +static bool acs_on_downstream; +static bool acs_on_multifunction; + +#define NUM_ACS_IDS 16 +struct acs_on_id { + unsigned short vendor; + unsigned short device; +}; +static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; +static u8 max_acs_id; + +static __init int pcie_acs_override_setup(char *p) +{ + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p, "downstream", 10)) + acs_on_downstream = true; + if (!strncmp(p, "multifunction", 13)) + acs_on_multifunction = true; + if (!strncmp(p, "id:", 3)) { + char opt[5]; + int ret; + long val; + + if (max_acs_id >= NUM_ACS_IDS - 1) { + pr_warn("Out of PCIe ACS override slots (%d)\n", + NUM_ACS_IDS); + goto next; + } + + p += 3; + snprintf(opt, 5, "%s", p); + ret = kstrtol(opt, 16, &val); + if (ret) { + pr_warn("PCIe ACS ID parse error %d\n", ret); + goto next; + } 
+ acs_on_ids[max_acs_id].vendor = val; + + p += strcspn(p, ":"); + if (*p != ':') { + pr_warn("PCIe ACS invalid ID\n"); + goto next; + } + + p++; + snprintf(opt, 5, "%s", p); + ret = kstrtol(opt, 16, &val); + if (ret) { + pr_warn("PCIe ACS ID parse error %d\n", ret); + goto next; + } + acs_on_ids[max_acs_id].device = val; + max_acs_id++; + } +next: + p += strcspn(p, ","); + if (*p == ',') + p++; + } + + if (acs_on_downstream || acs_on_multifunction || max_acs_id) + pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); + + return 0; +} +early_param("pcie_acs_override", pcie_acs_override_setup); + +static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) +{ + int i; + + /* Never override ACS for legacy devices or devices with ACS caps */ + if (!pci_is_pcie(dev) || + pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) + return -ENOTTY; + + for (i = 0; i < max_acs_id; i++) + if (acs_on_ids[i].vendor == dev->vendor && + acs_on_ids[i].device == dev->device) + return 1; + + switch (pci_pcie_type(dev)) { + case PCI_EXP_TYPE_DOWNSTREAM: + case PCI_EXP_TYPE_ROOT_PORT: + if (acs_on_downstream) + return 1; + break; + case PCI_EXP_TYPE_ENDPOINT: + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_LEG_END: + case PCI_EXP_TYPE_RC_END: + if (acs_on_multifunction && dev->multifunction) + return 1; + } + + return -ENOTTY; +} + /* * Ivytown NTB BAR sizes are misreported by the hardware due to an erratum. * To work around this, query the size it should be configured to by the @@ -5136,6 +5237,8 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, /* Wangxun nics */ { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, + /* custom ACS overrides for any PCIe device */ + { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, { 0 } }; diff --git a/fs/proc/base.c b/fs/proc/base.c index 6e61d93ffa5523..417c0d720f05c8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3284,6 +3284,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), +#ifdef CONFIG_MEM_SOFT_DIRTY + REG("pagemap_reset", S_IRUSR, proc_pagemap_reset_operations), +#endif #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9a8f32f21ff569..f3a16b26dd6e43 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -303,6 +303,7 @@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_pagemap_reset_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 59571737e16771..bcabc55ed83859 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1077,6 +1077,8 @@ enum clear_refs_types { struct clear_refs_private { enum clear_refs_types type; + unsigned long start, end; + bool clear_range; }; #ifdef CONFIG_MEM_SOFT_DIRTY @@ -1097,7 +1099,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, return page_maybe_dma_pinned(page); } -static inline void 
clear_soft_dirty(struct vm_area_struct *vma, +static inline bool clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { /* @@ -1107,37 +1109,46 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * of how soft-dirty works. */ pte_t ptent = ptep_get(pte); + bool ret = false; if (pte_present(ptent)) { pte_t old_pte; if (pte_is_pinned(vma, addr, ptent)) - return; + return ret; old_pte = ptep_modify_prot_start(vma, addr, pte); + ret = pte_soft_dirty(old_pte); ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { + ret = pte_swp_soft_dirty(ptent); ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } + return ret; } #else -static inline void clear_soft_dirty(struct vm_area_struct *vma, +static inline bool clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { + return false; } #endif #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, +static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; + bool ret = false; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); + + ret = pmd_soft_dirty(old); + if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); if (pmd_young(old)) @@ -1148,14 +1159,17 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + ret = pmd_swp_soft_dirty(pmd); pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } + return ret; } #else -static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, +static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { + return false; } #endif @@ -1168,6 +1182,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; + BUG_ON(addr < cp->start || end > cp->end); + ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { @@ -1225,9 +1241,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; - if (vma->vm_flags & VM_PFNMAP) + if (!cp->clear_range && (vma->vm_flags & VM_PFNMAP)) return 1; + BUG_ON(start < cp->start || end > cp->end); + /* * Writing 1 to /proc/pid/clear_refs affects all pages. * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. 
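
The clear_refs_write() hunk below extends the /proc/pid/clear_refs
write protocol: in addition to the existing single-digit commands, a
write starting with the literal character '6' followed by two native
unsigned longs (17 bytes total on 64-bit) requests a soft-dirty reset
restricted to the [start, end) range. A hypothetical user-space sketch
of that protocol (the helper name and error handling are illustrative
only):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/types.h>
	#include <unistd.h>

	/* Reset soft-dirty bits for [start, end) in the target process. */
	static int clear_soft_dirty_range(pid_t pid, unsigned long start,
					  unsigned long end)
	{
		char path[64], buf[1 + 2 * sizeof(unsigned long)];
		ssize_t written;
		int fd;

		snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;

		buf[0] = '6';			/* range-based soft-dirty reset */
		memcpy(buf + 1, &start, sizeof(start));
		memcpy(buf + 1 + sizeof(start), &end, sizeof(end));

		written = write(fd, buf, sizeof(buf));	/* must be exactly 17 bytes */
		close(fd);
		return written == (ssize_t)sizeof(buf) ? 0 : -1;
	}
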
@@ -1251,10 +1269,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF]; + char buffer[18]; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; + unsigned long start, end; + bool clear_range; int itype; int rv; @@ -1263,12 +1283,34 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &itype); - if (rv < 0) - return rv; - type = (enum clear_refs_types)itype; - if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) - return -EINVAL; + + if (buffer[0] == '6') + { + static int once; + + if (!once++) + printk(KERN_DEBUG "task_mmu: Using POC clear refs range implementation.\n"); + + if (count != 17) + return -EINVAL; + + type = CLEAR_REFS_SOFT_DIRTY; + start = *(unsigned long *)(buffer + 1); + end = *(unsigned long *)(buffer + 1 + 8); + } + else + { + rv = kstrtoint(strstrip(buffer), 10, &itype); + if (rv < 0) + return rv; + type = (enum clear_refs_types)itype; + + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) + return -EINVAL; + + start = 0; + end = -1UL; + } task = get_proc_task(file_inode(file)); if (!task) @@ -1281,40 +1323,86 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .type = type, }; - if (mmap_write_lock_killable(mm)) { - count = -EINTR; - goto out_mm; + if (start || end != -1UL) + { + start = min(start, -1UL) & PAGE_MASK; + end = min(end, -1UL) & PAGE_MASK; + + if (start >= end) + { + count = -EINVAL; + goto out_mm; + } + clear_range = true; } + else + { + clear_range = false; + } + + cp.start = start; + cp.end = end; + cp.clear_range = clear_range; + if (type == CLEAR_REFS_MM_HIWATER_RSS) { + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. */ reset_mm_hiwater_rss(mm); - goto out_unlock; + mmap_write_unlock(mm); + goto out_mm; } if (type == CLEAR_REFS_SOFT_DIRTY) { - for_each_vma(vmi, vma) { - if (!(vma->vm_flags & VM_SOFTDIRTY)) - continue; - vm_flags_clear(vma, VM_SOFTDIRTY); - vma_set_page_prot(vma); + if (mmap_read_lock_killable(mm)) { + count = -EINTR; + goto out_mm; } - + if (!clear_range) + for_each_vma(vmi, vma) { + if (!(vma->vm_flags & VM_SOFTDIRTY)) + continue; + mmap_read_unlock(mm); + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + for_each_vma(vmi, vma) { + vm_flags_clear(vma, VM_SOFTDIRTY); + vma_set_page_prot(vma); + } + mmap_write_downgrade(mm); + break; + } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, - 0, mm, 0, -1UL); + 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); + else + { + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + } + walk_page_range(mm, start, end == -1UL ? 
-1 : end, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); dec_tlb_flush_pending(mm); + mmap_read_unlock(mm); + } + else + { + mmap_write_unlock(mm); } -out_unlock: - mmap_write_unlock(mm); out_mm: mmput(mm); } @@ -1336,6 +1424,7 @@ struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool show_pfn; + bool reset; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -1346,6 +1435,7 @@ struct pagemapread { #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_SOFT_DIRTY_PAGE BIT_ULL(57) #define PM_UFFD_WP BIT_ULL(57) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) @@ -1366,6 +1456,14 @@ static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) return 0; } +static int add_addr_to_pagemap(unsigned long addr, struct pagemapread *pm) +{ + ((unsigned long *)pm->buffer)[pm->pos++] = addr; + if (pm->pos >= pm->len) + return PM_END_OF_BUFFER; + return 0; +} + static int pagemap_pte_hole(unsigned long start, unsigned long end, __always_unused int depth, struct mm_walk *walk) { @@ -1373,6 +1471,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, unsigned long addr = start; int err = 0; + if (pm->reset) + goto out; + while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); pagemap_entry_t pme = make_pme(0, 0); @@ -1418,13 +1519,13 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); @@ -1475,6 +1576,20 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; struct page *page = NULL; + if (pm->reset) + { + if (clear_soft_dirty_pmd(vma, addr, pmdp)) + { + for (; addr != end; addr += PAGE_SIZE) + { + err = add_addr_to_pagemap(addr, pm); + if (err) + break; + } + } + goto trans_huge_done; + } + if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -1483,7 +1598,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, flags |= PM_PRESENT; if (pmd_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pmd_uffd_wp(pmd)) flags |= PM_UFFD_WP; if (pm->show_pfn) @@ -1504,7 +1619,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); @@ -1534,6 +1649,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, frame += (1 << MAX_SWAPFILES_SHIFT); } } +trans_huge_done: spin_unlock(ptl); return err; } @@ -1549,10 +1665,18 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, return err; } for (; addr < end; pte++, addr += PAGE_SIZE) { - pagemap_entry_t pme; + if (pm->reset) + { + if (clear_soft_dirty(vma, addr, pte)) + err = add_addr_to_pagemap(addr, pm); + } + else + { + pagemap_entry_t pme; - pme = 
pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); - err = add_to_pagemap(&pme, pm); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); + err = add_to_pagemap(&pme, pm); + } if (err) break; } @@ -1651,8 +1775,8 @@ static const struct mm_walk_ops pagemap_ops = { * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. */ -static ssize_t pagemap_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t do_pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, bool reset) { struct mm_struct *mm = file->private_data; struct pagemapread pm; @@ -1661,6 +1785,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long start_vaddr; unsigned long end_vaddr; int ret = 0, copied = 0; + struct mmu_notifier_range range; + size_t buffer_len; if (!mm || !mmget_not_zero(mm)) goto out; @@ -1676,19 +1802,38 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* do not disclose physical addresses: attack vector */ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); + pm.reset = reset; + + buffer_len = min(PAGEMAP_WALK_SIZE >> PAGE_SHIFT, count / PM_ENTRY_BYTES); - pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); + pm.buffer = kmalloc_array(buffer_len, PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; src = *ppos; svpfn = src / PM_ENTRY_BYTES; - end_vaddr = mm->task_size; + + start_vaddr = svpfn << PAGE_SHIFT; + + if (reset) + { + if (count < sizeof(end_vaddr)) + { + ret = -EINVAL; + goto out_mm; + } + if (copy_from_user(&end_vaddr, buf, sizeof(end_vaddr))) + return -EFAULT; + end_vaddr = min(end_vaddr, mm->task_size); + } + else + { + end_vaddr = mm->task_size; + start_vaddr = end_vaddr; + } /* watch out for wraparound */ - start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { unsigned long end; @@ -1713,18 +1858,35 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long end; pm.pos = 0; - end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + pm.len = min(buffer_len, count / PM_ENTRY_BYTES); + + end = reset ? end_vaddr : (start_vaddr + (pm.len << PAGE_SHIFT)); /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; + ret = mmap_read_lock_killable(mm); if (ret) goto out_free; + + if (reset) + { + inc_tlb_flush_pending(mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, + 0, mm, start_vaddr, end); + mmu_notifier_invalidate_range_start(&range); + } ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); + if (reset) + { + mmu_notifier_invalidate_range_end(&range); + flush_tlb_mm(mm); + dec_tlb_flush_pending(mm); + } mmap_read_unlock(mm); - start_vaddr = end; len = min(count, PM_ENTRY_BYTES * pm.pos); + BUG_ON(ret && ret != PM_END_OF_BUFFER); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; goto out_free; @@ -1732,6 +1894,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, copied += len; buf += len; count -= len; + + start_vaddr = reset && pm.pos == pm.len ? 
((unsigned long *)pm.buffer)[pm.pos - 1] + PAGE_SIZE : end; } *ppos += copied; if (!ret || ret == PM_END_OF_BUFFER) @@ -1745,6 +1909,18 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, return ret; } +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, false); +} + +static ssize_t pagemap_reset_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, true); +} + static int pagemap_open(struct inode *inode, struct file *file) { struct mm_struct *mm; @@ -1771,6 +1947,14 @@ const struct file_operations proc_pagemap_operations = { .open = pagemap_open, .release = pagemap_release, }; + +const struct file_operations proc_pagemap_reset_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_reset_read, + .open = pagemap_open, + .release = pagemap_release, +}; + #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 4a1dc88ddbff9a..74d33218cada80 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -509,6 +509,9 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads() (true) +# elif defined(CONFIG_FORCE_IRQ_THREADING) +DECLARE_STATIC_KEY_TRUE(force_irqthreads_key); +# define force_irqthreads() (static_branch_likely(&force_irqthreads_key)) # else DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); # define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key)) diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index c0fea6ca507681..36fc5d5315a414 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -71,6 +71,7 @@ #define USERIO_MINOR 240 #define VHOST_VSOCK_MINOR 241 #define RFKILL_MINOR 242 +#define WINESYNC_MINOR 243 #define MISC_DYNAMIC_MINOR 255 struct device; diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d617d0d696751..ddc17bbd3ef6e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -191,7 +191,7 @@ static inline void __mm_zero_struct_page(struct page *page) * that. 
*/ #define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) +#define DEFAULT_MAX_MAP_COUNT (INT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e83c4c09504173..a029ac2e355405 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -48,7 +48,11 @@ extern unsigned int pageblock_order; #else /* CONFIG_HUGETLB_PAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ +#ifdef CONFIG_ZEN_INTERACTIVE +#define pageblock_order PAGE_ALLOC_COSTLY_ORDER +#else #define pageblock_order MAX_ORDER +#endif #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 71a5df8d26898b..d375ab21cbf83f 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -22,6 +22,7 @@ #define FUTEX_WAIT_REQUEUE_PI 11 #define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_LOCK_PI2 13 +#define FUTEX_WAIT_MULTIPLE 31 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -68,6 +69,18 @@ struct futex_waitv { __u32 __reserved; }; +/** + * struct futex_wait_block - Block of futexes to be waited for + * @uaddr: User address of the futex + * @val: Futex value expected by userspace + * @bitset: Bitset for the optional bitmasked wakeup + */ +struct futex_wait_block { + __u32 __user *uaddr; + __u32 val; + __u32 bitset; +}; + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h new file mode 100644 index 00000000000000..5b4e369f74693a --- /dev/null +++ b/include/uapi/linux/winesync.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Kernel support for Wine synchronization primitives + * + * Copyright (C) 2021 Zebediah Figura + */ + +#ifndef __LINUX_WINESYNC_H +#define __LINUX_WINESYNC_H + +#include + +struct winesync_sem_args { + __u32 sem; + __u32 count; + __u32 max; +}; + +struct winesync_mutex_args { + __u32 mutex; + __u32 owner; + __u32 count; +}; + +struct winesync_event_args { + __u32 event; + __u32 manual; + __u32 signaled; +}; + +struct winesync_wait_args { + __u64 timeout; + __u64 objs; + __u32 count; + __u32 owner; + __u32 index; + __u32 alert; +}; + +#define WINESYNC_IOC_BASE 0xf7 + +#define WINESYNC_IOC_CREATE_SEM _IOWR(WINESYNC_IOC_BASE, 0, \ + struct winesync_sem_args) +#define WINESYNC_IOC_DELETE _IOW (WINESYNC_IOC_BASE, 1, __u32) +#define WINESYNC_IOC_PUT_SEM _IOWR(WINESYNC_IOC_BASE, 2, \ + struct winesync_sem_args) +#define WINESYNC_IOC_WAIT_ANY _IOWR(WINESYNC_IOC_BASE, 3, \ + struct winesync_wait_args) +#define WINESYNC_IOC_WAIT_ALL _IOWR(WINESYNC_IOC_BASE, 4, \ + struct winesync_wait_args) +#define WINESYNC_IOC_CREATE_MUTEX _IOWR(WINESYNC_IOC_BASE, 5, \ + struct winesync_mutex_args) +#define WINESYNC_IOC_PUT_MUTEX _IOWR(WINESYNC_IOC_BASE, 6, \ + struct winesync_mutex_args) +#define WINESYNC_IOC_KILL_OWNER _IOW (WINESYNC_IOC_BASE, 7, __u32) +#define WINESYNC_IOC_READ_SEM _IOWR(WINESYNC_IOC_BASE, 8, \ + struct winesync_sem_args) +#define WINESYNC_IOC_READ_MUTEX _IOWR(WINESYNC_IOC_BASE, 9, \ + struct winesync_mutex_args) +#define WINESYNC_IOC_CREATE_EVENT _IOWR(WINESYNC_IOC_BASE, 10, \ + struct winesync_event_args) +#define WINESYNC_IOC_SET_EVENT _IOWR(WINESYNC_IOC_BASE, 11, \ + struct winesync_event_args) +#define WINESYNC_IOC_RESET_EVENT _IOWR(WINESYNC_IOC_BASE, 12, \ + struct winesync_event_args) +#define 
WINESYNC_IOC_PULSE_EVENT _IOWR(WINESYNC_IOC_BASE, 13, \ + struct winesync_event_args) +#define WINESYNC_IOC_READ_EVENT _IOWR(WINESYNC_IOC_BASE, 14, \ + struct winesync_event_args) + +#endif diff --git a/init/Kconfig b/init/Kconfig index e403a292563573..2a54d857a8ee30 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -132,6 +132,40 @@ config THREAD_INFO_IN_TASK menu "General setup" +config ZEN_INTERACTIVE + bool "Tune kernel for interactivity" + default y + help + Tunes the kernel for responsiveness at the cost of throughput and power usage. + + --- Block Layer ---------------------------------------- + + Default scheduler for SQ..: mq-deadline -> bfq + Default scheduler for MQ..: none -> kyber + + --- Virtual Memory Subsystem --------------------------- + + Background-reclaim hugepages...: no -> yes + MG-LRU minimum cache TTL.......: 0 -> 1000 ms + Compact unevictable............: yes -> no + Compaction proactiveness.......: 20 -> 0 + Watermark boost factor.........: 1.5 -> 0 + Pageblock order................: 10 -> 3 + Swap-in readahead..............: 3 -> 0 + + --- EEVDF CPU Scheduler -------------------------------- + + Minimal granularity............: 0.75 -> 0.4 ms + Bandwidth slice size...........: 5 -> 3 ms + Task rebalancing threshold.....: 32 -> 8 + Migration cost.................: 0.5 -> 0.25 ms + + --- CPUFreq Settings ----------------------------------- + + Ondemand sampling down factor..: 1 -> 5 + Ondemand default up threshold..: 80 -> 55 + Ondemand micro up threshold....: 95 -> 60 + config BROKEN bool diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index a8074079b09e87..26d6da72d494c6 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -142,6 +142,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd) case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_WAIT_MULTIPLE: return true; } return false; @@ -154,13 +155,79 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) return -EINVAL; *t = timespec64_to_ktime(*ts); - if (cmd == FUTEX_WAIT) + if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) *t = ktime_add_safe(ktime_get(), *t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); return 0; } +/** + * futex_read_wait_block - Read an array of futex_wait_block from userspace + * @uaddr: Userspace address of the block + * @count: Number of blocks to be read + * + * This function creates and allocate an array of futex_q (we zero it to + * initialize the fields) and then, for each futex_wait_block element from + * userspace, fill a futex_q element with proper values. 
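+ *
+ * For illustration only (futex_a, futex_b, expected_a, expected_b and rel
+ * are hypothetical, not part of this patch): userspace reaches this helper
+ * through the futex() syscall with cmd FUTEX_WAIT_MULTIPLE, passing the
+ * array address in uaddr, the element count in val and a relative timeout
+ * in utime, e.g.
+ *
+ *	struct futex_wait_block blocks[2] = {
+ *		{ .uaddr = &futex_a, .val = expected_a },
+ *		{ .uaddr = &futex_b, .val = expected_b },
+ *	};
+ *	struct timespec rel = { .tv_sec = 1 };
+ *
+ *	syscall(SYS_futex, blocks, FUTEX_WAIT_MULTIPLE, 2, &rel, NULL, 0);
+ *
+ * The call returns once one of the listed futexes is woken or the timeout
+ * expires.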
+ */ +inline struct futex_vector *futex_read_wait_block(u32 __user *uaddr, u32 count) +{ + unsigned int i; + struct futex_vector *futexv; + struct futex_wait_block fwb; + struct futex_wait_block __user *entry = + (struct futex_wait_block __user *)uaddr; + + if (!count || count > FUTEX_WAITV_MAX) + return ERR_PTR(-EINVAL); + + futexv = kcalloc(count, sizeof(*futexv), GFP_KERNEL); + if (!futexv) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { + kfree(futexv); + return ERR_PTR(-EFAULT); + } + + futexv[i].w.flags = FUTEX_32; + futexv[i].w.val = fwb.val; + futexv[i].w.uaddr = (uintptr_t) (fwb.uaddr); + futexv[i].q = futex_q_init; + } + + return futexv; +} + +int futex_wait_multiple(struct futex_vector *vs, unsigned int count, + struct hrtimer_sleeper *to); + +int futex_opcode_31(ktime_t *abs_time, u32 __user *uaddr, int count) +{ + int ret; + struct futex_vector *vs; + struct hrtimer_sleeper *to = NULL, timeout; + + to = futex_setup_timer(abs_time, &timeout, 0, 0); + + vs = futex_read_wait_block(uaddr, count); + + if (IS_ERR(vs)) + return PTR_ERR(vs); + + ret = futex_wait_multiple(vs, count, abs_time ? to : NULL); + kfree(vs); + + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + + return ret; +} + SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) @@ -180,6 +247,9 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } @@ -373,6 +443,9 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 2531f3496ab6d7..ab1b43161837be 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -112,6 +112,23 @@ config GENERIC_IRQ_RESERVATION_MODE config IRQ_FORCED_THREADING bool +config FORCE_IRQ_THREADING + bool "Make IRQ threading compulsory" + depends on IRQ_FORCED_THREADING + default n + help + + Make IRQ threading mandatory for any IRQ handlers that support it + instead of being optional and requiring the threadirqs kernel + parameter. Instead they can be optionally disabled with the + nothreadirqs kernel parameter. + + Enabling this may make some architectures not boot with runqueue + sharing and MuQSS. + + Enable if you are building for a desktop or low latency system, + otherwise say N. 
+ config SPARSE_IRQ bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ help diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a054cd5ec08bce..7b3f997fec3e37 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -25,7 +25,18 @@ #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_FORCE_IRQ_THREADING +DEFINE_STATIC_KEY_TRUE(force_irqthreads_key); +#else DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); +#endif + +static int __init setup_noforced_irqthreads(char *arg) +{ + static_branch_disable(&force_irqthreads_key); + return 0; +} +early_param("nothreadirqs", setup_noforced_irqthreads); static int __init setup_forced_irqthreads(char *arg) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b2e1009e5706ee..4a4bd8fd0a58fe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -75,8 +75,13 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_ZEN_INTERACTIVE +unsigned int sysctl_sched_base_slice = 400000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 400000ULL; +#else unsigned int sysctl_sched_base_slice = 750000ULL; static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; +#endif /* * After fork, child runs first. If set to 0 (default) then @@ -84,7 +89,11 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; */ unsigned int sysctl_sched_child_runs_first __read_mostly; +#ifdef CONFIG_ZEN_INTERACTIVE +const_debug unsigned int sysctl_sched_migration_cost = 250000UL; +#else const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#endif int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) @@ -135,8 +144,12 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ +#ifdef CONFIG_ZEN_INTERACTIVE +static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +#else static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#endif #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ @@ -7367,6 +7380,37 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool return idle_cpu; } +/* + * For the multiple-LLC per node case, make sure to try the other LLC's if the + * local LLC comes up empty. + */ +static int +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target) +{ + struct sched_domain *parent = sd->parent; + struct sched_group *sg; + + /* Make sure to not cross nodes. */ + if (!parent || parent->flags & SD_NUMA) + return -1; + + sg = parent->groups; + do { + int cpu = cpumask_first(sched_group_span(sg)); + + if (!cpus_share_cache(cpu, target)) { + int i = select_idle_cpu(p, per_cpu(sd_llc, cpu), + test_idle_cores(cpu), cpu); + if ((unsigned)i < nr_cpumask_bits) + return i; + } + + sg = sg->next; + } while (sg != parent->groups); + + return -1; +} + /* * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which * the task fits. 
If no CPU is big enough, but there are idle ones, try to @@ -7539,6 +7583,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; + if (sched_feat(SIS_NODE)) { + i = select_idle_node(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + } + return target; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index f770168230ae4a..03ed013b23fd7a 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -51,6 +51,7 @@ SCHED_FEAT(TTWU_QUEUE, true) */ SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true) +SCHED_FEAT(SIS_NODE, true) /* * Issue a WARN when we do multiple update_rq_clock() calls diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8cbbbea7fdbbd6..b10c780d114c70 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2518,7 +2518,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); -#ifdef CONFIG_PREEMPT_RT +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ZEN_INTERACTIVE) #define SCHED_NR_MIGRATE_BREAK 8 #else #define SCHED_NR_MIGRATE_BREAK 32 diff --git a/mm/Kconfig b/mm/Kconfig index 264a2df5ecf5b9..025833f2442694 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -653,7 +653,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION - default 0 if PREEMPT_RT + default 0 if PREEMPT_RT || ZEN_INTERACTIVE default 1 # diff --git a/mm/compaction.c b/mm/compaction.c index 61c741f11e9bb3..fc959a11fee2d7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1809,7 +1809,11 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE * aggressively the kernel should compact memory in the * background. It takes values in the range [0, 100]. */ +#ifdef CONFIG_ZEN_INTERACTIVE +static unsigned int __read_mostly sysctl_compaction_proactiveness; +#else static unsigned int __read_mostly sysctl_compaction_proactiveness = 20; +#endif static int sysctl_extfrag_threshold = 500; static int __read_mostly sysctl_compact_memory; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 79fbd6ddec49f5..21911157436db0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -61,7 +61,11 @@ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1<1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -286,7 +288,11 @@ const char * const migratetype_names[MIGRATE_TYPES] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +#ifdef CONFIG_ZEN_INTERACTIVE +static int watermark_boost_factor __read_mostly; +#else static int watermark_boost_factor __read_mostly = 15000; +#endif static int watermark_scale_factor = 10; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ @@ -2120,16 +2126,17 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, } /* - * Obtain a specified number of elements from the buddy allocator, all under - * a single hold of the lock, for efficiency. Add them to the supplied list. - * Returns the number of new pages which were placed at *list. + * Obtain a specified number of elements from the buddy allocator, and relax the + * zone lock when needed. Add them to the supplied list. Returns the number of + * new pages which were placed at *list. 
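+ *
+ * When the zone lock needs to be relaxed (need_resched() fires and
+ * rescheduling is safe, i.e. no preemption disabled and IRQs enabled, or
+ * another CPU is spinning on the lock), NR_FREE_PAGES is first decremented
+ * for the pages taken since the last update, so the zone counters stay
+ * consistent across the unlock/relock window.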
*/ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { + const bool can_resched = !preempt_count() && !irqs_disabled(); unsigned long flags; - int i; + int i, last_mod = 0; spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { @@ -2138,6 +2145,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (unlikely(page == NULL)) break; + /* Reschedule and ease the contention on the lock if needed */ + if (i + 1 < count && ((can_resched && need_resched()) || + spin_needbreak(&zone->lock))) { + __mod_zone_page_state(zone, NR_FREE_PAGES, + -((i + 1 - last_mod) << order)); + last_mod = i + 1; + spin_unlock_irqrestore(&zone->lock, flags); + if (can_resched) + cond_resched(); + spin_lock_irqsave(&zone->lock, flags); + } + /* * Split buddy pages returned by expand() are received here in * physical page order. The page is added to the tail of @@ -2154,7 +2173,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, -(1 << order)); } - __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); + __mod_zone_page_state(zone, NR_FREE_PAGES, -((i - last_mod) << order)); spin_unlock_irqrestore(&zone->lock, flags); return i; @@ -3916,6 +3935,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; unsigned int zonelist_iter_cookie; int reserve_flags; + bool woke_kswapd = false; restart: compaction_retries = 0; @@ -3955,8 +3975,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; } - if (alloc_flags & ALLOC_KSWAPD) + if (alloc_flags & ALLOC_KSWAPD) { + if (!woke_kswapd) { + atomic_long_inc(&kswapd_waiters); + woke_kswapd = true; + } wake_all_kswapds(order, gfp_mask, ac); + } /* * The adjusted alloc_flags might result in immediate success, so try @@ -4172,9 +4197,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto retry; } fail: - warn_alloc(gfp_mask, ac->nodemask, - "page allocation failure: order:%u", order); got_pg: + if (woke_kswapd) + atomic_long_dec(&kswapd_waiters); + if (!page) + warn_alloc(gfp_mask, ac->nodemask, + "page allocation failure: order:%u", order); return page; } diff --git a/mm/swap.c b/mm/swap.c index cd8f0150ba3aa8..ed791c2906aa51 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1090,6 +1090,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) */ void __init swap_setup(void) { +#ifdef CONFIG_ZEN_INTERACTIVE + /* Only swap-in pages requested, avoid readahead */ + page_cluster = 0; +#else unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ @@ -1101,4 +1105,5 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ +#endif } diff --git a/mm/vmscan.c b/mm/vmscan.c index 83fa8e924f8aea..60d2e009825f16 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4619,7 +4619,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc } /* to protect the working set of the last N jiffies */ +#ifdef CONFIG_ZEN_INTERACTIVE +static unsigned long lru_gen_min_ttl __read_mostly = HZ; +#else static unsigned long lru_gen_min_ttl __read_mostly; +#endif static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { @@ -6943,7 +6947,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, return 0; } -static bool allow_direct_reclaim(pg_data_t *pgdat) +static bool 
allow_direct_reclaim(pg_data_t *pgdat, bool using_kswapd) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -6972,6 +6976,10 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) wmark_ok = free_pages > pfmemalloc_reserve / 2; + /* The throttled direct reclaimer is now a kswapd waiter */ + if (unlikely(!using_kswapd && !wmark_ok)) + atomic_long_inc(&kswapd_waiters); + /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) @@ -7037,7 +7045,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (allow_direct_reclaim(pgdat)) + if (allow_direct_reclaim(pgdat, gfp_mask & __GFP_KSWAPD_RECLAIM)) goto out; break; } @@ -7059,11 +7067,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat), HZ); + allow_direct_reclaim(pgdat, true), HZ); else /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat)); + allow_direct_reclaim(pgdat, true)); + + if (unlikely(!(gfp_mask & __GFP_KSWAPD_RECLAIM))) + atomic_long_dec(&kswapd_waiters); if (fatal_signal_pending(current)) return true; @@ -7563,14 +7574,15 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - allow_direct_reclaim(pgdat)) + allow_direct_reclaim(pgdat, true)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); ret = try_to_freeze(); __fs_reclaim_acquire(_THIS_IP_); - if (ret || kthread_should_stop()) + if (ret || kthread_should_stop() || + !atomic_long_read(&kswapd_waiters)) break; /* diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 697f13bbbc3217..c594eb398fdf3b 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -18,6 +18,7 @@ TARGETS += drivers/dma-buf TARGETS += drivers/s390x/uvdevice TARGETS += drivers/net/bonding TARGETS += drivers/net/team +TARGETS += drivers/winesync TARGETS += efivarfs TARGETS += exec TARGETS += fchmodat2 diff --git a/tools/testing/selftests/drivers/winesync/Makefile b/tools/testing/selftests/drivers/winesync/Makefile new file mode 100644 index 00000000000000..43b39fdeea10ec --- /dev/null +++ b/tools/testing/selftests/drivers/winesync/Makefile @@ -0,0 +1,8 @@ +# SPDX-LICENSE-IDENTIFIER: GPL-2.0-only +TEST_GEN_PROGS := winesync + +top_srcdir =../../../../.. +CFLAGS += -I$(top_srcdir)/usr/include +LDLIBS += -lpthread + +include ../../lib.mk diff --git a/tools/testing/selftests/drivers/winesync/config b/tools/testing/selftests/drivers/winesync/config new file mode 100644 index 00000000000000..60539c826d0624 --- /dev/null +++ b/tools/testing/selftests/drivers/winesync/config @@ -0,0 +1 @@ +CONFIG_WINESYNC=y diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c new file mode 100644 index 00000000000000..169e922484b008 --- /dev/null +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -0,0 +1,1479 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Various unit tests for the "winesync" synchronization primitive driver. 
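+ * The tests talk to /dev/winesync directly, so they need a kernel built
+ * with CONFIG_WINESYNC (see the accompanying config fragment).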
+ * + * Copyright (C) 2021 Zebediah Figura + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "../../kselftest_harness.h" + +static int read_sem_state(int fd, __u32 sem, __u32 *count, __u32 *max) +{ + struct winesync_sem_args args; + int ret; + + args.sem = sem; + args.count = 0xdeadbeef; + args.max = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_READ_SEM, &args); + *count = args.count; + *max = args.max; + return ret; +} + +#define check_sem_state(fd, sem, count, max) \ + ({ \ + __u32 __count, __max; \ + int ret = read_sem_state((fd), (sem), &__count, &__max); \ + EXPECT_EQ(0, ret); \ + EXPECT_EQ((count), __count); \ + EXPECT_EQ((max), __max); \ + }) + +static int put_sem(int fd, __u32 sem, __u32 *count) +{ + struct winesync_sem_args args; + int ret; + + args.sem = sem; + args.count = *count; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &args); + *count = args.count; + return ret; +} + +static int read_mutex_state(int fd, __u32 mutex, __u32 *count, __u32 *owner) +{ + struct winesync_mutex_args args; + int ret; + + args.mutex = mutex; + args.count = 0xdeadbeef; + args.owner = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_READ_MUTEX, &args); + *count = args.count; + *owner = args.owner; + return ret; +} + +#define check_mutex_state(fd, mutex, count, owner) \ + ({ \ + __u32 __count, __owner; \ + int ret = read_mutex_state((fd), (mutex), &__count, &__owner); \ + EXPECT_EQ(0, ret); \ + EXPECT_EQ((count), __count); \ + EXPECT_EQ((owner), __owner); \ + }) + +static int put_mutex(int fd, __u32 mutex, __u32 owner, __u32 *count) +{ + struct winesync_mutex_args args; + int ret; + + args.mutex = mutex; + args.owner = owner; + args.count = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_PUT_MUTEX, &args); + *count = args.count; + return ret; +} + +static int read_event_state(int fd, __u32 event, __u32 *signaled, __u32 *manual) +{ + struct winesync_event_args args; + int ret; + + args.event = event; + args.signaled = 0xdeadbeef; + args.manual = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_READ_EVENT, &args); + *signaled = args.signaled; + *manual = args.manual; + return ret; +} + +#define check_event_state(fd, event, signaled, manual) \ + ({ \ + __u32 __signaled, __manual; \ + int ret = read_event_state((fd), (event), \ + &__signaled, &__manual); \ + EXPECT_EQ(0, ret); \ + EXPECT_EQ((signaled), __signaled); \ + EXPECT_EQ((manual), __manual); \ + }) + +static int wait_objs(int fd, unsigned long request, __u32 count, + const __u32 *objs, __u32 owner, __u32 alert, __u32 *index) +{ + struct winesync_wait_args args = {0}; + struct timespec timeout; + int ret; + + clock_gettime(CLOCK_MONOTONIC, &timeout); + + args.timeout = (uintptr_t)&timeout; + args.count = count; + args.objs = (uintptr_t)objs; + args.owner = owner; + args.index = 0xdeadbeef; + args.alert = alert; + ret = ioctl(fd, request, &args); + *index = args.index; + return ret; +} + +static int wait_any(int fd, __u32 count, const __u32 *objs, + __u32 owner, __u32 *index) +{ + return wait_objs(fd, WINESYNC_IOC_WAIT_ANY, + count, objs, owner, 0, index); +} + +static int wait_all(int fd, __u32 count, const __u32 *objs, + __u32 owner, __u32 *index) +{ + return wait_objs(fd, WINESYNC_IOC_WAIT_ALL, + count, objs, owner, 0, index); +} + +static int wait_any_alert(int fd, __u32 count, const __u32 *objs, + __u32 owner, __u32 alert, __u32 *index) +{ + return wait_objs(fd, WINESYNC_IOC_WAIT_ANY, + count, objs, owner, alert, index); +} + +static int wait_all_alert(int fd, __u32 count, const __u32 *objs, + __u32 owner, __u32 alert, 
__u32 *index) +{ + return wait_objs(fd, WINESYNC_IOC_WAIT_ALL, + count, objs, owner, alert, index); +} + +TEST(semaphore_state) +{ + struct winesync_sem_args sem_args; + struct timespec timeout; + __u32 sem, count, index; + int fd, ret; + + clock_gettime(CLOCK_MONOTONIC, &timeout); + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 3; + sem_args.max = 2; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + sem_args.count = 2; + sem_args.max = 2; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + check_sem_state(fd, sem, 2, 2); + + count = 0; + ret = put_sem(fd, sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + check_sem_state(fd, sem, 2, 2); + + count = 1; + ret = put_sem(fd, sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(fd, sem, 2, 2); + + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem, 1, 2); + + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem, 0, 2); + + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + count = 3; + ret = put_sem(fd, sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(fd, sem, 0, 2); + + count = 2; + ret = put_sem(fd, sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + check_sem_state(fd, sem, 2, 2); + + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(0, ret); + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(0, ret); + + count = 1; + ret = put_sem(fd, sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + check_sem_state(fd, sem, 1, 2); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &sem); + EXPECT_EQ(0, ret); + + close(fd); +} + +TEST(mutex_state) +{ + struct winesync_mutex_args mutex_args; + __u32 mutex, owner, count, index; + struct timespec timeout; + int fd, ret; + + clock_gettime(CLOCK_MONOTONIC, &timeout); + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + mutex_args.owner = 123; + mutex_args.count = 0; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + mutex_args.owner = 0; + mutex_args.count = 2; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + mutex_args.owner = 123; + mutex_args.count = 2; + mutex_args.mutex = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, mutex_args.mutex); + mutex = mutex_args.mutex; + check_mutex_state(fd, mutex, 2, 123); + + ret = put_mutex(fd, mutex, 0, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = put_mutex(fd, mutex, 456, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EPERM, errno); + check_mutex_state(fd, mutex, 2, 123); + + ret = put_mutex(fd, mutex, 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + check_mutex_state(fd, mutex, 1, 123); + + ret = put_mutex(fd, mutex, 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, count); + check_mutex_state(fd, mutex, 0, 0); + + ret = put_mutex(fd, mutex, 123, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EPERM, errno); + + ret = wait_any(fd, 1, &mutex, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_mutex_state(fd, mutex, 1, 456); + + ret = 
wait_any(fd, 1, &mutex, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_mutex_state(fd, mutex, 2, 456); + + ret = put_mutex(fd, mutex, 456, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + check_mutex_state(fd, mutex, 1, 456); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + owner = 0; + ret = ioctl(fd, WINESYNC_IOC_KILL_OWNER, &owner); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + owner = 123; + ret = ioctl(fd, WINESYNC_IOC_KILL_OWNER, &owner); + EXPECT_EQ(0, ret); + check_mutex_state(fd, mutex, 1, 456); + + owner = 456; + ret = ioctl(fd, WINESYNC_IOC_KILL_OWNER, &owner); + EXPECT_EQ(0, ret); + + mutex_args.count = 0xdeadbeef; + mutex_args.owner = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_READ_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, mutex_args.count); + EXPECT_EQ(0, mutex_args.owner); + + mutex_args.count = 0xdeadbeef; + mutex_args.owner = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_READ_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, mutex_args.count); + EXPECT_EQ(0, mutex_args.owner); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, index); + check_mutex_state(fd, mutex, 1, 123); + + owner = 123; + ret = ioctl(fd, WINESYNC_IOC_KILL_OWNER, &owner); + EXPECT_EQ(0, ret); + + mutex_args.count = 0xdeadbeef; + mutex_args.owner = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_READ_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, mutex_args.count); + EXPECT_EQ(0, mutex_args.owner); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, index); + check_mutex_state(fd, mutex, 1, 123); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex); + EXPECT_EQ(0, ret); + + mutex_args.owner = 0; + mutex_args.count = 0; + mutex_args.mutex = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, mutex_args.mutex); + mutex = mutex_args.mutex; + check_mutex_state(fd, mutex, 0, 0); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_mutex_state(fd, mutex, 1, 123); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); + EXPECT_EQ(0, ret); + + close(fd); +} + +TEST(manual_event_state) +{ + struct winesync_event_args event_args; + __u32 index; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + event_args.manual = 1; + event_args.signaled = 0; + event_args.event = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, event_args.event); + check_event_state(fd, event_args.event, 0, 1); + + event_args.signaled = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 1, 1); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, event_args.signaled); + check_event_state(fd, event_args.event, 1, 1); + + ret = wait_any(fd, 1, &event_args.event, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_event_state(fd, event_args.event, 1, 1); + + event_args.signaled = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, event_args.signaled); + 
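+	/* RESET_EVENT reported the previous (signaled) state; the
+	 * manual-reset event itself is now unsignaled */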
check_event_state(fd, event_args.event, 0, 1); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 1); + + ret = wait_any(fd, 1, &event_args.event, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, event_args.signaled); + check_event_state(fd, event_args.event, 0, 1); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 1); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + close(fd); +} + +TEST(auto_event_state) +{ + struct winesync_event_args event_args; + __u32 index; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + event_args.manual = 0; + event_args.signaled = 1; + event_args.event = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, event_args.event); + + check_event_state(fd, event_args.event, 1, 0); + + event_args.signaled = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, event_args.signaled); + check_event_state(fd, event_args.event, 1, 0); + + ret = wait_any(fd, 1, &event_args.event, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_event_state(fd, event_args.event, 0, 0); + + event_args.signaled = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 0); + + ret = wait_any(fd, 1, &event_args.event, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, event_args.signaled); + check_event_state(fd, event_args.event, 0, 0); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 0); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + close(fd); +} + +TEST(test_wait_any) +{ + struct winesync_mutex_args mutex_args = {0}; + struct winesync_wait_args wait_args = {0}; + struct winesync_sem_args sem_args = {0}; + __u32 objs[2], owner, index; + struct timespec timeout; + int fd, ret; + + clock_gettime(CLOCK_MONOTONIC, &timeout); + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 2; + sem_args.max = 3; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + + mutex_args.owner = 0; + mutex_args.count = 0; + mutex_args.mutex = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, mutex_args.mutex); + + objs[0] = sem_args.sem; + objs[1] = mutex_args.mutex; + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 1, 3); + check_mutex_state(fd, mutex_args.mutex, 0, 0); + + ret = wait_any(fd, 2, objs, 123, 
&index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 0, 3); + check_mutex_state(fd, mutex_args.mutex, 0, 0); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + check_sem_state(fd, sem_args.sem, 0, 3); + check_mutex_state(fd, mutex_args.mutex, 1, 123); + + sem_args.count = 1; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, sem_args.count); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 0, 3); + check_mutex_state(fd, mutex_args.mutex, 1, 123); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + check_sem_state(fd, sem_args.sem, 0, 3); + check_mutex_state(fd, mutex_args.mutex, 2, 123); + + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + owner = 123; + ret = ioctl(fd, WINESYNC_IOC_KILL_OWNER, &owner); + EXPECT_EQ(0, ret); + + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(1, index); + + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + + /* test waiting on the same object twice */ + sem_args.count = 2; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, sem_args.count); + + objs[0] = objs[1] = sem_args.sem; + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, wait_args.index); + check_sem_state(fd, sem_args.sem, 1, 3); + + ret = wait_any(fd, 0, NULL, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &sem_args.sem); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); + EXPECT_EQ(0, ret); + + close(fd); +} + +TEST(test_wait_all) +{ + struct winesync_event_args event_args = {0}; + struct winesync_mutex_args mutex_args = {0}; + struct winesync_sem_args sem_args = {0}; + __u32 objs[2], owner, index; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 2; + sem_args.max = 3; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + + mutex_args.owner = 0; + mutex_args.count = 0; + mutex_args.mutex = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, mutex_args.mutex); + + event_args.manual = true; + event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + objs[0] = sem_args.sem; + objs[1] = mutex_args.mutex; + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 1, 3); + check_mutex_state(fd, mutex_args.mutex, 1, 123); + + ret = wait_all(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + check_sem_state(fd, sem_args.sem, 1, 3); + check_mutex_state(fd, mutex_args.mutex, 1, 123); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 0, 3); + check_mutex_state(fd, mutex_args.mutex, 2, 123); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + check_sem_state(fd, sem_args.sem, 0, 3); + check_mutex_state(fd, mutex_args.mutex, 2, 123); + + sem_args.count = 3; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, 
&sem_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, sem_args.count); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 2, 3); + check_mutex_state(fd, mutex_args.mutex, 3, 123); + + owner = 123; + ret = ioctl(fd, WINESYNC_IOC_KILL_OWNER, &owner); + EXPECT_EQ(0, ret); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + check_sem_state(fd, sem_args.sem, 1, 3); + check_mutex_state(fd, mutex_args.mutex, 1, 123); + + objs[0] = sem_args.sem; + objs[1] = event_args.event; + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 0, 3); + check_event_state(fd, event_args.event, 1, 1); + + /* test waiting on the same object twice */ + objs[0] = objs[1] = sem_args.sem; + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &sem_args.sem); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + close(fd); +} + +TEST(invalid_objects) +{ + struct winesync_event_args event_args = {0}; + struct winesync_mutex_args mutex_args = {0}; + struct winesync_wait_args wait_args = {0}; + struct winesync_sem_args sem_args = {0}; + __u32 objs[2] = {0}; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_READ_SEM, &sem_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_PUT_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_READ_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_READ_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + wait_args.objs = (uintptr_t)objs; + wait_args.count = 1; + ret = ioctl(fd, WINESYNC_IOC_WAIT_ANY, &wait_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + ret = ioctl(fd, WINESYNC_IOC_WAIT_ALL, &wait_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &objs[0]); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + sem_args.max = 1; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + + mutex_args.mutex = sem_args.sem; + ret = ioctl(fd, WINESYNC_IOC_PUT_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_READ_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + event_args.event = sem_args.sem; + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_READ_EVENT, &event_args); + EXPECT_EQ(-1, ret); 
+ EXPECT_EQ(EINVAL, errno); + + objs[0] = sem_args.sem; + objs[1] = sem_args.sem + 1; + wait_args.count = 2; + ret = ioctl(fd, WINESYNC_IOC_WAIT_ANY, &wait_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + ret = ioctl(fd, WINESYNC_IOC_WAIT_ALL, &wait_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + objs[0] = sem_args.sem + 1; + objs[1] = sem_args.sem; + ret = ioctl(fd, WINESYNC_IOC_WAIT_ANY, &wait_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + ret = ioctl(fd, WINESYNC_IOC_WAIT_ALL, &wait_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &sem_args.sem); + EXPECT_EQ(0, ret); + + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + + sem_args.sem = mutex_args.mutex; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_READ_SEM, &sem_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); + EXPECT_EQ(0, ret); + + close(fd); +} + +struct wake_args +{ + int fd; + __u32 obj; +}; + +struct wait_args +{ + int fd; + unsigned long request; + struct winesync_wait_args *args; + int ret; + int err; +}; + +static void *wait_thread(void *arg) +{ + struct wait_args *args = arg; + + args->ret = ioctl(args->fd, args->request, args->args); + args->err = errno; + return NULL; +} + +static void get_abs_timeout(struct timespec *timeout, clockid_t clock, + unsigned int ms) +{ + clock_gettime(clock, timeout); + timeout->tv_nsec += ms * 1000000; + timeout->tv_sec += (timeout->tv_nsec / 1000000000); + timeout->tv_nsec %= 1000000000; +} + +static int wait_for_thread(pthread_t thread, unsigned int ms) +{ + struct timespec timeout; + get_abs_timeout(&timeout, CLOCK_REALTIME, ms); + return pthread_timedjoin_np(thread, NULL, &timeout); +} + +TEST(wake_any) +{ + struct winesync_event_args event_args = {0}; + struct winesync_mutex_args mutex_args = {0}; + struct winesync_wait_args wait_args = {0}; + struct winesync_sem_args sem_args = {0}; + struct wait_args thread_args; + __u32 objs[2], count, index; + struct timespec timeout; + pthread_t thread; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 0; + sem_args.max = 3; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + + mutex_args.owner = 123; + mutex_args.count = 1; + mutex_args.mutex = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, mutex_args.mutex); + + objs[0] = sem_args.sem; + objs[1] = mutex_args.mutex; + + /* test waking the semaphore */ + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + wait_args.timeout = (uintptr_t)&timeout; + wait_args.objs = (uintptr_t)objs; + wait_args.count = 2; + wait_args.owner = 456; + wait_args.index = 0xdeadbeef; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = WINESYNC_IOC_WAIT_ANY; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + sem_args.count = 1; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, sem_args.count); + check_sem_state(fd, sem_args.sem, 0, 3); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(0, wait_args.index); + + 
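+	/* The same pattern repeats below for the mutex and the events: a
+	 * helper thread blocks in WINESYNC_IOC_WAIT_ANY, the main thread
+	 * signals one object, and the woken thread's return value and
+	 * wait index are then checked. */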
/* test waking the mutex */ + + /* first grab it again for owner 123 */ + ret = wait_any(fd, 1, &mutex_args.mutex, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + wait_args.owner = 456; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = put_mutex(fd, mutex_args.mutex, 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + + ret = put_mutex(fd, mutex_args.mutex, 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, mutex_args.count); + check_mutex_state(fd, mutex_args.mutex, 1, 456); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + /* test waking events */ + + event_args.manual = false; + event_args.signaled = false; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + objs[1] = event_args.event; + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + event_args.manual = true; + event_args.signaled = false; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + objs[1] = event_args.event; + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 1, 1); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, event_args.signaled); + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 1); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + /* delete an 
object while it's being waited on */ + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 200); + wait_args.owner = 123; + objs[1] = mutex_args.mutex; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &sem_args.sem); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 200); + EXPECT_EQ(0, ret); + EXPECT_EQ(-1, thread_args.ret); + EXPECT_EQ(ETIMEDOUT, thread_args.err); + + close(fd); +} + +TEST(wake_all) +{ + struct winesync_event_args manual_event_args = {0}; + struct winesync_event_args auto_event_args = {0}; + struct winesync_mutex_args mutex_args = {0}; + struct winesync_wait_args wait_args = {0}; + struct winesync_sem_args sem_args = {0}; + struct wait_args thread_args; + __u32 objs[4], count, index; + struct timespec timeout; + pthread_t thread; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 0; + sem_args.max = 3; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + + mutex_args.owner = 123; + mutex_args.count = 1; + mutex_args.mutex = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, mutex_args.mutex); + + manual_event_args.manual = true; + manual_event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &manual_event_args); + EXPECT_EQ(0, ret); + + auto_event_args.manual = false; + auto_event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &auto_event_args); + EXPECT_EQ(0, ret); + + objs[0] = sem_args.sem; + objs[1] = mutex_args.mutex; + objs[2] = manual_event_args.event; + objs[3] = auto_event_args.event; + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + wait_args.timeout = (uintptr_t)&timeout; + wait_args.objs = (uintptr_t)objs; + wait_args.count = 4; + wait_args.owner = 456; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = WINESYNC_IOC_WAIT_ALL; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + sem_args.count = 1; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, sem_args.count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + + check_sem_state(fd, sem_args.sem, 1, 3); + + ret = wait_any(fd, 1, &sem_args.sem, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = put_mutex(fd, mutex_args.mutex, 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + + check_mutex_state(fd, mutex_args.mutex, 0, 0); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &manual_event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, manual_event_args.signaled); + + sem_args.count = 2; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, sem_args.count); + check_sem_state(fd, sem_args.sem, 2, 3); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &auto_event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, auto_event_args.signaled); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &manual_event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, manual_event_args.signaled); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &auto_event_args); + 
EXPECT_EQ(0, ret); + EXPECT_EQ(0, auto_event_args.signaled); + + check_sem_state(fd, sem_args.sem, 1, 3); + check_mutex_state(fd, mutex_args.mutex, 1, 456); + check_event_state(fd, manual_event_args.event, 1, 1); + check_event_state(fd, auto_event_args.event, 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + + /* delete an object while it's being waited on */ + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 200); + wait_args.owner = 123; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &sem_args.sem); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &manual_event_args.event); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &auto_event_args.event); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 200); + EXPECT_EQ(0, ret); + EXPECT_EQ(-1, thread_args.ret); + EXPECT_EQ(ETIMEDOUT, thread_args.err); + + close(fd); +} + +TEST(alert_any) +{ + struct winesync_event_args event_args = {0}; + struct winesync_wait_args wait_args = {0}; + struct winesync_sem_args sem_args = {0}; + struct wait_args thread_args; + struct timespec timeout; + __u32 objs[2], index; + pthread_t thread; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 0; + sem_args.max = 2; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + objs[0] = sem_args.sem; + + sem_args.count = 1; + sem_args.max = 2; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + objs[1] = sem_args.sem; + + event_args.manual = true; + event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + ret = wait_any_alert(fd, 0, NULL, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + + ret = wait_any_alert(fd, 0, NULL, 123, event_args.event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + + ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + + ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + /* test wakeup via alert */ + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + wait_args.timeout = (uintptr_t)&timeout; + wait_args.objs = (uintptr_t)objs; + wait_args.count = 2; + wait_args.owner = 123; + wait_args.index = 0xdeadbeef; + wait_args.alert = event_args.event; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = WINESYNC_IOC_WAIT_ANY; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(2, wait_args.index); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, 
&event_args.event); + EXPECT_EQ(0, ret); + + /* test with an auto-reset event */ + + event_args.manual = false; + event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + sem_args.sem = objs[0]; + sem_args.count = 1; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + + ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &objs[0]); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &objs[1]); + EXPECT_EQ(0, ret); + + close(fd); +} + +TEST(alert_all) +{ + struct winesync_event_args event_args = {0}; + struct winesync_wait_args wait_args = {0}; + struct winesync_sem_args sem_args = {0}; + struct wait_args thread_args; + struct timespec timeout; + __u32 objs[2], index; + pthread_t thread; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 2; + sem_args.max = 2; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + objs[0] = sem_args.sem; + + sem_args.count = 1; + sem_args.max = 2; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + objs[1] = sem_args.sem; + + event_args.manual = true; + event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + /* test wakeup via alert */ + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + wait_args.timeout = (uintptr_t)&timeout; + wait_args.objs = (uintptr_t)objs; + wait_args.count = 2; + wait_args.owner = 123; + wait_args.index = 0xdeadbeef; + wait_args.alert = event_args.event; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = WINESYNC_IOC_WAIT_ALL; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(2, wait_args.index); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + /* test with an auto-reset event */ + + event_args.manual = false; + event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + sem_args.sem = objs[1]; + sem_args.count = 2; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + + ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + ret = 
wait_all_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &objs[0]); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &objs[1]); + EXPECT_EQ(0, ret); + + close(fd); +} + +TEST_HARNESS_MAIN
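
For context, the ioctl sequence exercised by the selftests above can also be driven from an ordinary program. The following is a minimal sketch (illustrative only, not part of the patch) that creates a semaphore on /dev/winesync, posts one count, and waits on it; it assumes the series' uapi header is available as <linux/winesync.h> and trims most error handling:

/*
 * Minimal usage sketch for the winesync char device (illustrative only).
 * Assumption: the uapi header is installed as <linux/winesync.h>.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <time.h>
#include <unistd.h>
#include <linux/types.h>
#include <linux/winesync.h>

int main(void)
{
	struct winesync_sem_args sem_args = { .count = 0, .max = 1 };
	struct winesync_wait_args wait_args = { 0 };
	struct timespec timeout;
	__u32 objs[1];
	int fd;

	fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY);
	if (fd < 0)
		return 1;

	/* Create a semaphore with initial count 0 and maximum count 1. */
	if (ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args) < 0)
		return 1;
	objs[0] = sem_args.sem;

	/* Post one count; on return sem_args.count holds the previous count. */
	sem_args.count = 1;
	if (ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args) < 0)
		return 1;

	/* Wait with an absolute CLOCK_MONOTONIC deadline 100 ms from now. */
	clock_gettime(CLOCK_MONOTONIC, &timeout);
	timeout.tv_nsec += 100 * 1000000;
	timeout.tv_sec += timeout.tv_nsec / 1000000000;
	timeout.tv_nsec %= 1000000000;

	wait_args.timeout = (uintptr_t)&timeout;
	wait_args.objs = (uintptr_t)objs;
	wait_args.count = 1;
	wait_args.owner = 123;	/* arbitrary nonzero owner identifier */
	if (ioctl(fd, WINESYNC_IOC_WAIT_ANY, &wait_args) < 0)
		return 1;

	printf("wait satisfied by object index %u\n", wait_args.index);

	ioctl(fd, WINESYNC_IOC_DELETE, &objs[0]);
	close(fd);
	return 0;
}

As in the selftests, the timeout is passed as a pointer to an absolute CLOCK_MONOTONIC timespec, and the owner identifier is opaque to the driver except that zero means "unowned" for mutexes.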