diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b5aba0e5a69904..8d465478adb922 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -284,6 +284,16 @@ struct wq_flusher {
 
 struct wq_device;
 
+/*
+ * Unlike in a per-cpu workqueue where max_active limits its concurrency level
+ * on each CPU, in an unbound workqueue, max_active applies to the whole system.
+ * As sharing a single nr_active across multiple sockets can be very expensive,
+ * the counting and enforcement is per NUMA node.
+ */
+struct wq_node_nr_active {
+	atomic_t	nr;		/* per-node nr_active count */
+};
+
 /*
  * The externally visible workqueue. It relays the issued work items to
  * the appropriate worker_pool through its pool_workqueues.
@@ -330,6 +340,7 @@ struct workqueue_struct {
 	/* hot fields used during command issue, aligned to cacheline */
 	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
 	struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
+	struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
 };
 
 static struct kmem_cache *pwq_cache;
@@ -1425,6 +1436,31 @@ work_func_t wq_worker_last_func(struct task_struct *task)
 	return worker->last_func;
 }
 
+/**
+ * wq_node_nr_active - Determine wq_node_nr_active to use
+ * @wq: workqueue of interest
+ * @node: NUMA node, can be %NUMA_NO_NODE
+ *
+ * Determine wq_node_nr_active to use for @wq on @node. Returns:
+ *
+ * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
+ *
+ * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
+ *
+ * - Otherwise, node_nr_active[@node].
+ */
+static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
+						   int node)
+{
+	if (!(wq->flags & WQ_UNBOUND))
+		return NULL;
+
+	if (node == NUMA_NO_NODE)
+		node = nr_node_ids;
+
+	return wq->node_nr_active[node];
+}
+
 /**
  * get_pwq - get an extra reference on the specified pool_workqueue
  * @pwq: pool_workqueue to get
@@ -1506,12 +1542,17 @@ static bool pwq_activate_work(struct pool_workqueue *pwq,
 			      struct work_struct *work)
 {
 	struct worker_pool *pool = pwq->pool;
+	struct wq_node_nr_active *nna;
 
 	lockdep_assert_held(&pool->lock);
 
 	if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
 		return false;
 
+	nna = wq_node_nr_active(pwq->wq, pool->node);
+	if (nna)
+		atomic_inc(&nna->nr);
+
 	pwq->nr_active++;
 	__pwq_activate_work(pwq, work);
 	return true;
@@ -1528,14 +1569,18 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq)
 {
 	struct workqueue_struct *wq = pwq->wq;
 	struct worker_pool *pool = pwq->pool;
+	struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
 	bool obtained;
 
 	lockdep_assert_held(&pool->lock);
 
 	obtained = pwq->nr_active < READ_ONCE(wq->max_active);
 
-	if (obtained)
+	if (obtained) {
 		pwq->nr_active++;
+		if (nna)
+			atomic_inc(&nna->nr);
+	}
 
 	return obtained;
 }
@@ -1572,10 +1617,26 @@ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq)
 static void pwq_dec_nr_active(struct pool_workqueue *pwq)
 {
 	struct worker_pool *pool = pwq->pool;
+	struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);
 
 	lockdep_assert_held(&pool->lock);
 
+	/*
+	 * @pwq->nr_active should be decremented for both percpu and unbound
+	 * workqueues.
+	 */
 	pwq->nr_active--;
+
+	/*
+	 * For a percpu workqueue, it's simple. Just need to kick the first
+	 * inactive work item on @pwq itself.
+	 */
+	if (!nna) {
+		pwq_activate_first_inactive(pwq);
+		return;
+	}
+
+	atomic_dec(&nna->nr);
 	pwq_activate_first_inactive(pwq);
 }
 
@@ -4039,11 +4100,63 @@ static void wq_free_lockdep(struct workqueue_struct *wq)
 }
 #endif
 
+static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
+{
+	int node;
+
+	for_each_node(node) {
+		kfree(nna_ar[node]);
+		nna_ar[node] = NULL;
+	}
+
+	kfree(nna_ar[nr_node_ids]);
+	nna_ar[nr_node_ids] = NULL;
+}
+
+static void init_node_nr_active(struct wq_node_nr_active *nna)
+{
+	atomic_set(&nna->nr, 0);
+}
+
+/*
+ * Each node's nr_active counter will be accessed mostly from its own node and
+ * should be allocated in the node.
+ */
+static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
+{
+	struct wq_node_nr_active *nna;
+	int node;
+
+	for_each_node(node) {
+		nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
+		if (!nna)
+			goto err_free;
+		init_node_nr_active(nna);
+		nna_ar[node] = nna;
+	}
+
+	/* [nr_node_ids] is used as the fallback */
+	nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
+	if (!nna)
+		goto err_free;
+	init_node_nr_active(nna);
+	nna_ar[nr_node_ids] = nna;
+
+	return 0;
+
+err_free:
+	free_node_nr_active(nna_ar);
+	return -ENOMEM;
+}
+
 static void rcu_free_wq(struct rcu_head *rcu)
 {
 	struct workqueue_struct *wq =
 		container_of(rcu, struct workqueue_struct, rcu);
 
+	if (wq->flags & WQ_UNBOUND)
+		free_node_nr_active(wq->node_nr_active);
+
 	wq_free_lockdep(wq);
 	free_percpu(wq->cpu_pwq);
 	free_workqueue_attrs(wq->unbound_attrs);
@@ -4785,7 +4898,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 {
 	va_list args;
 	struct workqueue_struct *wq;
-	int len;
+	size_t wq_size;
+	int name_len;
 
 	/*
 	 * Unbound && max_active == 1 used to imply ordered, which is no longer
@@ -4801,7 +4915,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 		flags |= WQ_UNBOUND;
 
 	/* allocate wq and format name */
-	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+	if (flags & WQ_UNBOUND)
+		wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
+	else
+		wq_size = sizeof(*wq);
+
+	wq = kzalloc(wq_size, GFP_KERNEL);
 	if (!wq)
 		return NULL;
 
@@ -4812,11 +4931,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 	}
 
 	va_start(args, max_active);
-	len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
+	name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
 	va_end(args);
 
-	if (len >= WQ_NAME_LEN)
-		pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", wq->name);
+	if (name_len >= WQ_NAME_LEN)
+		pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
+			     wq->name);
 
 	max_active = max_active ?: WQ_DFL_ACTIVE;
 	max_active = wq_clamp_max_active(max_active, flags, wq->name);
@@ -4835,8 +4955,13 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 	wq_init_lockdep(wq);
 	INIT_LIST_HEAD(&wq->list);
 
+	if (flags & WQ_UNBOUND) {
+		if (alloc_node_nr_active(wq->node_nr_active) < 0)
+			goto err_unreg_lockdep;
+	}
+
 	if (alloc_and_link_pwqs(wq) < 0)
-		goto err_unreg_lockdep;
+		goto err_free_node_nr_active;
 
 	if (wq_online && init_rescuer(wq) < 0)
 		goto err_destroy;
@@ -4861,6 +4986,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 
 	return wq;
 
+err_free_node_nr_active:
+	if (wq->flags & WQ_UNBOUND)
+		free_node_nr_active(wq->node_nr_active);
 err_unreg_lockdep:
 	wq_unregister_lockdep(wq);
 	wq_free_lockdep(wq);
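
For reference, the indexing scheme above (one counter per node plus a trailing fallback slot at [nr_node_ids] for NUMA_NO_NODE) can be illustrated outside the kernel. The following is a minimal userspace C sketch, not kernel code: the names node_nr_active and NO_NODE, the two-node topology, and the plain calloc allocation (standing in for the node-affine kzalloc_node() done by alloc_node_nr_active()) are assumptions made only for the example.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for NUMA_NO_NODE; the value is illustrative, not the kernel's. */
#define NO_NODE			(-1)

/* Assumed topology for the sketch: two NUMA nodes. */
static const int nr_node_ids = 2;

struct node_nr_active {
	atomic_int nr;			/* per-node active count */
};

/*
 * Mirrors the lookup in wq_node_nr_active(): a real node indexes its own
 * slot, NO_NODE falls back to the extra slot at [nr_node_ids].
 */
static struct node_nr_active *node_nr_active(struct node_nr_active **ar,
					     int node)
{
	if (node == NO_NODE)
		node = nr_node_ids;
	return ar[node];
}

int main(void)
{
	/* nr_node_ids + 1 slots: one per node plus the NO_NODE fallback. */
	struct node_nr_active **ar = calloc(nr_node_ids + 1, sizeof(*ar));
	int node;

	if (!ar)
		return 1;

	for (node = 0; node <= nr_node_ids; node++) {
		ar[node] = calloc(1, sizeof(**ar));
		if (!ar[node])
			return 1;
		atomic_init(&ar[node]->nr, 0);
	}

	/* "Activate" two items on node 0 and one without a node hint. */
	atomic_fetch_add(&node_nr_active(ar, 0)->nr, 1);
	atomic_fetch_add(&node_nr_active(ar, 0)->nr, 1);
	atomic_fetch_add(&node_nr_active(ar, NO_NODE)->nr, 1);

	/* "Retire" one of the node 0 items. */
	atomic_fetch_sub(&node_nr_active(ar, 0)->nr, 1);

	for (node = 0; node <= nr_node_ids; node++)
		printf("slot %d: nr_active=%d\n", node,
		       atomic_load(&ar[node]->nr));

	for (node = 0; node <= nr_node_ids; node++)
		free(ar[node]);
	free(ar);
	return 0;
}

Keeping the fallback slot at the end of the same array is what lets wq_node_nr_active() stay a simple index lookup: NUMA_NO_NODE is remapped to nr_node_ids once, rather than being special-cased at every increment/decrement site.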