Skip to content

Commit

Permalink
perf/core: Make cgroup switch visit only cpuctxs with cgroup events
Browse files Browse the repository at this point in the history
This patch follows from a conversation in CQM/CMT's last series about
speeding up the context switch for cgroup events:

  https://patchwork.kernel.org/patch/9478617/

This is a low-hanging fruit optimization. It replaces the iteration over
the "pmus" list in cgroup switch by an iteration over a new list that
contains only cpuctxs with at least one cgroup event.

This is necessary because the number of PMUs have increased over the years
e.g modern x86 server systems have well above 50 PMUs.

The iteration over the full PMU list is unneccessary and can be costly in
heavy cache contention scenarios.

Below are some instrumentation measurements with 10, 50 and 90 percentiles
of the total cost of context switch before and after this optimization for
a simple array read/write microbenchark.

  Contention
    Level    Nr events      Before (us)            After (us)       Median
  L2    L3     types      (10%, 50%, 90%)       (10%, 50%, 90%     Speedup
  --------------------------------------------------------------------------
  Low   Low       1       (1.72, 2.42, 5.85)    (1.35, 1.64, 5.46)     29%
  High  Low       1       (2.08, 4.56, 19.8)    (1720, 2.20, 13.7)     51%
  High  High      1       (2.86, 10.4, 12.7)    (2.54, 4.32, 12.1)     58%

  Low   Low       2       (1.98, 3.20, 6.89)    (1.68, 2.41, 8.89)     24%
  High  Low       2       (2.48, 5.28, 22.4)    (2150, 3.69, 14.6)     30%
  High  High      2       (3.32, 8.09, 13.9)    (2.80, 5.15, 13.7)     36%

where:

  1 event type  = cycles
  2 event types = cycles,intel_cqm/llc_occupancy/

   Contention L2 Low: workset  <  L2 cache size.
                 High:  "     >>  L2   "     " .
   Contention L3 Low: workset of task on all sockets  <  L3 cache size.
                 High:   "     "   "   "   "    "    >>  L3   "     " .

   Median Speedup is (50%ile Before - 50%ile After) /  50%ile Before

Unsurprisingly, the benefits of this optimization decrease with the number
of cpuctxs with a cgroup events, yet, is never detrimental.

Tested-by: Mark Rutland <[email protected]>
Signed-off-by: David Carrillo-Cisneros <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Mark Rutland <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Kan Liang <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Paul Turner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Srinivas Pandruvada <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vikas Shivappa <[email protected]>
Cc: Vince Weaver <[email protected]>
Cc: Vince Weaver <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
  • Loading branch information
David Carrillo-Cisneros authored and Ingo Molnar committed Jan 30, 2017
1 parent ae5112a commit 058fe1c
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 53 deletions.
1 change: 1 addition & 0 deletions include/linux/perf_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,7 @@ struct perf_cpu_context {
struct pmu *unique_pmu;
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
struct list_head cgrp_cpuctx_entry;
#endif

struct list_head sched_cb_entry;
Expand Down
98 changes: 45 additions & 53 deletions kernel/events/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
info->timestamp = ctx->timestamp;
}

static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);

#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */

Expand All @@ -690,61 +692,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
struct perf_cpu_context *cpuctx;
struct pmu *pmu;
struct list_head *list;
unsigned long flags;

/*
* disable interrupts to avoid geting nr_cgroup
* changes via __perf_event_disable(). Also
* avoids preemption.
* Disable interrupts and preemption to avoid this CPU's
* cgrp_cpuctx_entry to change under us.
*/
local_irq_save(flags);

/*
* we reschedule only in the presence of cgroup
* constrained events.
*/
list = this_cpu_ptr(&cgrp_cpuctx_list);
list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
if (cpuctx->unique_pmu != pmu)
continue; /* ensure we process each cpuctx once */
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);

/*
* perf_cgroup_events says at least one
* context on this CPU has cgroup events.
*
* ctx->nr_cgroups reports the number of cgroup
* events for a context.
*/
if (cpuctx->ctx.nr_cgroups > 0) {
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);

if (mode & PERF_CGROUP_SWOUT) {
cpu_ctx_sched_out(cpuctx, EVENT_ALL);
/*
* must not be done before ctxswout due
* to event_filter_match() in event_sched_out()
*/
cpuctx->cgrp = NULL;
}
if (mode & PERF_CGROUP_SWOUT) {
cpu_ctx_sched_out(cpuctx, EVENT_ALL);
/*
* must not be done before ctxswout due
* to event_filter_match() in event_sched_out()
*/
cpuctx->cgrp = NULL;
}

if (mode & PERF_CGROUP_SWIN) {
WARN_ON_ONCE(cpuctx->cgrp);
/*
* set cgrp before ctxsw in to allow
* event_filter_match() to not have to pass
* task around
* we pass the cpuctx->ctx to perf_cgroup_from_task()
* because cgorup events are only per-cpu
*/
cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
}
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
if (mode & PERF_CGROUP_SWIN) {
WARN_ON_ONCE(cpuctx->cgrp);
/*
* set cgrp before ctxsw in to allow
* event_filter_match() to not have to pass
* task around
* we pass the cpuctx->ctx to perf_cgroup_from_task()
* because cgorup events are only per-cpu
*/
cpuctx->cgrp = perf_cgroup_from_task(task,
&cpuctx->ctx);
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
}
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

local_irq_restore(flags);
Expand Down Expand Up @@ -889,6 +876,7 @@ list_update_cgroup_event(struct perf_event *event,
struct perf_event_context *ctx, bool add)
{
struct perf_cpu_context *cpuctx;
struct list_head *cpuctx_entry;

if (!is_cgroup_event(event))
return;
Expand All @@ -902,15 +890,16 @@ list_update_cgroup_event(struct perf_event *event,
* this will always be called from the right CPU.
*/
cpuctx = __get_cpu_context(ctx);

/*
* cpuctx->cgrp is NULL until a cgroup event is sched in or
* ctx->nr_cgroup == 0 .
*/
if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
cpuctx->cgrp = event->cgrp;
else if (!add)
cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
/* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
if (add) {
list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
if (perf_cgroup_from_task(current, ctx) == event->cgrp)
cpuctx->cgrp = event->cgrp;
} else {
list_del(cpuctx_entry);
cpuctx->cgrp = NULL;
}
}

#else /* !CONFIG_CGROUP_PERF */
Expand Down Expand Up @@ -10709,6 +10698,9 @@ static void __init perf_event_init_all_cpus(void)
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

#ifdef CONFIG_CGROUP_PERF
INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}
Expand Down

0 comments on commit 058fe1c

Please sign in to comment.