Skip to content

Commit

Permalink
Merge pull request raspberrypi#120 from sched-ext/htejun
Browse files Browse the repository at this point in the history
scx: Sync from scx repo
  • Loading branch information
htejun authored Jan 11, 2024
2 parents 4361d23 + dfb1210 commit 6eb6c92
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 41 deletions.
92 changes: 62 additions & 30 deletions tools/sched_ext/scx_flatcg.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@
#include <scx/common.bpf.h>
#include "scx_flatcg.h"

/*
* Maximum amount of retries to find a valid cgroup.
*/
#define CGROUP_MAX_RETRIES 1024

char _license[] SEC("license") = "GPL";

const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
Expand Down Expand Up @@ -302,6 +307,17 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
bpf_spin_unlock(&cgv_tree_lock);
}

static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc)
{
/*
* Tell fcg_stopping() that this bypassed the regular scheduling path
* and should be force charged to the cgroup. 0 is used to indicate that
* the task isn't bypassing, so if the current runtime is 0, go back by
* one nanosecond.
*/
taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
}

s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
struct fcg_task_ctx *taskc;
Expand All @@ -319,35 +335,12 @@ s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake
/*
* If select_cpu_dfl() is recommending local enqueue, the target CPU is
* idle. Follow it and charge the cgroup later in fcg_stopping() after
* the fact. Use the same mechanism to deal with tasks with custom
* affinities so that we don't have to worry about per-cgroup dq's
* containing tasks that can't be executed from some CPUs.
* the fact.
*/
if (is_idle || p->nr_cpus_allowed != nr_cpus) {
/*
* Tell fcg_stopping() that this bypassed the regular scheduling
* path and should be force charged to the cgroup. 0 is used to
* indicate that the task isn't bypassing, so if the current
* runtime is 0, go back by one nanosecond.
*/
taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;

/*
* The global dq is deprioritized as we don't want to let tasks
* to boost themselves by constraining its cpumask. The
* deprioritization is rather severe, so let's not apply that to
* per-cpu kernel threads. This is ham-fisted. We probably wanna
* implement per-cgroup fallback dq's instead so that we have
* more control over when tasks with custom cpumask get issued.
*/
if (is_idle ||
(p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) {
stat_inc(FCG_STAT_LOCAL);
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
} else {
stat_inc(FCG_STAT_GLOBAL);
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
}
if (is_idle) {
set_bypassed_at(p, taskc);
stat_inc(FCG_STAT_LOCAL);
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
}

return cpu;
Expand All @@ -365,6 +358,32 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
return;
}

/*
* Use the direct dispatching and force charging to deal with tasks with
* custom affinities so that we don't have to worry about per-cgroup
* dq's containing tasks that can't be executed from some CPUs.
*/
if (p->nr_cpus_allowed != nr_cpus) {
set_bypassed_at(p, taskc);

/*
* The global dq is deprioritized as we don't want to let tasks
* to boost themselves by constraining its cpumask. The
* deprioritization is rather severe, so let's not apply that to
* per-cpu kernel threads. This is ham-fisted. We probably wanna
* implement per-cgroup fallback dq's instead so that we have
* more control over when tasks with custom cpumask get issued.
*/
if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) {
stat_inc(FCG_STAT_LOCAL);
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
} else {
stat_inc(FCG_STAT_GLOBAL);
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}
return;
}

cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp);
if (!cgc)
Expand Down Expand Up @@ -691,6 +710,7 @@ static bool try_pick_next_cgroup(u64 *cgidp)
bpf_spin_lock(&cgv_tree_lock);
bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
bpf_spin_unlock(&cgv_tree_lock);
stat_inc(FCG_STAT_PNC_RACE);
} else {
cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
if (cgv_node) {
Expand All @@ -712,6 +732,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
struct fcg_cgrp_ctx *cgc;
struct cgroup *cgrp;
u64 now = bpf_ktime_get_ns();
bool picked_next = false;

cpuc = find_cpu_ctx();
if (!cpuc)
Expand Down Expand Up @@ -766,10 +787,21 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
return;
}

bpf_repeat(BPF_MAX_LOOPS) {
if (try_pick_next_cgroup(&cpuc->cur_cgid))
bpf_repeat(CGROUP_MAX_RETRIES) {
if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
picked_next = true;
break;
}
}

/*
* This only happens if try_pick_next_cgroup() races against enqueue
* path for more than CGROUP_MAX_RETRIES times, which is extremely
* unlikely and likely indicates an underlying bug. There shouldn't be
* any stall risk as the race is against enqueue.
*/
if (!picked_next)
stat_inc(FCG_STAT_PNC_FAIL);
}

s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
Expand Down
24 changes: 13 additions & 11 deletions tools/sched_ext/scx_flatcg.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,29 +186,31 @@ int main(int argc, char **argv)

printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
seq++, cpu_util * 100.0, skel->data->hweight_gen);
printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n",
printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n",
stats[FCG_STAT_ACT],
stats[FCG_STAT_DEACT],
stats[FCG_STAT_LOCAL],
stats[FCG_STAT_GLOBAL]);
printf("HWT skip:%6llu race:%6llu cache:%6llu update:%6llu\n",
stats[FCG_STAT_HWT_SKIP],
stats[FCG_STAT_HWT_RACE],
stats[FCG_STAT_GLOBAL],
stats[FCG_STAT_LOCAL]);
printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n",
stats[FCG_STAT_HWT_CACHE],
stats[FCG_STAT_HWT_UPDATES]);
stats[FCG_STAT_HWT_UPDATES],
stats[FCG_STAT_HWT_SKIP],
stats[FCG_STAT_HWT_RACE]);
printf("ENQ skip:%6llu race:%6llu\n",
stats[FCG_STAT_ENQ_SKIP],
stats[FCG_STAT_ENQ_RACE]);
printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
stats[FCG_STAT_CNS_KEEP],
stats[FCG_STAT_CNS_EXPIRE],
stats[FCG_STAT_CNS_EMPTY],
stats[FCG_STAT_CNS_GONE]);
printf("PNC nocgrp:%6llu next:%6llu empty:%6llu gone:%6llu\n",
stats[FCG_STAT_PNC_NO_CGRP],
printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n",
stats[FCG_STAT_PNC_NEXT],
stats[FCG_STAT_PNC_EMPTY],
stats[FCG_STAT_PNC_GONE]);
stats[FCG_STAT_PNC_NO_CGRP],
stats[FCG_STAT_PNC_GONE],
stats[FCG_STAT_PNC_RACE],
stats[FCG_STAT_PNC_FAIL]);
printf("BAD remove:%6llu\n",
acc_stats[FCG_STAT_BAD_REMOVAL]);
fflush(stdout);
Expand Down
2 changes: 2 additions & 0 deletions tools/sched_ext/scx_flatcg.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ enum fcg_stat_idx {
FCG_STAT_PNC_NEXT,
FCG_STAT_PNC_EMPTY,
FCG_STAT_PNC_GONE,
FCG_STAT_PNC_RACE,
FCG_STAT_PNC_FAIL,

FCG_STAT_BAD_REMOVAL,

Expand Down

0 comments on commit 6eb6c92

Please sign in to comment.