diff --git a/patch-Revert-xen-credit2-limit-the-max-number-of-CPUs-in-a.patch b/patch-Revert-xen-credit2-limit-the-max-number-of-CPUs-in-a.patch
new file mode 100644
index 00000000..d1c84031
--- /dev/null
+++ b/patch-Revert-xen-credit2-limit-the-max-number-of-CPUs-in-a.patch
@@ -0,0 +1,296 @@
+From cca3d7e05bc31c03bf573a1572aa0e913fc6cd94 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
+ <marmarek@invisiblethingslab.com>
+Date: Sat, 3 Oct 2020 14:48:45 +0200
+Subject: [PATCH] Revert "xen: credit2: limit the max number of CPUs in a
+ runqueue"
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Organization: Invisible Things Lab
+Cc: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
+
+This triggers assert failure in sched2.c:2273. The bug itself is
+somewhere else (under investigation), for now revert the commit that
+makes it much more likely to happen on S3 resume.
+
+This reverts commit 8e2aa76dc1670e82eaa15683353853bc66bf54fc.
+
+Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
+---
+ docs/misc/xen-command-line.pandoc |  14 ---
+ xen/common/sched/credit2.c        | 145 ++----------------------------
+ xen/include/asm-arm/cpufeature.h  |   5 --
+ xen/include/asm-x86/processor.h   |   5 --
+ 4 files changed, 6 insertions(+), 163 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index 4ae9391fcd3d..1f754d3ec1d7 100644
+--- a/docs/misc/xen-command-line.pandoc
++++ b/docs/misc/xen-command-line.pandoc
+@@ -1880,20 +1880,6 @@ with read and write permissions.
+
+ Choose the default scheduler.
+
+-### sched_credit2_max_cpus_runqueue
+-> `= `
+-
+-> Default: `16`
+-
+-Defines how many CPUs will be put, at most, in each Credit2 runqueue.
+-
+-Runqueues are still arranged according to the host topology (and following
+-what indicated by the 'credit2_runqueue' parameter). But we also have a cap
+-to the number of CPUs that share each runqueues.
+-
+-A value that is a submultiple of the number of online CPUs is recommended,
+-as that would likely produce a perfectly balanced runqueue configuration.
+-
+ ### sched_credit2_migrate_resist
+ > `= `
+
+diff --git a/xen/common/sched/credit2.c b/xen/common/sched/credit2.c
+index eb5e5a78c5e7..8a4f28b9f546 100644
+--- a/xen/common/sched/credit2.c
++++ b/xen/common/sched/credit2.c
+@@ -25,8 +25,6 @@
+ #include
+ #include
+ #include
+-#include
+-#include
+
+ #include "private.h"
+
+@@ -473,22 +471,6 @@ static int __init parse_credit2_runqueue(const char *s)
+ }
+ custom_param("credit2_runqueue", parse_credit2_runqueue);
+
+-/*
+- * How many CPUs will be put, at most, in each runqueue.
+- *
+- * Runqueues are still arranged according to the host topology (and according
+- * to the value of the 'credit2_runqueue' parameter). But we also have a cap
+- * to the number of CPUs that share runqueues.
+- *
+- * This should be considered an upper limit. In fact, we also try to balance
+- * the number of CPUs in each runqueue. And, when doing that, it is possible
+- * that fewer CPUs than what this parameters mandates will actually be put
+- * in each runqueue.
+- */
+-#define MAX_CPUS_RUNQ 16
+-static unsigned int __read_mostly opt_max_cpus_runqueue = MAX_CPUS_RUNQ;
+-integer_param("sched_credit2_max_cpus_runqueue", opt_max_cpus_runqueue);
+-
+ /*
+  * Per-runqueue data
+  */
+@@ -870,83 +852,18 @@ cpu_runqueue_match(const struct csched2_runqueue_data *rqd, unsigned int cpu)
+            (opt_runqueue == OPT_RUNQUEUE_NODE && same_node(peer_cpu, cpu));
+ }
+
+-/*
+- * Additional checks, to avoid separating siblings in different runqueues.
+- * This deals with both Intel's HTs and AMD's CUs. An arch that does not have
+- * any similar concept will just have cpu_nr_siblings() always return 1, and
+- * setup the cpu_sibling_mask-s acordingly (as currently does ARM), and things
+- * will just work as well.
+- */
+-static bool
+-cpu_runqueue_siblings_match(const struct csched2_runqueue_data *rqd,
+-                            unsigned int cpu, unsigned int max_cpus_runq)
+-{
+-    unsigned int nr_sibls = cpu_nr_siblings(cpu);
+-    unsigned int rcpu, tot_sibls = 0;
+-
+-    /*
+-     * If we put the CPU in this runqueue, we must be sure that there will
+-     * be enough room for accepting its sibling(s) as well.
+-     */
+-    cpumask_clear(cpumask_scratch_cpu(cpu));
+-    for_each_cpu ( rcpu, &rqd->active )
+-    {
+-        ASSERT(rcpu != cpu);
+-        if ( !cpumask_intersects(per_cpu(cpu_sibling_mask, rcpu), cpumask_scratch_cpu(cpu)) )
+-        {
+-            /*
+-             * For each CPU already in the runqueue, account for it and for
+-             * its sibling(s), independently from whether they are in the
+-             * runqueue or not. Of course, we do this only once, for each CPU
+-             * that is already inside the runqueue and all its siblings!
+-             *
+-             * This way, even if there are CPUs in the runqueue with siblings
+-             * in a different cpupools, we still count all of them here.
+-             * The reason for this is that, if at some future point we will
+-             * move those sibling CPUs to this cpupool, we want them to land
+-             * in this runqueue. Hence we must be sure to leave space for them.
+-             */
+-            cpumask_or(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+-                       per_cpu(cpu_sibling_mask, rcpu));
+-            tot_sibls += cpu_nr_siblings(rcpu);
+-        }
+-    }
+-    /*
+-     * We know that neither the CPU, nor any of its sibling are here,
+-     * or we wouldn't even have entered the function.
+-     */
+-    ASSERT(!cpumask_intersects(cpumask_scratch_cpu(cpu),
+-                               per_cpu(cpu_sibling_mask, cpu)));
+-
+-    /* Try adding CPU and its sibling(s) to the count and check... */
+-    return tot_sibls + nr_sibls <= max_cpus_runq;
+-}
+-
+ static struct csched2_runqueue_data *
+-cpu_add_to_runqueue(const struct scheduler *ops, unsigned int cpu)
++cpu_add_to_runqueue(struct csched2_private *prv, unsigned int cpu)
+ {
+-    struct csched2_private *prv = csched2_priv(ops);
+     struct csched2_runqueue_data *rqd, *rqd_new;
+-    struct csched2_runqueue_data *rqd_valid = NULL;
+     struct list_head *rqd_ins;
+     unsigned long flags;
+     int rqi = 0;
+-    unsigned int min_rqs, max_cpus_runq;
+-    bool rqi_unused = false;
++    bool rqi_unused = false, rqd_valid = false;
+
+     /* Prealloc in case we need it - not allowed with interrupts off. */
+     rqd_new = xzalloc(struct csched2_runqueue_data);
+
+-    /*
+-     * While respecting the limit of not having more than the max number of
+-     * CPUs per runqueue, let's also try to "spread" the CPU, as evenly as
+-     * possible, among the runqueues. For doing that, we need to know upfront
+-     * how many CPUs we have, so let's use the number of CPUs that are online
+-     * for that.
+-     */
+-    min_rqs = ((num_online_cpus() - 1) / opt_max_cpus_runqueue) + 1;
+-    max_cpus_runq = num_online_cpus() / min_rqs;
+-
+     write_lock_irqsave(&prv->lock, flags);
+
+     rqd_ins = &prv->rql;
+@@ -956,59 +873,10 @@ cpu_add_to_runqueue(const struct scheduler *ops, unsigned int cpu)
+         if ( !rqi_unused && rqd->id > rqi )
+             rqi_unused = true;
+
+-        /*
+-         * First of all, let's check whether, according to the system
+-         * topology, this CPU belongs in this runqueue.
+-         */
+         if ( cpu_runqueue_match(rqd, cpu) )
+         {
+-            /*
+-             * If the CPU has any siblings, they are online and they are
+-             * being added to this cpupool, always keep them together. Even
+-             * if that means violating what the opt_max_cpus_runqueue param
+-             * indicates. However, if this happens, chances are high that a
+-             * too small value was used for the parameter, so warn the user
+-             * about that.
+-             *
+-             * Note that we cannot check this once and for all, say, during
+-             * scheduler initialization. In fact, at least in theory, the
+-             * number of siblings a CPU has may not be the same for all the
+-             * CPUs.
+-             */
+-            if ( cpumask_intersects(&rqd->active, per_cpu(cpu_sibling_mask, cpu)) )
+-            {
+-                if ( cpumask_weight(&rqd->active) >= opt_max_cpus_runqueue )
+-                {
+-                    printk("WARNING: %s: more than opt_max_cpus_runqueue "
+-                           "in a runqueue (%u vs %u), due to topology constraints.\n"
+-                           "Consider raising it!\n",
+-                           __func__, opt_max_cpus_runqueue,
+-                           cpumask_weight(&rqd->active));
+-                }
+-                rqd_valid = rqd;
+-                break;
+-            }
+-
+-            /*
+-             * If we're using core (or socket) scheduling, no need to do any
+-             * further checking beyond the number of CPUs already in this
+-             * runqueue respecting our upper bound.
+-             *
+-             * Otherwise, let's try to make sure that siblings stay in the
+-             * same runqueue, pretty much under any cinrcumnstances.
+-             */
+-            if ( rqd->refcnt < max_cpus_runq && (ops->cpupool->gran != SCHED_GRAN_cpu ||
+-                  cpu_runqueue_siblings_match(rqd, cpu, max_cpus_runq)) )
+-            {
+-                /*
+-                 * This runqueue is ok, but as we said, we also want an even
+-                 * distribution of the CPUs. So, unless this is the very first
+-                 * match, we go on, check all runqueues and actually add the
+-                 * CPU into the one that is less full.
+-                 */
+-                if ( !rqd_valid || rqd->refcnt < rqd_valid->refcnt )
+-                    rqd_valid = rqd;
+-            }
++            rqd_valid = true;
++            break;
+         }
+
+         if ( !rqi_unused )
+@@ -1032,8 +900,6 @@ cpu_add_to_runqueue(const struct scheduler *ops, unsigned int cpu)
+         rqd->pick_bias = cpu;
+         rqd->id = rqi;
+     }
+-    else
+-        rqd = rqd_valid;
+
+     rqd->refcnt++;
+
+@@ -3878,6 +3744,7 @@ csched2_dump(const struct scheduler *ops)
+ static void *
+ csched2_alloc_pdata(const struct scheduler *ops, int cpu)
+ {
++    struct csched2_private *prv = csched2_priv(ops);
+     struct csched2_pcpu *spc;
+     struct csched2_runqueue_data *rqd;
+
+@@ -3887,7 +3754,7 @@ csched2_alloc_pdata(const struct scheduler *ops, int cpu)
+     if ( spc == NULL )
+         return ERR_PTR(-ENOMEM);
+
+-    rqd = cpu_add_to_runqueue(ops, cpu);
++    rqd = cpu_add_to_runqueue(prv, cpu);
+     if ( IS_ERR(rqd) )
+     {
+         xfree(spc);
+diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h
+index 10878ead8a27..6bff5ce13110 100644
+--- a/xen/include/asm-arm/cpufeature.h
++++ b/xen/include/asm-arm/cpufeature.h
+@@ -64,11 +64,6 @@ static inline bool cpus_have_cap(unsigned int num)
+     return test_bit(num, cpu_hwcaps);
+ }
+
+-static inline int cpu_nr_siblings(unsigned int cpu)
+-{
+-    return 1;
+-}
+-
+ /* System capability check for constant cap */
+ #define cpus_have_const_cap(num) ({                 \
+         register_t __ret;                           \
+diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
+index 9acb80fdcd37..badd7e60e506 100644
+--- a/xen/include/asm-x86/processor.h
++++ b/xen/include/asm-x86/processor.h
+@@ -171,11 +171,6 @@ extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
+
+ unsigned int apicid_to_socket(unsigned int);
+
+-static inline int cpu_nr_siblings(unsigned int cpu)
+-{
+-    return cpu_data[cpu].x86_num_siblings;
+-}
+-
+ /*
+  * Generic CPUID function
+  * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+--
+2.25.4
+
diff --git a/patch-x86-S3-Fix-Shadow-Stack-resume-path.patch b/patch-x86-S3-Fix-Shadow-Stack-resume-path.patch
new file mode 100644
index 00000000..bc685c37
--- /dev/null
+++ b/patch-x86-S3-Fix-Shadow-Stack-resume-path.patch
@@ -0,0 +1,37 @@
+From bd032cbd664a229603592d595ce562dc1a22d158 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
+ <marmarek@invisiblethingslab.com>
+Date: Sun, 27 Sep 2020 17:29:35 +0200
+Subject: [PATCH] x86/S3: Fix Shadow Stack resume path
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Organization: Invisible Things Lab
+Cc: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
+
+Fix the resume path to load the shadow stack pointer from saved_ssp (not
+saved_rsp), to match what the suspend path does.
+
+Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
+Fixes: 633ecc4a7cb2 ("x86/S3: Save and restore Shadow Stack configuration")
+Backport: 4.14
+---
+ xen/arch/x86/acpi/wakeup_prot.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/acpi/wakeup_prot.S b/xen/arch/x86/acpi/wakeup_prot.S
+index a2c41c4f3f26..c6b3fcc93d93 100644
+--- a/xen/arch/x86/acpi/wakeup_prot.S
++++ b/xen/arch/x86/acpi/wakeup_prot.S
+@@ -69,7 +69,7 @@ ENTRY(s3_resume)
+          * so SETSSBSY will successfully load a value useful for us, then
+          * reset MSR_PL0_SSP to its usual value and pop the temporary token.
+          */
+-        mov     saved_rsp(%rip), %rdi
++        mov     saved_ssp(%rip), %rdi
+         cmpq    $1, %rdi
+         je      .L_shstk_done
+
+--
+2.25.4
+
diff --git a/patch-x86-S3-Restore-CR4-earlier-during-resume.patch b/patch-x86-S3-Restore-CR4-earlier-during-resume.patch
new file mode 100644
index 00000000..5619fb3a
--- /dev/null
+++ b/patch-x86-S3-Restore-CR4-earlier-during-resume.patch
@@ -0,0 +1,66 @@
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Fri, 2 Oct 2020 17:49:32 +0000 (+0100)
+Subject: x86/S3: Restore CR4 earlier during resume
+X-Git-Url: https://xenbits.xen.org/gitweb/?p=people%2Fandrewcoop%2Fxen.git;a=commitdiff_plain;h=eb2dd26a5060012377368cfe7edb38b76ba04058
+
+x86/S3: Restore CR4 earlier during resume
+
+c/s 4304ff420e5 "x86/S3: Drop {save,restore}_rest_processor_state()
+completely" moved CR4 restoration up into C, to account for the fact that MCE
+was explicitly handled later.
+
+However, time_resume() ends up making an EFI Runtime Service call, and EFI
+explodes without OSFXSR, presumably when trying to spill %xmm registers onto
+the stack.
+
+Given this codepath, and the potential for other issues of a similar kind (TLB
+flushing vs INVPCID, HVM logic vs VMXE, etc), restore CR4 in asm before
+entering C.
+
+Ignore the previous MCE special case, because its not actually necessary. The
+handler is already suitably configured from before suspend.
+
+Fixes: 4304ff420e5 ("x86/S3: Drop {save,restore}_rest_processor_state() completely")
+Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+
+diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c
+index 4fb1e7a148..7f162a4df9 100644
+--- a/xen/arch/x86/acpi/power.c
++++ b/xen/arch/x86/acpi/power.c
+@@ -276,9 +276,6 @@ static int enter_state(u32 state)
+
+     mcheck_init(&boot_cpu_data, false);
+
+-    /* Restore CR4 from cached value, now MCE is set up. */
+-    write_cr4(read_cr4());
+-
+     printk(XENLOG_INFO "Finishing wakeup from ACPI S%d state.\n", state);
+
+     if ( (state == ACPI_STATE_S3) && error )
+diff --git a/xen/arch/x86/acpi/wakeup_prot.S b/xen/arch/x86/acpi/wakeup_prot.S
+index c6b3fcc93d..15052c300f 100644
+--- a/xen/arch/x86/acpi/wakeup_prot.S
++++ b/xen/arch/x86/acpi/wakeup_prot.S
+@@ -1,3 +1,4 @@
++#include
+ #include
+ #include
+ #include
+@@ -110,6 +111,11 @@ ENTRY(s3_resume)
+
+         call    load_system_tables
+
++        /* Restore CR4 from the cpuinfo block. */
++        GET_STACK_END(bx)
++        mov     STACK_CPUINFO_FIELD(cr4)(%rbx), %rax
++        mov     %rax, %cr4
++
+ .Lsuspend_err:
+         pop     %r15
+         pop     %r14
+--
+2.11.0
+
diff --git a/patch-x86-smpboot-Unconditionally-call-memguard_unguard_stack.patch b/patch-x86-smpboot-Unconditionally-call-memguard_unguard_stack.patch
new file mode 100644
index 00000000..5d225e78
--- /dev/null
+++ b/patch-x86-smpboot-Unconditionally-call-memguard_unguard_stack.patch
@@ -0,0 +1,57 @@
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: [PATCH] x86/smpboot: Unconditionally call memguard_unguard_stack() in cpu_smpboot_free()
+Date: Mon, 5 Oct 2020 13:23:25 +0100
+
+For simplicity between various configuration, Xen always uses shadow stack
+mappings (Read-only + Dirty) for the guard page, irrespective of whether
+CET-SS is enabled.
+
+memguard_guard_stack() writes shadow stack tokens with plain writes. This is
+necessary to configure the BSP shadow stack correctly, and cannot be
+implemented with WRSS.
+
+Therefore, unconditionally call memguard_unguard_stack() to return the
+mappings to fully writeable, so a subsequent call to memguard_guard_stack()
+will succeed.
+
+Fixes: 91d26ed304f ("x86/shstk: Create shadow stacks")
+Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+---
+CC: Jan Beulich <jbeulich@suse.com>
+CC: Roger Pau Monné <roger.pau@citrix.com>
+CC: Wei Liu <wl@xen.org>
+
+This can more easily be demonstrated with CPU hotplug than S3, and the absence
+of bug reports goes to show how rarely hotplug is used.
+---
+ xen/arch/x86/smpboot.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
+index 5708573c41..c193cc0fb8 100644
+--- a/xen/arch/x86/smpboot.c
++++ b/xen/arch/x86/smpboot.c
+@@ -971,16 +971,16 @@ static void cpu_smpboot_free(unsigned int cpu, bool remove)
+     if ( IS_ENABLED(CONFIG_PV32) )
+         FREE_XENHEAP_PAGE(per_cpu(compat_gdt, cpu));
+
++    if ( stack_base[cpu] )
++        memguard_unguard_stack(stack_base[cpu]);
++
+     if ( remove )
+     {
+         FREE_XENHEAP_PAGE(per_cpu(gdt, cpu));
+         FREE_XENHEAP_PAGE(idt_tables[cpu]);
+
+         if ( stack_base[cpu] )
+-        {
+-            memguard_unguard_stack(stack_base[cpu]);
+             FREE_XENHEAP_PAGES(stack_base[cpu], STACK_ORDER);
+-        }
+     }
+ }
+
+--
+2.11.0
+
diff --git a/xen.spec.in b/xen.spec.in
index 210ca0f3..a8dab5b8 100644
--- a/xen.spec.in
+++ b/xen.spec.in
@@ -126,6 +126,10 @@ Patch622: patch-xen-disable-efi-gettime.patch
 Patch627: patch-libxl-automatically-enable-gfx_passthru-if-IGD-is-as.patch
 Patch628: patch-libxl-workaround-gcc-10.2-maybe-uninitialized-warnin.patch
 Patch629: patch-libxl-fix-Werror-stringop-truncation-in-libxl__prepa.patch
+Patch630: patch-x86-S3-Fix-Shadow-Stack-resume-path.patch
+Patch631: patch-Revert-xen-credit2-limit-the-max-number-of-CPUs-in-a.patch
+Patch632: patch-x86-S3-Restore-CR4-earlier-during-resume.patch
+Patch633: patch-x86-smpboot-Unconditionally-call-memguard_unguard_stack.patch
 # GCC8 fixes
 Patch714: patch-tools-kdd-mute-spurious-gcc-warning.patch